feat: url exclusion list

https://gitlab.com/malware-filter/urlhaus-filter/-/issues/110

remove url-without-path workaround from script.sh
  workaround already included in clean_url.js
This commit is contained in:
MDLeom 2025-06-09 09:00:50 +00:00
parent 8f15bb89db
commit cb5c0f6d90
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
3 changed files with 21 additions and 15 deletions

5
src/exclude-url.txt Normal file
View File

@ -0,0 +1,5 @@
# URL exclusion list
# Domains/URLs listed here will be excluded from domain-based and URL-based filters
# Any entry containing a slash (/) will not be applied to domain-based filters
# Include only top 1m (sub)domains that do not host user content
github.githubassets.com

View File

@ -1,5 +1,5 @@
# Exclusion list
# phishing links are still included in "phishing-filter.txt"
# Domain exclusion list
# Domains listed here will be excluded from domain-based filters only, not URL-based filters
s3.amazonaws.com
s3.us-east-2.amazonaws.com
s3.us-east-1.amazonaws.com

View File

@ -178,6 +178,7 @@ sort -u > "phishing-domains.txt"
cp "../src/exclude.txt" "."
cp "../src/exclude-url.txt" "."
## Parse the Umbrella 1 Million
unzip "top-1m-umbrella.zip" | \
@ -219,6 +220,12 @@ if [ -n "$CF_API" ] && [ -f "top-1m-radar.txt" ]; then
fi
# Turn the URL exclusion list into anchored grep (BRE) patterns.
#   $1 - path to the exclusion list (one domain/URL per line, "#" comments)
# Writes one pattern per line to stdout:
#   "example.com/path" -> "^example\.com/path"
# Only dots are escaped; slash doesn't need to be escaped.
# Blank/whitespace-only lines are dropped on purpose: they would otherwise
# become a bare "^" pattern, which matches every line and would make the
# later "grep -vf exclude-url-grep.txt" discard the entire filter.
exclude_url_patterns() {
  sed \
    -e '/^#/d' \
    -e '/^[[:space:]]*$/d' \
    -e 's/^/^/' \
    -e 's/\./\\./g' \
    "$1"
}

exclude_url_patterns "exclude-url.txt" > "exclude-url-grep.txt"
## Parse popular domains
cat "phishing-domains.txt" | \
# grep match whole line
@ -227,7 +234,10 @@ grep -Fx -f "top-1m-well-known.txt" > "phishing-top-domains.txt"
## Exclude popular domains
cat "phishing-domains.txt" | \
grep -F -vf "phishing-top-domains.txt" > "phishing-notop-domains-temp.txt"
grep -F -vf "phishing-top-domains.txt" | \
# exclude domains from domains-based filters
grep -vf "exclude-url-grep.txt" | \
sort -u > "phishing-notop-domains.txt"
cat "phishing-top-domains.txt" | \
# "example.com" -> "^example\.com"
@ -237,20 +247,11 @@ cat "phishing.txt" | \
# exact match hostname
grep -f "phishing-top-domains-grep.txt" | \
# exclude URL of top domains without path #43
grep -Fx -vf "phishing-top-domains.txt" > "phishing-url-top-domains-temp.txt"
cat "phishing-url-top-domains-temp.txt" | \
# url with path
grep -F "/" | \
grep -Fx -vf "phishing-top-domains.txt" | \
# exclude domains/URLs from URL-based filters
grep -vf "exclude-url-grep.txt" | \
sort -u > "phishing-url-top-domains-raw.txt"
cat "phishing-url-top-domains-temp.txt" | \
# url without path
grep -F -v "/" >> "phishing-notop-domains-temp.txt"
cat "phishing-notop-domains-temp.txt" | \
sort -u > "phishing-notop-domains.txt"
## Merge malware domains and URLs
CURRENT_TIME=$(date -u +"%Y-%m-%dT%H:%M:%SZ")