feat: url exclusion list

fixes #110
This commit is contained in:
MDLeom 2025-06-10 10:03:51 +00:00
parent be8abfc2f7
commit 355338e407
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
3 changed files with 24 additions and 4 deletions

5
src/exclude-url.txt Normal file
View File

@ -0,0 +1,5 @@
# URL exclusion list
# Domains/URLs listed here will be excluded from domain-based and URL-based filters
# Entries containing a slash (/) only affect URL-based filters, not domain-based filters
# List only top-1m (sub)domains that do not host user content
github.githubassets.com

View File

@ -1,5 +1,5 @@
# Exclusion list
# malicious links are still included in "urlhaus-filter.txt" & "urlhaus-filter-online.txt"
# Domain exclusion list
# Domains listed here will be excluded from domain-based filters only, not URL-based filters
void.cat
pdesaa.cimaa.pt
users.telenet.be

View File

@ -102,7 +102,8 @@ if [ -n "$CF_API" ]; then
sort -u > "top-1m-radar.txt"
fi
# Copy the exclusion lists ("exclude.txt" and "exclude-url.txt") from ../src
# into the current directory so later steps can read them by bare file name.
# NOTE(review): the first two commands copy the same file; they differ only
# in the -f flag — confirm whether both are intended.
cp -f "../src/exclude.txt" "."
cp "../src/exclude.txt" "."
cp "../src/exclude-url.txt" "."
## Prepare URLhaus.csv
unzip "urlhaus.zip" | \
@ -173,6 +174,13 @@ if [ -n "$CF_API" ] && [ -f "top-1m-radar.txt" ]; then
sort "top-1m-well-known.txt" -u -o "top-1m-well-known.txt"
fi
## Build anchored grep (BRE) patterns from the URL exclusion list
# "example.com/path" -> "^example\.com/path"
# - comment lines (starting with "#") are dropped
# - blank lines are dropped: otherwise they would become the bare pattern "^",
#   which matches every line and would make "grep -vf" discard everything
# - BRE metacharacters ( [ \ . * ^ $ ) are escaped so entries match literally;
#   slash doesn't need to be escaped
# - escaping happens before the leading "^" anchor is added, so the anchor
#   itself is left unescaped
sed \
  -e '/^#/d' \
  -e '/^[[:space:]]*$/d' \
  -e 's/[[\.*^$]/\\&/g' \
  -e 's/^/^/' \
  "exclude-url.txt" > "exclude-url-grep.txt"
## Parse popular domains from URLhaus
cat "urlhaus-domains.txt" | \
# grep match whole line
@ -182,26 +190,33 @@ grep -Fx -f "top-1m-well-known.txt" > "urlhaus-top-domains.txt"
## Build the malware domain list from URLhaus, leaving out popular domains
# 1. drop lines containing any entry of "urlhaus-top-domains.txt" (fixed-string match)
# 2. drop lines matching the URL exclusion patterns (domain-based filters, #110)
# 3. strip empty lines
grep -F -vf "urlhaus-top-domains.txt" < "urlhaus-domains.txt" | \
grep -vf "exclude-url-grep.txt" | \
sed '/^$/d' > "malware-domains.txt"
# From "urlhaus-domains-online.txt": drop popular domains, drop lines matching
# the URL exclusion patterns, then strip empty lines
grep -F -vf "urlhaus-top-domains.txt" < "urlhaus-domains-online.txt" | \
grep -vf "exclude-url-grep.txt" | \
sed '/^$/d' > "malware-domains-online.txt"
## Turn malware URLs hosted on popular domains into filter rules
# keep URLs containing an entry of "urlhaus-top-domains.txt",
# drop domains/URLs matching the exclusion patterns (URL-based filters, #110),
# then wrap every remaining line as "||<url>^$all"
grep -F -f "urlhaus-top-domains.txt" < "urlhaus.txt" | \
grep -vf "exclude-url-grep.txt" | \
sed -e 's/^/||/' -e 's/$/^$all/' > "malware-url-top-domains.txt"
# From "urlhaus-online.txt": keep URLs on popular domains, drop excluded
# domains/URLs, then wrap every remaining line as "||<url>^$all"
grep -F -f "urlhaus-top-domains.txt" < "urlhaus-online.txt" | \
grep -vf "exclude-url-grep.txt" | \
sed -e 's/^/||/' -e 's/$/^$all/' > "malware-url-top-domains-online.txt"
# Raw (unwrapped) URLs from "urlhaus-online.txt" that sit on popular domains,
# with excluded domains/URLs removed (#110).
# This is a single pipeline on purpose: the exclusion filter must read the
# output of the popular-domain match, not run as a separate command that
# reads stdin and overwrites the result file.
grep -F -f "urlhaus-top-domains.txt" < "urlhaus-online.txt" | \
grep -vf "exclude-url-grep.txt" > "malware-url-top-domains-raw-online.txt"
## Merge malware domains and URLs