fix: match top domains to input hostname

Match against the hostname instead of anywhere in the URL,
to minimise false entries such as "bad.com/interactivelogin?continue=https://accounts.google.com".
However, subdomains of top domains will no longer match.
This commit is contained in:
MDLeom 2025-03-18 10:31:00 +00:00
parent 58a15ee1df
commit 9d4668bcbd
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
1 changed files with 6 additions and 1 deletions

View File

@ -230,8 +230,13 @@ grep -Fx -f "top-1m-well-known.txt" > "phishing-top-domains.txt"
# Keep only the phishing domains that do not contain any entry of
# "phishing-top-domains.txt" (fixed-string match, not anchored to the line).
grep -F -v -f "phishing-top-domains.txt" \
  < "phishing-domains.txt" > "phishing-notop-domains.txt"
# Turn every top domain into an anchored regex so that a URL is selected only
# when its hostname IS the top domain, not when the domain merely appears
# later in the URL (e.g. "bad.com/login?continue=https://example.com").
cat "phishing-top-domains.txt" | \
# "example.com" -> "^example\.com[/:?#]"
# The trailing bracket requires the hostname to end right after the domain, so
# "example.com.evil.net/x" and "example.community/x" are not matched.
# A bare "example.com" line (no path) does not match either; such lines are
# discarded by the last stage of the next pipeline anyway.
# NOTE(review): if anything else reads "phishing-top-domains-grep.txt", confirm
# it does not rely on bare-hostname lines matching.
sed -e 's/\./\\./g' -e 's/^/^/' -e 's,$,[/:?#],' > "phishing-top-domains-grep.txt"
cat "phishing.txt" | \
# fixed-string prefilter; every line kept by the regex stage below also passes
# this one, so it does not change the result
grep -F -f "phishing-top-domains.txt" | \
# exact match hostname
grep -f "phishing-top-domains-grep.txt" | \
# exclude URL of top domains without path #43
grep -Fx -vf "phishing-top-domains.txt" > "phishing-url-top-domains-temp.txt"