From 9d4668bcbdfab4b5834cbb5a460dae594b6d49e1 Mon Sep 17 00:00:00 2001
From: MDLeom <2809763-curben@users.noreply.gitlab.com>
Date: Tue, 18 Mar 2025 10:31:00 +0000
Subject: [PATCH] fix: match top domains to input hostname instead of url.

to minimise entries such as
"bad.com/interactivelogin?continue=https://accounts.google.com"
however, subdomains of top domains will no longer match

each pattern also ends at a hostname boundary ("/", ":", "?" or "#"),
so lookalike hosts such as "google.com.bad.com/login" or
"google.community/login" are not treated as top domains
---
 src/script.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/script.sh b/src/script.sh
index a346c85c..ffd66a36 100644
--- a/src/script.sh
+++ b/src/script.sh
@@ -230,8 +230,13 @@ grep -Fx -f "top-1m-well-known.txt" > "phishing-top-domains.txt"
 cat "phishing-domains.txt" | \
 grep -F -vf "phishing-top-domains.txt" > "phishing-notop-domains.txt"
 
+cat "phishing-top-domains.txt" | \
+# "example.com" -> "^example\.com[/:?#]": escape dots, anchor the start, require a hostname terminator
+sed -e 's/\./\\./g' -e 's/^/^/' -e 's,$,[/:?#],' > "phishing-top-domains-grep.txt"
+
 cat "phishing.txt" | \
-grep -F -f "phishing-top-domains.txt" | \
+# exact match hostname: keep only entries whose host is a top domain followed by a port, path, query or fragment
+grep -f "phishing-top-domains-grep.txt" | \
 # exclude URL of top domains without path #43
 grep -Fx -vf "phishing-top-domains.txt" > "phishing-url-top-domains-temp.txt"
 