From 23bcdbb2c63ad767d11c12bd8452bf1b2d5688b3 Mon Sep 17 00:00:00 2001
From: MDLeom <2809763-curben@users.noreply.gitlab.com>
Date: Sat, 22 Mar 2025 04:19:43 +0000
Subject: [PATCH] fix: dedup domains and url

---
 src/script.sh | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/script.sh b/src/script.sh
index fd056d1c..d720fbbe 100644
--- a/src/script.sh
+++ b/src/script.sh
@@ -226,7 +226,7 @@ grep -Fx -f "top-1m-well-known.txt" > "phishing-top-domains.txt"
 
 ## Exclude popular domains
 cat "phishing-domains.txt" | \
-grep -F -vf "phishing-top-domains.txt" > "phishing-notop-domains.txt"
+grep -F -vf "phishing-top-domains.txt" > "phishing-notop-domains-temp.txt"
 
 cat "phishing-top-domains.txt" | \
 # "example.com" -> "^example\.com"
@@ -240,11 +240,15 @@ grep -Fx -vf "phishing-top-domains.txt" > "phishing-url-top-domains-temp.txt"
 
 cat "phishing-url-top-domains-temp.txt" | \
 # url with path
-grep -F "/" > "phishing-url-top-domains-raw.txt"
+grep -F "/" | \
+sort -u > "phishing-url-top-domains-raw.txt"
 
 cat "phishing-url-top-domains-temp.txt" | \
 # url without path
-grep -F -v "/" >> "phishing-notop-domains.txt"
+grep -F -v "/" >> "phishing-notop-domains-temp.txt"
+
+cat "phishing-notop-domains-temp.txt" | \
+sort -u > "phishing-notop-domains.txt"
 
 
 ## Merge malware domains and URLs
@@ -369,7 +373,6 @@ sed "1s/Domains/Names/" > "../public/phishing-filter-dnscrypt-blocked-names.txt"
 # IPv4/6
 if grep -Eq "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" "phishing-notop-domains.txt"; then
   cat "phishing-notop-domains.txt" | \
-  sort | \
   grep -E "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" | \
   sed -r "s/\[|\]//g" | \
   sed "1i $COMMENT" | \