fix: remove port number and deduplicate entries

- Fixes #8
This commit is contained in:
MDLeom 2021-06-20 07:38:55 +00:00
parent 5880487d27
commit 0e9845b69a
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
1 changed file with 6 additions and 2 deletions

View File

@@ -106,7 +106,7 @@ grep -Fx -f "top-1m-well-known.txt" > "phishing-top-domains.txt"
## Exclude popular domains
cat "phishing-domains.txt" | \
grep -F -vf "phishing-top-domains.txt" > "phishing-notop-domains.txt"
grep -F -vf "phishing-top-domains.txt" > "phishing-notop-domains-temp.txt"
cat "phishing.txt" | \
grep -F -f "phishing-top-domains.txt" > "phishing-url-top-domains-temp.txt"
@@ -122,7 +122,8 @@ while read URL; do
## Separate host-only URL
if [ -z "$URI" ] || [ "$URI" = "/" ]; then
echo "$HOST" >> "phishing-notop-domains.txt"
echo "$HOST" | \
cut -f 1 -d ":" >> "phishing-notop-domains-temp.txt"
else
## Parse phishing URLs from popular domains
echo "$URL" | \
@@ -134,6 +135,9 @@ done < "phishing-url-top-domains-temp.txt"
## Re-enable command print
set -x
## Host-only URLs read from "phishing-url-top-domains-temp.txt" may add duplicate entries, so deduplicate
sort -u "phishing-notop-domains-temp.txt" > "phishing-notop-domains.txt"
## Merge malware domains and URLs
CURRENT_TIME="$(date -R -u)"