fix: "phishing-subdomains.txt" may be empty

This commit is contained in:
MDLeom 2024-03-08 07:54:33 +00:00
parent 352aa34c32
commit 1b2312f492
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
1 changed file with 8 additions and 5 deletions

View File

@@ -181,13 +181,14 @@ while read URL; do
DOMAIN=$(echo "$URL" | cut -d"/" -f1)
PATHNAME=$(echo "$URL" | sed "s/^$DOMAIN//")
# Separate domain-only/no-path URL (e.g. "example.com/")
if [ -z "$PATHNAME" ] || [ "$PATHNAME" = "/" ]; then
# Separate domain-only/no-path URL (e.g. "example.com/")
echo "$DOMAIN" | \
# Remove port
cut -f 1 -d ":" >> "phishing-subdomains.txt"
# "phishing-subdomains.txt" may be empty if the data source is clean
# Parse hostname from O365 safelink
elif test "${URL#*safelinks.protection.outlook.com}" != "$URL"; then
## Parse hostname from O365 safelink
SAFELINK=$(node "../src/safelinks.js" "$URL")
if grep -Fq "$SAFELINK" "top-1m-well-known.txt"; then
echo "$SAFELINK" >> "phishing-url-top-domains-temp.txt"
@@ -195,8 +196,8 @@ while read URL; do
echo "$SAFELINK" | \
cut -d"/" -f1 >> "phishing-notop-domains-temp.txt"
fi
# Parse phishing URLs from popular domains
else
## Parse phishing URLs from popular domains
echo "$URL" | \
sed -e "s/^/||/g" -e "s/$/\$all/g" >> "phishing-url-top-domains.txt"
echo "$URL" >> "phishing-url-top-domains-raw.txt"
@@ -208,8 +209,10 @@ set -x
## "phishing-subdomains.txt" is derived from URLs of top domains that do not have a path
# exclude from top (sub)domains
cat "phishing-subdomains.txt" | \
grep -Fx -vf "phishing-top-domains.txt" >> "phishing-notop-domains-temp.txt"
if [ -f "phishing-subdomains.txt" ]; then
cat "phishing-subdomains.txt" | \
grep -Fx -vf "phishing-top-domains.txt" >> "phishing-notop-domains-temp.txt"
fi
## "phishing-subdomains.txt" & "phishing-url-top-domains-temp.txt" may add duplicate entries
sort -u "phishing-notop-domains-temp.txt" > "phishing-notop-domains.txt"