refactor: handle url-without-path & safelinks without "while read"

"while read" can be inefficient
MDLeom 2025-03-16 12:37:56 +00:00
parent 6e359f9a79
commit ab5dca49b4
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
2 changed files with 38 additions and 52 deletions
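Roughly, the gain: the old build script ran cut, sed, grep and a node call for every URL inside a "while read" loop; the refactor pipes the whole safelink list through one node process. A simplified sketch of the two invocation patterns (file names are illustrative; each form assumes the matching version of safelinks.js: argv-based before, stdin-based after):

# before: one node process per URL, driven by the shell loop
while read -r URL; do
  node "src/safelinks.js" "$URL" >> "decoded.txt"
done < "safelinks.txt"
# after: a single node process reads every URL from stdin
node "src/safelinks.js" < "safelinks.txt" > "decoded.txt"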


@@ -1,5 +1,9 @@
// Decode O365 Safelinks
// https://support.microsoft.com/en-us/office/advanced-outlook-com-security-for-microsoft-365-subscribers-882d2243-eab9-4545-a58a-b36fee4a46e2
const inputUrl = new URL(`http://${process.argv[2]}`)
const outputUrl = new URL(inputUrl.searchParams.get('url'))
console.log(`${outputUrl.host}${outputUrl.pathname}${outputUrl.search}`)
import { createInterface } from "node:readline"
for await (const line of createInterface({ input: process.stdin })) {
  const inputUrl = new URL(`http://${line}`)
  const outputUrl = new URL(inputUrl.searchParams.get('url'))
  console.log(`${outputUrl.host}${outputUrl.pathname}${outputUrl.search}`)
}
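A usage sketch of the rewritten decoder, piping it one scheme-less Safe Links entry the way the build script does (the hostname and target URL are made up; the path src/safelinks.js is assumed from the build script's "../src/safelinks.js" reference):

printf '%s\n' 'eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fexample.com%2F' | \
node "src/safelinks.js" | \
sed -r "s/(^[^\/]*)\/+$/\1/g"
# safelinks.js prints "example.com/"; the sed from the build script then strips
# the lone trailing slash of path-less entries, leaving "example.com"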


@@ -182,7 +182,23 @@ sort -u > "ipthreat.txt"
## Combine all sources
cat "openphish.txt" "ipthreat.txt" "phishtank.txt" | \
sort -u > "phishing.txt"
sort -u > "phishing-temp.txt"
## Parse O365 safelink
safelinks="$(cat 'phishing-temp.txt' | grep -F 'safelinks.protection.outlook.com' || [ $? = 1 ])"
if [ -n "$safelinks" ]; then
echo "$safelinks" > "safelinks.txt"
cat "phishing-temp.txt" | \
grep -Fx -vf "safelinks.txt" > "phishing.txt"
cat "safelinks.txt" | \
node "../src/safelinks.js" | \
sed -r "s/(^[^\/]*)\/+$/\1/g" | \
sort -u >> "phishing.txt"
else
cp "phishing-temp.txt" "phishing.txt"
fi
## Parse domain and IP address only
cat "phishing.txt" | \
@@ -245,58 +261,20 @@ grep -Fx -f "top-1m-well-known.txt" > "phishing-top-domains.txt"
## Exclude popular domains
cat "phishing-domains.txt" | \
grep -F -vf "phishing-top-domains.txt" > "phishing-notop-domains-temp.txt"
grep -F -vf "phishing-top-domains.txt" > "phishing-notop-domains.txt"
cat "phishing.txt" | \
grep -F -f "phishing-top-domains.txt" > "phishing-url-top-domains-temp.txt"
grep -F -f "phishing-top-domains.txt" | \
# exclude URL of top domains without path #43
grep -Fx -vf "phishing-top-domains.txt" > "phishing-url-top-domains-temp.txt"
rm "phishing-url-top-domains.txt" "phishing-url-top-domains-raw.txt"
cat "phishing-url-top-domains-temp.txt" | \
# url with path
grep -F "/" > "phishing-url-top-domains-raw.txt"
## Temporarily disable command print
set +x
while read URL; do
DOMAIN=$(echo "$URL" | cut -d"/" -f1)
PATHNAME=$(echo "$URL" | sed "s/^$DOMAIN//")
# Separate domain-only/no-path URL (e.g. "example.com/")
if [ -z "$PATHNAME" ] || [ "$PATHNAME" = "/" ]; then
echo "$DOMAIN" | \
# Remove port
cut -f 1 -d ":" >> "phishing-subdomains.txt"
# "phishing-subdomains.txt" may be empty if the data source is clean
# Parse hostname from O365 safelink
elif test "${URL#*safelinks.protection.outlook.com}" != "$URL"; then
SAFELINK=$(node "../src/safelinks.js" "$URL")
if grep -Fq "$SAFELINK" "top-1m-well-known.txt"; then
echo "$SAFELINK" >> "phishing-url-top-domains-temp.txt"
else
echo "$SAFELINK" | \
cut -d"/" -f1 >> "phishing-notop-domains-temp.txt"
fi
# Parse phishing URLs from popular domains
else
echo "$URL" | \
sed -e "s/^/||/g" -e "s/$/\$all/g" >> "phishing-url-top-domains.txt"
echo "$URL" >> "phishing-url-top-domains-raw.txt"
fi
done < "phishing-url-top-domains-temp.txt"
## Re-enable command print
set -x
## "phishing-subdomains.txt" is derived from URLs of top domains that does not have a path
# exclude from top (sub)domains
if [ -s "phishing-subdomains.txt" ]; then
excluded_subdomains=$(cat "phishing-subdomains.txt" | grep -Fx -vf "phishing-top-domains.txt" || [ $? = 1 ])
if [ "$excluded_subdomains" != "" ] && [ -n "$excluded_subdomains" ]; then
echo "$excluded_subdomains" >> "phishing-notop-domains-temp.txt"
fi
fi
## "phishing-subdomains.txt" & "phishing-url-top-domains-temp.txt" may add duplicate entries
sort -u "phishing-notop-domains-temp.txt" > "phishing-notop-domains.txt"
cat "phishing-url-top-domains-temp.txt" | \
# url without path
grep -F -v "/" >> "phishing-notop-domains.txt"
## Merge malware domains and URLs
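A worked sketch of how this replacement hunk routes entries that contain a popular domain, using stand-in data ("github.com" is only a placeholder for whatever ends up in phishing-top-domains.txt):

printf '%s\n' 'github.com' 'github.com/fake-login' 'cdn.github.com' > "urls-demo.txt"
printf '%s\n' 'github.com' > "top-demo.txt"
# keep lines containing a top domain, then drop exact matches (bare top domain, no path)
grep -F -f "top-demo.txt" "urls-demo.txt" | \
grep -Fx -vf "top-demo.txt" > "temp-demo.txt"
grep -F "/" "temp-demo.txt"     # -> github.com/fake-login (becomes a "||...$all" URL rule)
grep -F -v "/" "temp-demo.txt"  # -> cdn.github.com (appended to the plain-domain list)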
@@ -311,6 +289,10 @@ COMMENT_UBO="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n
mkdir -p "../public/"
cat "phishing-url-top-domains-raw.txt" | \
sed "s/^/||/g" | \
sed "s/$/\$all/g" > "phishing-url-top-domains.txt"
cat "phishing-notop-domains.txt" "phishing-url-top-domains.txt" | \
sort | \
sed "1i $COMMENT_UBO" > "../public/phishing-filter.txt"