feat: more robust url parsing

better handling of edge cases
add IPv6 support
increase nodejs requirement to v20 due to URL.canParse()
  https://developer.mozilla.org/en-US/docs/Web/API/URL/canParse_static#browser_compatibility
This commit is contained in:
MDLeom 2025-03-18 10:09:47 +00:00
parent 9b2fe384fd
commit 58a15ee1df
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
4 changed files with 69 additions and 54 deletions

View File

@ -8,7 +8,7 @@
"unzipper": "^0.12.3"
},
"engines": {
"node": ">= 18.12.0"
"node": ">= 20.9.0"
},
"type": "module"
}

53
src/clean_url.js Normal file
View File

@ -0,0 +1,53 @@
import { createInterface } from "node:readline"
// Matches the slash(es) that directly follow the host when nothing else
// follows, e.g. "example.com/" but not "example.com/path/" (#43)
const TRAILING_SLASH = /(^[^\/]*)\/+$/

// Host suffix shared by O365 Safelinks wrappers (e.g. "nam01.safelinks...")
const SAFELINKS_DOMAIN = 'safelinks.protection.outlook.com'

/**
 * Extract the hostname from a scheme-less URL such as "user@host:8080/path".
 *
 * The line is first parsed with the URL API (so IPv6 literals keep their
 * brackets); when that fails it falls back to plain string trimming.
 *
 * @param {string} line - URL without protocol
 * @returns {string} hostname without path, query, credential or port
 */
function parseHostname(line) {
  const candidate = `http://${line}`
  if (URL.canParse(candidate)) {
    return new URL(candidate).hostname
  }

  return line
    // host
    .split('/')[0]
    // #2 - drop the query first, otherwise a port followed by "?..." is
    // not at the end of the string and survives the port removal below
    .split('?')[0]
    // exclude credential
    .replace(/.*@(.+)/, '$1')
    // exclude port
    .replace(/:\d+$/, '')
}

/**
 * Normalise a full URL into "host/path?query": the protocol and a leading
 * "www." are dropped, O365 Safelinks are unwrapped and a trailing slash
 * right after the host is removed.
 *
 * @param {string} line - URL including protocol
 * @returns {string} cleaned URL without protocol
 */
function cleanUrl(line) {
  if (!URL.canParse(line)) {
    return line
      // remove protocol
      .split('/').slice(2).join('/')
      // remove www
      .replace(/^www\./, '')
      // url encode every space, not only the first one #11
      .replaceAll(' ', '%20')
      .replace(TRAILING_SLASH, '$1')
  }

  let url = new URL(line)
  // Decode O365 Safelinks
  // https://support.microsoft.com/en-us/office/advanced-outlook-com-security-for-microsoft-365-subscribers-882d2243-eab9-4545-a58a-b36fee4a46e2
  const isSafelink = url.hostname === SAFELINKS_DOMAIN ||
    url.hostname.endsWith(`.${SAFELINKS_DOMAIN}`)
  if (isSafelink) {
    const target = url.searchParams.get('url')
    // A safelink without a usable "url" parameter is kept as-is instead of
    // throwing and aborting the whole run
    if (target !== null && URL.canParse(target)) {
      url = new URL(target)
    }
  }

  return `${url.host.replace(/^www\./, '')}${url.pathname}${url.search}`
    // remove trailing slash from domain except path #43
    .replace(TRAILING_SLASH, '$1')
}

// Read URLs from stdin, one per line. With the "hostname" argument only the
// hostname of each (scheme-less) line is printed, otherwise the cleaned URL.
const hostnameOnly = process.argv[2] === 'hostname'
for await (const line of createInterface({ input: process.stdin })) {
  console.log(hostnameOnly ? parseHostname(line) : cleanUrl(line))
}

View File

@ -1,9 +0,0 @@
// Decode O365 Safelinks
// https://support.microsoft.com/en-us/office/advanced-outlook-com-security-for-microsoft-365-subscribers-882d2243-eab9-4545-a58a-b36fee4a46e2
import { createInterface } from "node:readline"
/**
 * Unwrap a scheme-less O365 Safelinks URL into the URL it points to.
 *
 * @param {string} line - Safelinks URL without protocol
 * @returns {string|null} "host/path?query" of the wrapped URL, or null when
 *   the line or its "url" parameter cannot be parsed as a URL
 */
function decodeSafelink(line) {
  try {
    const inputUrl = new URL(`http://${line}`)
    const target = inputUrl.searchParams.get('url')
    // "url" parameter is absent: nothing to decode
    if (target === null) return null
    const outputUrl = new URL(target)
    return `${outputUrl.host}${outputUrl.pathname}${outputUrl.search}`
  } catch (err) {
    // new URL() throws TypeError on unparsable input; skip such a line
    // rather than aborting the whole run
    if (err instanceof TypeError) return null
    throw err
  }
}

for await (const line of createInterface({ input: process.stdin })) {
  const decoded = decodeSafelink(line)
  if (decoded !== null) console.log(decoded)
}
}

View File

@ -143,14 +143,7 @@ if [ -n "$(file 'phishtank.bz2' | grep 'bzip2 compressed data')" ]; then
cut -f 2 -d "," | \
"./$CSVQUOTE" -u | \
sed 's/"//g' | \
cut -f 3- -d "/" | \
# Domain must have at least a 'dot'
grep -F "." | \
sed "s/^www\.//g" | \
# url encode space #11
sed "s/ /%20/g" | \
# remove trailing slash from domain except path #43
sed -r "s/(^[^\/]*)\/+$/\1/g" | \
node "../src/clean_url.js" | \
sort -u > "phishtank.txt"
else
# cloudflare may impose captcha
@ -161,11 +154,7 @@ fi
cat "openphish-raw.txt" | \
dos2unix | \
tr "[:upper:]" "[:lower:]" | \
cut -f 3- -d "/" | \
grep -F "." | \
sed "s/^www\.//g" | \
sed "s/ /%20/g" | \
sed -r "s/(^[^\/]*)\/+$/\1/g" | \
node "../src/clean_url.js" | \
sort -u > "openphish.txt"
gzip -dc "ipthreat.gz" | \
@ -173,41 +162,19 @@ gzip -dc "ipthreat.gz" | \
sed "/^#/d" | \
sed "s/ # .*//g" | \
tr "[:upper:]" "[:lower:]" | \
cut -f 3- -d "/" | \
grep -F "." | \
sed "s/^www\.//g" | \
sed "s/ /%20/g" | \
sed -r "s/(^[^\/]*)\/+$/\1/g" | \
node "../src/clean_url.js" | \
sort -u > "ipthreat.txt"
## Combine all sources
cat "openphish.txt" "ipthreat.txt" "phishtank.txt" | \
sort -u > "phishing-temp.txt"
# remove blank lines
sed "/^$/d" | \
sort -u > "phishing.txt"
## Parse O365 safelink
safelinks="$(cat 'phishing-temp.txt' | grep -P '^(?:[a-z]{3}\d{2}\.)?safelinks\.protection\.outlook\.com' || [ $? = 1 ])"
if [ -n "$safelinks" ]; then
echo "$safelinks" > "safelinks.txt"
cat "phishing-temp.txt" | \
grep -Fx -vf "safelinks.txt" > "phishing.txt"
cat "safelinks.txt" | \
node "../src/safelinks.js" | \
sed -r "s/(^[^\/]*)\/+$/\1/g" | \
sort -u >> "phishing.txt"
else
cp "phishing-temp.txt" "phishing.txt"
fi
## Parse domain and IP address only
cat "phishing.txt" | \
cut -f 1 -d "/" | \
cut -f 1 -d ":" | \
# #2
cut -f 1 -d "?" | \
# #91
sed -r "s/.*@(.+)/\1/g" | \
node "../src/clean_url.js" hostname | \
sort -u > "phishing-domains.txt"
@ -341,7 +308,10 @@ sort | \
sed "1i $COMMENT" > "../public/phishing-filter-domains.txt"
cat "phishing-notop-domains.txt" | \
grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" > "phishing-notop-hosts.txt"
# exclude IPv4
grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" | \
# exclude IPv6
grep -vE "^\[" > "phishing-notop-hosts.txt"
## Hosts file blocklist
cat "phishing-notop-hosts.txt" | \
@ -393,11 +363,12 @@ cat "phishing-notop-hosts.txt" | \
sed "1i $COMMENT" | \
sed "1s/Domains/Names/" > "../public/phishing-filter-dnscrypt-blocked-names.txt"
# IPv4-based
if grep -Eq "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" "phishing-notop-domains.txt"; then
# IPv4/6
if grep -Eq "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" "phishing-notop-domains.txt"; then
cat "phishing-notop-domains.txt" | \
sort | \
grep -E "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" | \
grep -E "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" | \
sed -r "s/\[|\]//g" | \
sed "1i $COMMENT" | \
sed "1s/Domains/IPs/" > "../public/phishing-filter-dnscrypt-blocked-ips.txt"
else