feat: more robust url parsing

percent-encode, hostname/pathname/query-string parsing
- increase nodejs requirement to v20 due to URL.canParse()
- IPv6 support
- include query string in IDS blocklists
  * and escape ; on query string
- add address separator to adblock filters
  * f4377f1fe6
- remove unnecessary global flag in sed
- remove unnecessary sort
- clean tmp at start
This commit is contained in:
MDLeom 2025-03-23 04:17:27 +00:00
parent d2f18b753c
commit fd2659431e
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
3 changed files with 168 additions and 97 deletions

79
src/clean_url.js Normal file
View File

@ -0,0 +1,79 @@
'use strict'
import { createInterface } from 'node:readline'
/**
 * Tidy the leading part of a URL or bare host string.
 *
 * @param {string} hostname - URL or host, with or without a leading "http(s)://"
 * @returns {string} the input without a mangled duplicate scheme and without a leading "www."
 */
const cleanHost = (hostname) => {
  // Remove invalid protocol, see #32 (a valid scheme directly followed by a broken one)
  const singleScheme = hostname.replace(
    /^(https?:\/\/)(?:ttps:\/\/|https:\/|http\/)/,
    (_match, scheme) => scheme
  )
  // Drop a leading "www." but keep the scheme when there is one
  return singleScheme.replace(
    /^(https?:\/\/)?www\./,
    (_match, scheme = '') => scheme
  )
}
// Node.js does not percent-encode "^" in the path yet, so do it by hand.
// https://github.com/nodejs/node/issues/57313
// Only the part before the first "?" is touched; the query string is returned as-is.
/**
 * @param {string} pathname - a path, optionally followed by "?query"
 * @returns {string} the same string with every "^" in the path part replaced by "%5E"
 */
const caretPath = (pathname) => {
  const queryStart = pathname.indexOf('?')
  // No query string: the whole input is path
  if (queryStart === -1) return pathname.replaceAll('^', '%5E')
  const encodedPath = pathname.slice(0, queryStart).replaceAll('^', '%5E')
  // Everything from the first "?" onwards, untouched
  const query = pathname.slice(queryStart)
  return encodedPath + query
}
// Normalise URLs read line by line from stdin and print the result to stdout.
//
//   node clean_url.js hostname  each line is a scheme-less URL; print its hostname only
//   node clean_url.js           each line is a full URL; print "host/path?query" with the
//                               scheme, a leading "www." and the trailing slash of a bare
//                               domain removed
//
// Lines that cannot be processed are reported on stderr and skipped instead of
// aborting the whole run with an uncaught exception.
const hostnameOnly = process.argv[2] === 'hostname'

for await (const line of createInterface({ input: process.stdin, terminal: false })) {
  // Nothing to parse; also keeps empty entries out of the output
  if (line === '') continue

  // parse hostname from url
  if (hostnameOnly) {
    if (URL.canParse(`http://${line}`)) {
      console.log(new URL(`http://${line}`).hostname)
    } else {
      // Best-effort extraction for input the URL parser rejects
      const hostname = line
        // host
        .split('/')[0]
        // exclude credential
        .replace(/.*@(.+)/, '$1')
        // exclude port
        .replace(/:\d+$/, '')
        // #2
        .split('?')[0]
      console.log(hostname)
    }
    continue
  }

  // In "scheme://host/path" the third "/"-separated field is the host.
  // It is undefined when the line has fewer than two slashes.
  const authority = line.split('/')[2]
  if (authority === undefined) {
    console.error(`Invalid URL: ${line}`)
    continue
  }
  // Skip invalid domains, see #15
  if (authority.includes('??')) continue

  // Validate the same string that is handed to the URL constructor
  const cleaned = cleanHost(line)
  if (URL.canParse(cleaned)) {
    let url = new URL(cleaned)

    // Decode O365 Safelinks
    // https://support.microsoft.com/en-us/office/advanced-outlook-com-security-for-microsoft-365-subscribers-882d2243-eab9-4545-a58a-b36fee4a46e2
    if (url.hostname.endsWith('safelinks.protection.outlook.com')) {
      const target = url.searchParams.get('url')
      // searchParams.get() returns null when the parameter is missing
      if (target === null || !URL.canParse(target)) {
        console.error(`Invalid Safe Links URL: ${line}`)
        continue
      }
      url = new URL(target)
    }

    url.host = cleanHost(url.host)
    // nodejs does not percent-encode ^ yet
    // https://github.com/nodejs/node/issues/57313
    url.pathname = caretPath(url.pathname)

    const outUrl = `${url.host}${url.pathname}${url.search}`
      // remove trailing slash from domain except path
      .replace(/(^[^/]*)\/+$/, '$1')
    console.log(outUrl)
  } else {
    // Fallback for URLs the parser rejects: normalise by string manipulation
    const withoutProtocol = line
      // remove protocol
      .split('/').slice(2).join('/')
    const outUrl = caretPath(cleanHost(withoutProtocol))
      // url encode space
      .replaceAll(' ', '%20')
      .replace(/(^[^/]*)\/+$/, '$1')
    console.log(outUrl)
  }
}

View File

@ -33,14 +33,20 @@ for await (const domain of domains.readLines()) {
}
for await (const line of urls.readLines()) {
if (!URL.canParse(`http://${line}`)) {
console.error(`Invalid URL: ${line}`)
continue
}
const url = new URL(`http://${line}`)
const { hostname } = url
let pathname = url.pathname.replace(';', '\\;')
snort2.write(`alert tcp $HOME_NET any -> $EXTERNAL_NET [80,443] (msg:"urlhaus-filter malicious website detected"; flow:established,from_client; content:"GET"; http_method; content:"${pathname.substring(0, 2048)}"; http_uri; nocase; content:"${hostname}"; content:"Host"; http_header; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
snort3.write(`alert http $HOME_NET any -> $EXTERNAL_NET any (msg:"urlhaus-filter malicious website detected"; http_header:field host; content:"${hostname}",nocase; http_uri; content:"${pathname}",nocase; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
suricata.write(`alert http $HOME_NET any -> $EXTERNAL_NET any (msg:"urlhaus-filter malicious website detected"; flow:established,from_client; http.method; content:"GET"; http.uri; content:"${pathname}"; endswith; nocase; http.host; content:"${hostname}"; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
pathname = url.pathname
splunk.write(`"${hostname}","${pathname}","urlhaus-filter malicious website detected","${process.env.CURRENT_TIME}"\n`)
const { hostname, pathname, search } = url
const pathEscape = `${pathname}${search}`.replaceAll(';', '\\;')
const path = pathname + search
snort2.write(`alert tcp $HOME_NET any -> $EXTERNAL_NET [80,443] (msg:"urlhaus-filter malicious website detected"; flow:established,from_client; content:"GET"; http_method; content:"${pathEscape.substring(0, 2048)}"; http_uri; nocase; content:"${hostname}"; content:"Host"; http_header; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
snort3.write(`alert http $HOME_NET any -> $EXTERNAL_NET any (msg:"urlhaus-filter malicious website detected"; http_header:field host; content:"${hostname}",nocase; http_uri; content:"${pathEscape}",nocase; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
suricata.write(`alert http $HOME_NET any -> $EXTERNAL_NET any (msg:"urlhaus-filter malicious website detected"; flow:established,from_client; http.method; content:"GET"; http.uri; content:"${pathEscape}"; endswith; nocase; http.host; content:"${hostname}"; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
splunk.write(`"${hostname}","${path}","urlhaus-filter malicious website detected","${process.env.CURRENT_TIME}"\n`)
sid++
}

View File

@ -65,6 +65,7 @@ else
fi
## Create a temporary working folder
# Start from an empty folder. -r is required to remove a directory;
# -f keeps a run without an existing tmp/ from reporting an error.
rm -rf "tmp/"
mkdir -p "tmp/"
cd "tmp/"
@ -97,7 +98,7 @@ if [ -n "$CF_API" ]; then
dos2unix | \
tr "[:upper:]" "[:lower:]" | \
grep -F "." | \
sed "s/^www\.//g" | \
sed "s/^www\.//" | \
sort -u > "top-1m-radar.txt"
fi
@ -114,38 +115,23 @@ sed "/^#/d" > "URLhaus.csv"
## Parse URLs
cat "URLhaus.csv" | \
cut -f 6 -d '"' | \
cut -f 3- -d "/" | \
# Domain must have at least a 'dot'
grep -F "." | \
# Remove invalid protocol, see #32
sed -E "s/^(ttps:\/\/|https:\/|http\/)//g" | \
# Remove www.
sed "s/^www\.//g" | \
node "../src/clean_url.js" | \
sort -u > "urlhaus.txt"
## Parse domain and IP address only
cat "urlhaus.txt" | \
cut -f 1 -d "/" | \
# Remove port
cut -f 1 -d ":" | \
# Remove invalid domains, see #15
grep -vF "??" | \
cut -f 1 -d "?" | \
node "../src/clean_url.js" hostname | \
sort -u > "urlhaus-domains.txt"
## Parse online URLs only
cat "URLhaus.csv" | \
grep -F '"online"' | \
cut -f 6 -d '"' | \
cut -f 3- -d "/" | \
sed "s/^www\.//g" | \
node "../src/clean_url.js" | \
sort -u > "urlhaus-online.txt"
cat "urlhaus-online.txt" | \
cut -f 1 -d "/" | \
cut -f 1 -d ":" | \
grep -vF "??" | \
cut -f 1 -d "?" | \
node "../src/clean_url.js" hostname | \
sort -u > "urlhaus-domains-online.txt"
@ -157,7 +143,7 @@ tr "[:upper:]" "[:lower:]" | \
cut -f 2 -d "," | \
grep -F "." | \
# Remove www.
sed "s/^www\.//g" | \
sed "s/^www\.//" | \
sort -u > "top-1m-umbrella.txt"
## Parse the Tranco 1 Million
@ -169,7 +155,7 @@ if [ -n "$(file 'top-1m-tranco.zip' | grep 'Zip archive data')" ]; then
cut -f 2 -d "," | \
grep -F "." | \
# Remove www.
sed "s/^www\.//g" | \
sed "s/^www\.//" | \
sort -u > "top-1m-tranco.txt"
else
# tranco has unreliable download
@ -206,13 +192,13 @@ sed "/^$/d" > "malware-domains-online.txt"
## Parse malware URLs from popular domains
cat "urlhaus.txt" | \
grep -F -f "urlhaus-top-domains.txt" | \
sed "s/^/||/g" | \
sed "s/$/\$all/g" > "malware-url-top-domains.txt"
sed "s/^/||/" | \
sed 's/$/^$all/' > "malware-url-top-domains.txt"
cat "urlhaus-online.txt" | \
grep -F -f "urlhaus-top-domains.txt" | \
sed "s/^/||/g" | \
sed "s/$/\$all/g" > "malware-url-top-domains-online.txt"
sed "s/^/||/" | \
sed 's/$/^$all/' > "malware-url-top-domains-online.txt"
cat "urlhaus-online.txt" | \
grep -F -f "urlhaus-top-domains.txt" > "malware-url-top-domains-raw-online.txt"
@ -231,31 +217,28 @@ COMMENT_ABP="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n
mkdir -p "../public/"
cat "malware-domains.txt" "malware-url-top-domains.txt" | \
sort | \
sed "1i $COMMENT_ABP" > "../public/urlhaus-filter.txt"
cat "malware-domains-online.txt" "malware-url-top-domains-online.txt" | \
sort | \
sed "1i $COMMENT_ABP" | \
sed "1s/Malicious/Online Malicious/" > "../public/urlhaus-filter-online.txt"
# Adguard Home (#19, #22)
cat "malware-domains.txt" | \
sed "s/^/||/g" | \
sed "s/$/^/g" > "malware-domains-adguard-home.txt"
sed "s/^/||/" | \
sed "s/$/^/" | \
sort -u > "malware-domains-adguard-home.txt"
cat "malware-domains-online.txt" | \
sed "s/^/||/g" | \
sed "s/$/^/g" > "malware-domains-online-adguard-home.txt"
sed "s/^/||/" | \
sed "s/$/^/" > "malware-domains-online-adguard-home.txt"
cat "malware-domains-adguard-home.txt" | \
sort | \
sed "1i $COMMENT_ABP" | \
sed "1s/Blocklist/Blocklist (AdGuard Home)/" > "../public/urlhaus-filter-agh.txt"
cat "malware-domains-online-adguard-home.txt" | \
sort | \
sed "1i $COMMENT_ABP" | \
sed "1s/Malicious/Online Malicious/" | \
sed "1s/Blocklist/Blocklist (AdGuard Home)/" > "../public/urlhaus-filter-agh-online.txt"
@ -263,20 +246,18 @@ sed "1s/Blocklist/Blocklist (AdGuard Home)/" > "../public/urlhaus-filter-agh-onl
# Adguard browser extension
cat "malware-domains.txt" | \
sed "s/^/||/g" | \
sed "s/$/\$all/g" > "malware-domains-adguard.txt"
sed "s/^/||/" | \
sed 's/$/^$all/' > "malware-domains-adguard.txt"
cat "malware-domains-online.txt" | \
sed "s/^/||/g" | \
sed "s/$/\$all/g" > "malware-domains-online-adguard.txt"
sed "s/^/||/" | \
sed 's/$/^$all/' > "malware-domains-online-adguard.txt"
cat "malware-domains-adguard.txt" "malware-url-top-domains.txt" | \
sort | \
sed "1i $COMMENT_ABP" | \
sed "1s/Blocklist/Blocklist (AdGuard)/" > "../public/urlhaus-filter-ag.txt"
cat "malware-domains-online-adguard.txt" "malware-url-top-domains-online.txt" | \
sort | \
sed "1i $COMMENT_ABP" | \
sed "1s/Malicious/Online Malicious/" | \
sed "1s/Blocklist/Blocklist (AdGuard)/" > "../public/urlhaus-filter-ag-online.txt"
@ -284,22 +265,20 @@ sed "1s/Blocklist/Blocklist (AdGuard)/" > "../public/urlhaus-filter-ag-online.tx
# Vivaldi
cat "malware-domains.txt" | \
sed "s/^/||/g" | \
sed "s/$/\$document/g" > "malware-domains-vivaldi.txt"
sed "s/^/||/" | \
sed 's/$/^$document/' > "malware-domains-vivaldi.txt"
cat "malware-domains-online.txt" | \
sed "s/^/||/g" | \
sed "s/$/\$document/g" > "malware-domains-online-vivaldi.txt"
sed "s/^/||/" | \
sed 's/$/^$document/' > "malware-domains-online-vivaldi.txt"
cat "malware-domains-vivaldi.txt" "malware-url-top-domains.txt" | \
sed "s/\$all$/\$document/g" | \
sort | \
sed 's/\$all$/$document/' | \
sed "1i $COMMENT_ABP" | \
sed "1s/Blocklist/Blocklist (Vivaldi)/" > "../public/urlhaus-filter-vivaldi.txt"
cat "malware-domains-online-vivaldi.txt" "malware-url-top-domains-online.txt" | \
sed "s/\$all$/\$document/g" | \
sort | \
sed 's/\$all$/$document/' | \
sed "1i $COMMENT_ABP" | \
sed "1s/Malicious/Online Malicious/" | \
sed "1s/Blocklist/Blocklist (Vivaldi)/" > "../public/urlhaus-filter-vivaldi-online.txt"
@ -307,67 +286,65 @@ sed "1s/Blocklist/Blocklist (Vivaldi)/" > "../public/urlhaus-filter-vivaldi-onli
## Domains-only blocklist
# awk + head is a workaround for sed prepend
COMMENT=$(printf "$COMMENT_ABP" | sed "s/^!/#/g" | sed "1s/URL/Domains/" | awk '{printf "%s\\n", $0}' | head -c -2)
COMMENT=$(printf "$COMMENT_ABP" | sed "s/^!/#/" | sed "1s/URL/Domains/" | awk '{printf "%s\\n", $0}' | head -c -2)
COMMENT_ONLINE=$(printf "$COMMENT" | sed "1s/Malicious/Online Malicious/" | awk '{printf "%s\\n", $0}' | head -c -2)
cat "malware-domains.txt" | \
sort | \
sed "1i $COMMENT" > "../public/urlhaus-filter-domains.txt"
cat "malware-domains-online.txt" | \
sort | \
sed "1i $COMMENT_ONLINE" > "../public/urlhaus-filter-domains-online.txt"
## Hosts only
cat "malware-domains.txt" | \
sort | \
# Remove IPv4 address
grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" > "malware-hosts.txt"
# exclude IPv4
grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" | \
# exclude IPv6
grep -vE "^\[" > "malware-hosts.txt"
cat "malware-domains-online.txt" | \
sort | \
# Remove IPv4 address
grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" > "malware-hosts-online.txt"
grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" | \
grep -vE "^\[" > "malware-hosts-online.txt"
## Hosts file blocklist
cat "malware-hosts.txt" | \
sed "s/^/0.0.0.0 /g" | \
sed "s/^/0.0.0.0 /" | \
# Re-insert comment
sed "1i $COMMENT" | \
sed "1s/Domains/Hosts/" > "../public/urlhaus-filter-hosts.txt"
cat "malware-hosts-online.txt" | \
sed "s/^/0.0.0.0 /g" | \
sed "s/^/0.0.0.0 /" | \
sed "1i $COMMENT_ONLINE" | \
sed "1s/Domains/Hosts/" > "../public/urlhaus-filter-hosts-online.txt"
## Dnsmasq-compatible blocklist
cat "malware-hosts.txt" | \
sed "s/^/address=\//g" | \
sed "s/$/\/0.0.0.0/g" | \
sed "s/^/address=\//" | \
sed "s/$/\/0.0.0.0/" | \
sed "1i $COMMENT" | \
sed "1s/Blocklist/dnsmasq Blocklist/" > "../public/urlhaus-filter-dnsmasq.conf"
cat "malware-hosts-online.txt" | \
sed "s/^/address=\//g" | \
sed "s/$/\/0.0.0.0/g" | \
sed "s/^/address=\//" | \
sed "s/$/\/0.0.0.0/" | \
sed "1i $COMMENT_ONLINE" | \
sed "1s/Blocklist/dnsmasq Blocklist/" > "../public/urlhaus-filter-dnsmasq-online.conf"
## BIND-compatible blocklist
cat "malware-hosts.txt" | \
sed 's/^/zone "/g' | \
sed 's/$/" { type master; notify no; file "null.zone.file"; };/g' | \
sed 's/^/zone "/' | \
sed 's/$/" { type master; notify no; file "null.zone.file"; };/' | \
sed "1i $COMMENT" | \
sed "1s/Blocklist/BIND Blocklist/" > "../public/urlhaus-filter-bind.conf"
cat "malware-hosts-online.txt" | \
sed 's/^/zone "/g' | \
sed 's/$/" { type master; notify no; file "null.zone.file"; };/g' | \
sed 's/^/zone "/' | \
sed 's/$/" { type master; notify no; file "null.zone.file"; };/' | \
sed "1i $COMMENT_ONLINE" | \
sed "1s/Blocklist/BIND Blocklist/" > "../public/urlhaus-filter-bind-online.conf"
@ -377,30 +354,30 @@ CURRENT_UNIX_TIME="$(date +%s)"
RPZ_SYNTAX="\n\$TTL 30\n@ IN SOA localhost. root.localhost. $CURRENT_UNIX_TIME 86400 3600 604800 30\n NS localhost.\n"
cat "malware-hosts.txt" | \
sed "s/$/ CNAME ./g" | \
sed "s/$/ CNAME ./" | \
sed '1 i\'"$RPZ_SYNTAX"'' | \
sed "1i $COMMENT" | \
sed "s/^#/;/g" | \
sed "s/^#/;/" | \
sed "1s/Blocklist/RPZ Blocklist/" > "../public/urlhaus-filter-rpz.conf"
cat "malware-hosts-online.txt" | \
sed "s/$/ CNAME ./g" | \
sed "s/$/ CNAME ./" | \
sed '1 i\'"$RPZ_SYNTAX"'' | \
sed "1i $COMMENT_ONLINE" | \
sed "s/^#/;/g" | \
sed "s/^#/;/" | \
sed "1s/Blocklist/RPZ Blocklist/" > "../public/urlhaus-filter-rpz-online.conf"
## Unbound-compatible blocklist
cat "malware-hosts.txt" | \
sed 's/^/local-zone: "/g' | \
sed 's/$/" always_nxdomain/g' | \
sed 's/^/local-zone: "/' | \
sed 's/$/" always_nxdomain/' | \
sed "1i $COMMENT" | \
sed "1s/Blocklist/Unbound Blocklist/" > "../public/urlhaus-filter-unbound.conf"
cat "malware-hosts-online.txt" | \
sed 's/^/local-zone: "/g' | \
sed 's/$/" always_nxdomain/g' | \
sed 's/^/local-zone: "/' | \
sed 's/$/" always_nxdomain/' | \
sed "1i $COMMENT_ONLINE" | \
sed "1s/Blocklist/Unbound Blocklist/" > "../public/urlhaus-filter-unbound-online.conf"
@ -415,28 +392,37 @@ cat "malware-hosts-online.txt" | \
sed "1i $COMMENT_ONLINE" | \
sed "1s/Domains/Names/" > "../public/urlhaus-filter-dnscrypt-blocked-names-online.txt"
# IPv4-based
cat "malware-domains.txt" | \
sort | \
grep -E "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" | \
sed "1i $COMMENT" | \
sed "1s/Domains/IPs/" > "../public/urlhaus-filter-dnscrypt-blocked-ips.txt"
# IPv4/6
# A line is an IP entry when it is a dotted IPv4 address or starts with "["
# (bracketed IPv6). The same pattern is used for the guard and for the filter
# so that IPv6 entries actually reach the bracket-stripping sed.
# The else branch feeds one blank line so "sed 1i" can still insert the header.
if grep -Eq "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" "malware-domains.txt"; then
  cat "malware-domains.txt" | \
  grep -E "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" | \
  sed -r "s/\[|\]//g" | \
  sed "1i $COMMENT" | \
  sed "1s/Domains/IPs/" > "../public/urlhaus-filter-dnscrypt-blocked-ips.txt"

  cat "malware-domains-online.txt" | \
  grep -E "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" | \
  sed -r "s/\[|\]//g" | \
  sed "1i $COMMENT_ONLINE" | \
  sed "1s/Domains/IPs/" > "../public/urlhaus-filter-dnscrypt-blocked-ips-online.txt"
else
  echo | \
  sed "1i $COMMENT" | \
  sed "1s/Domains/IPs/" > "../public/urlhaus-filter-dnscrypt-blocked-ips.txt"

  echo | \
  sed "1i $COMMENT_ONLINE" | \
  sed "1s/Domains/IPs/" > "../public/urlhaus-filter-dnscrypt-blocked-ips-online.txt"
fi
## Wildcard subdomain
cat "malware-domains.txt" | \
sed "s/^/*./g" | \
sed "s/^/*./" | \
sed "1i $COMMENT" | \
sed "1s/Blocklist/Wildcard Asterisk Blocklist/" > "../public/urlhaus-filter-wildcard.txt"
# Online variant: use the online header, as the other *-online lists do
cat "malware-domains-online.txt" | \
sed "s/^/*./" | \
sed "1i $COMMENT_ONLINE" | \
sed "1s/Blocklist/Wildcard Asterisk Blocklist/" > "../public/urlhaus-filter-wildcard-online.txt"
@ -468,12 +454,12 @@ COMMENT_IE="msFilterList\n$COMMENT\n: Expires=1\n#"
COMMENT_ONLINE_IE="msFilterList\n$COMMENT_ONLINE\n: Expires=1\n#"
cat "malware-hosts.txt" | \
sed "s/^/-d /g" | \
sed "s/^/-d /" | \
sed "1i $COMMENT_IE" | \
sed "2s/Domains Blocklist/Hosts Blocklist (IE)/" > "../public/urlhaus-filter.tpl"
cat "malware-hosts-online.txt" | \
sed "s/^/-d /g" | \
sed "s/^/-d /" | \
sed "1i $COMMENT_ONLINE_IE" | \
sed "2s/Domains Blocklist/Hosts Blocklist (IE)/" > "../public/urlhaus-filter-online.tpl"