From fd2659431ecad6d6a7d0233e4cf2512fb6718e78 Mon Sep 17 00:00:00 2001
From: MDLeom <2809763-curben@users.noreply.gitlab.com>
Date: Sun, 23 Mar 2025 04:17:27 +0000
Subject: [PATCH] feat: more robust url parsing

percent-encode, hostname/pathname/query-string parsing
- increase nodejs requirement to v20 due to URL.canParse()
- IPv6 support
- include query string in IDS blocklists
  * and escape ; on query string
- add address separator to adblock filters
  * https://gitlab.com/malware-filter/phishing-filter/-/commit/f4377f1fe62de99cb5b82f4bb5a566adbad94d02
- remove unnecessary global flag in sed
- remove unnecessary sort
- clean tmp at start
---
Review notes (ignored by git-am, not part of the commit message):
- src/clean_url.js: input lines without a "scheme://host" part are skipped;
  previously `line.split('/')[2]` was undefined for them and `.includes()`
  threw, aborting the whole stream.
- src/script.sh: use `rm -rf "tmp/"`; plain `rm` refuses to remove a directory
  and errors when the path is missing, so the folder was never cleaned.
- src/script.sh: the dnscrypt blocked-ips guard now probes
  "malware-domains.txt"; "phishing-notop-domains.txt" is not produced by any
  hunk visible here and looks like a leftover from phishing-filter.
- src/script.sh: the blocked-ips extraction keeps bracketed IPv6 literals as
  well as IPv4, so the bracket-stripping sed that follows has input to act on.

 src/clean_url.js |  79 ++++++++++++++++++++++
 src/ids.js       |  20 ++++--
 src/script.sh    | 166 ++++++++++++++++++++++-------------------------
 3 files changed, 168 insertions(+), 97 deletions(-)
 create mode 100644 src/clean_url.js

diff --git a/src/clean_url.js b/src/clean_url.js
new file mode 100644
index 00000000..f9f7cd96
--- /dev/null
+++ b/src/clean_url.js
@@ -0,0 +1,79 @@
+'use strict'
+
+import { createInterface } from 'node:readline'
+
+const cleanHost = (hostname) => {
+  return hostname
+    // Remove invalid protocol, see #32
+    .replace(/^(https?:\/\/)(?:ttps:\/\/|https:\/|http\/)/, '$1')
+    .replace(/^(https?:\/\/)?www\./, '$1')
+}
+
+// nodejs does not percent-encode ^ yet
+// https://github.com/nodejs/node/issues/57313
+// Applies to path, exclude query string
+const caretPath = (pathname) => {
+  if (!pathname.includes('?')) return pathname.replaceAll('^', '%5E')
+
+  const pathArray = pathname.split('?')
+  const path = pathArray[0].replaceAll('^', '%5E')
+  const search = pathArray.slice(1).join('?')
+
+  return `${path}?${search}`
+}
+
+for await (const line of createInterface({ input: process.stdin, terminal: false })) {
+  // parse hostname from url
+  if (process.argv[2] === 'hostname') {
+    if (URL.canParse(`http://${line}`)) {
+      const url = new URL(`http://${line}`)
+
+      console.log(url.hostname)
+    } else {
+      const hostname = line
+        // host
+        .split('/')[0]
+        // exclude credential
+        .replace(/.*@(.+)/, '$1')
+        // exclude port
+        .replace(/:\d+$/, '')
+        // #2
+        .split('?')[0]
+
+      console.log(hostname)
+    }
+  } else {
+    // Skip lines without a host segment and invalid domains, see #15
+    if (!line.split('/')[2] || line.split('/')[2].includes('??')) continue
+
+    if (URL.canParse(line)) {
+      let url = new URL(cleanHost(line))
+
+      // Decode O365 Safelinks
+      // https://support.microsoft.com/en-us/office/advanced-outlook-com-security-for-microsoft-365-subscribers-882d2243-eab9-4545-a58a-b36fee4a46e2
+      if (url.hostname.endsWith('safelinks.protection.outlook.com')) {
+        url = new URL(url.searchParams.get('url'))
+      }
+
+      url.host = cleanHost(url.host)
+
+      // nodejs does not percent-encode ^ yet
+      // https://github.com/nodejs/node/issues/57313
+      url.pathname = caretPath(url.pathname)
+      const outUrl = `${url.host}${url.pathname}${url.search}`
+        // remove trailing slash from domain except path
+        .replace(/(^[^/]*)\/+$/, '$1')
+
+      console.log(outUrl)
+    } else {
+      const outUrl = caretPath(cleanHost(line
+        // remove protocol
+        .split('/').slice(2).join('/')))
+        // url encode space
+        .replaceAll(' ', '%20')
+        .replace(/(^[^/]*)\/+$/, '$1')
+
+      console.log(outUrl)
+    }
+  }
+}
diff --git a/src/ids.js b/src/ids.js
index c13ab9bc..49f6ea5c 100644
--- a/src/ids.js
+++ b/src/ids.js
@@ -33,14 +33,20 @@ for await (const domain of domains.readLines()) {
 }
 
 for await (const line of urls.readLines()) {
+  if (!URL.canParse(`http://${line}`)) {
+    console.error(`Invalid URL: ${line}`)
+    continue
+  }
+
   const url = new URL(`http://${line}`)
-  const { hostname } = url
-  let pathname = url.pathname.replace(';', '\\;')
-  snort2.write(`alert tcp $HOME_NET any -> $EXTERNAL_NET [80,443] (msg:"urlhaus-filter malicious website detected"; flow:established,from_client; content:"GET"; http_method; content:"${pathname.substring(0, 2048)}"; http_uri; nocase; content:"${hostname}"; content:"Host"; http_header; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
-  snort3.write(`alert http $HOME_NET any -> $EXTERNAL_NET any (msg:"urlhaus-filter malicious website detected"; http_header:field host; content:"${hostname}",nocase; http_uri; content:"${pathname}",nocase; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
-  suricata.write(`alert http $HOME_NET any -> $EXTERNAL_NET any (msg:"urlhaus-filter malicious website detected"; flow:established,from_client; http.method; content:"GET"; http.uri; content:"${pathname}"; endswith; nocase; http.host; content:"${hostname}"; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
-  pathname = url.pathname
-  splunk.write(`"${hostname}","${pathname}","urlhaus-filter malicious website detected","${process.env.CURRENT_TIME}"\n`)
+  const { hostname, pathname, search } = url
+  const pathEscape = `${pathname}${search}`.replaceAll(';', '\\;')
+  const path = pathname + search
+
+  snort2.write(`alert tcp $HOME_NET any -> $EXTERNAL_NET [80,443] (msg:"urlhaus-filter malicious website detected"; flow:established,from_client; content:"GET"; http_method; content:"${pathEscape.substring(0, 2048)}"; http_uri; nocase; content:"${hostname}"; content:"Host"; http_header; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
+  snort3.write(`alert http $HOME_NET any -> $EXTERNAL_NET any (msg:"urlhaus-filter malicious website detected"; http_header:field host; content:"${hostname}",nocase; http_uri; content:"${pathEscape}",nocase; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
+  suricata.write(`alert http $HOME_NET any -> $EXTERNAL_NET any (msg:"urlhaus-filter malicious website detected"; flow:established,from_client; http.method; content:"GET"; http.uri; content:"${pathEscape}"; endswith; nocase; http.host; content:"${hostname}"; classtype:trojan-activity; sid:${sid}; rev:1;)\n`)
+  splunk.write(`"${hostname}","${path}","urlhaus-filter malicious website detected","${process.env.CURRENT_TIME}"\n`)
 
   sid++
 }
diff --git a/src/script.sh b/src/script.sh
index d93a6964..b6ab65dd 100644
--- a/src/script.sh
+++ b/src/script.sh
@@ -65,6 +65,7 @@ else
 fi
 
 ## Create a temporary working folder
+rm -rf "tmp/"
 mkdir -p "tmp/"
 cd "tmp/"
 
@@ -97,7 +98,7 @@ if [ -n "$CF_API" ]; then
   dos2unix | \
   tr "[:upper:]" "[:lower:]" | \
   grep -F "." | \
-  sed "s/^www\.//g" | \
+  sed "s/^www\.//" | \
   sort -u > "top-1m-radar.txt"
 fi
 
@@ -114,38 +115,23 @@ sed "/^#/d" > "URLhaus.csv"
 ## Parse URLs
 cat "URLhaus.csv" | \
 cut -f 6 -d '"' | \
-cut -f 3- -d "/" | \
-# Domain must have at least a 'dot'
-grep -F "." | \
-# Remove invalid protocol, see #32
-sed -E "s/^(ttps:\/\/|https:\/|http\/)//g" | \
-# Remove www.
-sed "s/^www\.//g" | \
+node "../src/clean_url.js" | \
 sort -u > "urlhaus.txt"
 
 ## Parse domain and IP address only
 cat "urlhaus.txt" | \
-cut -f 1 -d "/" | \
-# Remove port
-cut -f 1 -d ":" | \
-# Remove invalid domains, see #15
-grep -vF "??" | \
-cut -f 1 -d "?" | \
+node "../src/clean_url.js" hostname | \
 sort -u > "urlhaus-domains.txt"
 
 ## Parse online URLs only
 cat "URLhaus.csv" | \
 grep -F '"online"' | \
 cut -f 6 -d '"' | \
-cut -f 3- -d "/" | \
-sed "s/^www\.//g" | \
+node "../src/clean_url.js" | \
 sort -u > "urlhaus-online.txt"
 
 cat "urlhaus-online.txt" | \
-cut -f 1 -d "/" | \
-cut -f 1 -d ":" | \
-grep -vF "??" | \
-cut -f 1 -d "?" | \
+node "../src/clean_url.js" hostname | \
 sort -u > "urlhaus-domains-online.txt"
 
 
@@ -157,7 +143,7 @@ tr "[:upper:]" "[:lower:]" | \
 cut -f 2 -d "," | \
 grep -F "." | \
 # Remove www.
-sed "s/^www\.//g" | \
+sed "s/^www\.//" | \
 sort -u > "top-1m-umbrella.txt"
 
 ## Parse the Tranco 1 Million
@@ -169,7 +155,7 @@ if [ -n "$(file 'top-1m-tranco.zip' | grep 'Zip archive data')" ]; then
   cut -f 2 -d "," | \
   grep -F "." | \
   # Remove www.
-  sed "s/^www\.//g" | \
+  sed "s/^www\.//" | \
   sort -u > "top-1m-tranco.txt"
 else
   # tranco has unreliable download
@@ -206,13 +192,13 @@ sed "/^$/d" > "malware-domains-online.txt"
 ## Parse malware URLs from popular domains
 cat "urlhaus.txt" | \
 grep -F -f "urlhaus-top-domains.txt" | \
-sed "s/^/||/g" | \
-sed "s/$/\$all/g" > "malware-url-top-domains.txt"
+sed "s/^/||/" | \
+sed 's/$/^$all/' > "malware-url-top-domains.txt"
 
 cat "urlhaus-online.txt" | \
 grep -F -f "urlhaus-top-domains.txt" | \
-sed "s/^/||/g" | \
-sed "s/$/\$all/g" > "malware-url-top-domains-online.txt"
+sed "s/^/||/" | \
+sed 's/$/^$all/' > "malware-url-top-domains-online.txt"
 
 cat "urlhaus-online.txt" | \
 grep -F -f "urlhaus-top-domains.txt" > "malware-url-top-domains-raw-online.txt"
@@ -231,31 +217,28 @@ COMMENT_ABP="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n
 mkdir -p "../public/"
 
 cat "malware-domains.txt" "malware-url-top-domains.txt" | \
-sort | \
 sed "1i $COMMENT_ABP" > "../public/urlhaus-filter.txt"
 
 cat "malware-domains-online.txt" "malware-url-top-domains-online.txt" | \
-sort | \
 sed "1i $COMMENT_ABP" | \
 sed "1s/Malicious/Online Malicious/" > "../public/urlhaus-filter-online.txt"
 
 
 # Adguard Home (#19, #22)
 cat "malware-domains.txt" | \
-sed "s/^/||/g" | \
-sed "s/$/^/g" > "malware-domains-adguard-home.txt"
+sed "s/^/||/" | \
+sed "s/$/^/" | \
+sort -u > "malware-domains-adguard-home.txt"
 
 cat "malware-domains-online.txt" | \
-sed "s/^/||/g" | \
-sed "s/$/^/g" > "malware-domains-online-adguard-home.txt"
+sed "s/^/||/" | \
+sed "s/$/^/" > "malware-domains-online-adguard-home.txt"
 
 cat "malware-domains-adguard-home.txt" | \
-sort | \
 sed "1i $COMMENT_ABP" | \
 sed "1s/Blocklist/Blocklist (AdGuard Home)/" > "../public/urlhaus-filter-agh.txt"
 
 cat "malware-domains-online-adguard-home.txt" | \
-sort | \
 sed "1i $COMMENT_ABP" | \
 sed "1s/Malicious/Online Malicious/" | \
 sed "1s/Blocklist/Blocklist (AdGuard Home)/" > "../public/urlhaus-filter-agh-online.txt"
@@ -263,20 +246,18 @@ sed "1s/Blocklist/Blocklist (AdGuard Home)/" > "../public/urlhaus-filter-agh-onl
 
 # Adguard browser extension
 cat "malware-domains.txt" | \
-sed "s/^/||/g" | \
-sed "s/$/\$all/g" > "malware-domains-adguard.txt"
+sed "s/^/||/" | \
+sed 's/$/^$all/' > "malware-domains-adguard.txt"
 
 cat "malware-domains-online.txt" | \
-sed "s/^/||/g" | \
-sed "s/$/\$all/g" > "malware-domains-online-adguard.txt"
+sed "s/^/||/" | \
+sed 's/$/^$all/' > "malware-domains-online-adguard.txt"
 
 cat "malware-domains-adguard.txt" "malware-url-top-domains.txt" | \
-sort | \
 sed "1i $COMMENT_ABP" | \
 sed "1s/Blocklist/Blocklist (AdGuard)/" > "../public/urlhaus-filter-ag.txt"
 
 cat "malware-domains-online-adguard.txt" "malware-url-top-domains-online.txt" | \
-sort | \
 sed "1i $COMMENT_ABP" | \
 sed "1s/Malicious/Online Malicious/" | \
 sed "1s/Blocklist/Blocklist (AdGuard)/" > "../public/urlhaus-filter-ag-online.txt"
@@ -284,22 +265,20 @@ sed "1s/Blocklist/Blocklist (AdGuard)/" > "../public/urlhaus-filter-ag-online.tx
 
 # Vivaldi
 cat "malware-domains.txt" | \
-sed "s/^/||/g" | \
-sed "s/$/\$document/g" > "malware-domains-vivaldi.txt"
+sed "s/^/||/" | \
+sed 's/$/^$document/' > "malware-domains-vivaldi.txt"
 
 cat "malware-domains-online.txt" | \
-sed "s/^/||/g" | \
-sed "s/$/\$document/g" > "malware-domains-online-vivaldi.txt"
+sed "s/^/||/" | \
+sed 's/$/^$document/' > "malware-domains-online-vivaldi.txt"
 
 cat "malware-domains-vivaldi.txt" "malware-url-top-domains.txt" | \
-sed "s/\$all$/\$document/g" | \
-sort | \
+sed 's/\$all$/$document/' | \
 sed "1i $COMMENT_ABP" | \
 sed "1s/Blocklist/Blocklist (Vivaldi)/" > "../public/urlhaus-filter-vivaldi.txt"
 
 cat "malware-domains-online-vivaldi.txt" "malware-url-top-domains-online.txt" | \
-sed "s/\$all$/\$document/g" | \
-sort | \
+sed 's/\$all$/$document/' | \
 sed "1i $COMMENT_ABP" | \
 sed "1s/Malicious/Online Malicious/" | \
 sed "1s/Blocklist/Blocklist (Vivaldi)/" > "../public/urlhaus-filter-vivaldi-online.txt"
@@ -307,67 +286,65 @@ sed "1s/Blocklist/Blocklist (Vivaldi)/" > "../public/urlhaus-filter-vivaldi-onli
 
 ## Domains-only blocklist
 # awk + head is a workaround for sed prepend
-COMMENT=$(printf "$COMMENT_ABP" | sed "s/^!/#/g" | sed "1s/URL/Domains/" | awk '{printf "%s\\n", $0}' | head -c -2)
+COMMENT=$(printf "$COMMENT_ABP" | sed "s/^!/#/" | sed "1s/URL/Domains/" | awk '{printf "%s\\n", $0}' | head -c -2)
 COMMENT_ONLINE=$(printf "$COMMENT" | sed "1s/Malicious/Online Malicious/" | awk '{printf "%s\\n", $0}' | head -c -2)
 
 cat "malware-domains.txt" | \
-sort | \
 sed "1i $COMMENT" > "../public/urlhaus-filter-domains.txt"
 
 cat "malware-domains-online.txt" | \
-sort | \
 sed "1i $COMMENT_ONLINE" > "../public/urlhaus-filter-domains-online.txt"
 
 
 ## Hosts only
 cat "malware-domains.txt" | \
-sort | \
-# Remove IPv4 address
-grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" > "malware-hosts.txt"
+# exclude IPv4
+grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" | \
+# exclude IPv6
+grep -vE "^\[" > "malware-hosts.txt"
 
 cat "malware-domains-online.txt" | \
-sort | \
-# Remove IPv4 address
-grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" > "malware-hosts-online.txt"
+grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" | \
+grep -vE "^\[" > "malware-hosts-online.txt"
 
 
 ## Hosts file blocklist
 cat "malware-hosts.txt" | \
-sed "s/^/0.0.0.0 /g" | \
+sed "s/^/0.0.0.0 /" | \
 # Re-insert comment
 sed "1i $COMMENT" | \
 sed "1s/Domains/Hosts/" > "../public/urlhaus-filter-hosts.txt"
 
 cat "malware-hosts-online.txt" | \
-sed "s/^/0.0.0.0 /g" | \
+sed "s/^/0.0.0.0 /" | \
 sed "1i $COMMENT_ONLINE" | \
 sed "1s/Domains/Hosts/" > "../public/urlhaus-filter-hosts-online.txt"
 
 
 ## Dnsmasq-compatible blocklist
 cat "malware-hosts.txt" | \
-sed "s/^/address=\//g" | \
-sed "s/$/\/0.0.0.0/g" | \
+sed "s/^/address=\//" | \
+sed "s/$/\/0.0.0.0/" | \
 sed "1i $COMMENT" | \
 sed "1s/Blocklist/dnsmasq Blocklist/" > "../public/urlhaus-filter-dnsmasq.conf"
 
 cat "malware-hosts-online.txt" | \
-sed "s/^/address=\//g" | \
-sed "s/$/\/0.0.0.0/g" | \
+sed "s/^/address=\//" | \
+sed "s/$/\/0.0.0.0/" | \
 sed "1i $COMMENT_ONLINE" | \
 sed "1s/Blocklist/dnsmasq Blocklist/" > "../public/urlhaus-filter-dnsmasq-online.conf"
 
 
 ## BIND-compatible blocklist
 cat "malware-hosts.txt" | \
-sed 's/^/zone "/g' | \
-sed 's/$/" { type master; notify no; file "null.zone.file"; };/g' | \
+sed 's/^/zone "/' | \
+sed 's/$/" { type master; notify no; file "null.zone.file"; };/' | \
 sed "1i $COMMENT" | \
 sed "1s/Blocklist/BIND Blocklist/" > "../public/urlhaus-filter-bind.conf"
 
 cat "malware-hosts-online.txt" | \
-sed 's/^/zone "/g' | \
-sed 's/$/" { type master; notify no; file "null.zone.file"; };/g' | \
+sed 's/^/zone "/' | \
+sed 's/$/" { type master; notify no; file "null.zone.file"; };/' | \
 sed "1i $COMMENT_ONLINE" | \
 sed "1s/Blocklist/BIND Blocklist/" > "../public/urlhaus-filter-bind-online.conf"
 
@@ -377,30 +354,30 @@ CURRENT_UNIX_TIME="$(date +%s)"
 RPZ_SYNTAX="\n\$TTL 30\n@ IN SOA localhost. root.localhost. $CURRENT_UNIX_TIME 86400 3600 604800 30\n NS localhost.\n"
 
 cat "malware-hosts.txt" | \
-sed "s/$/ CNAME ./g" | \
+sed "s/$/ CNAME ./" | \
 sed '1 i\'"$RPZ_SYNTAX"'' | \
 sed "1i $COMMENT" | \
-sed "s/^#/;/g" | \
+sed "s/^#/;/" | \
 sed "1s/Blocklist/RPZ Blocklist/" > "../public/urlhaus-filter-rpz.conf"
 
 cat "malware-hosts-online.txt" | \
-sed "s/$/ CNAME ./g" | \
+sed "s/$/ CNAME ./" | \
 sed '1 i\'"$RPZ_SYNTAX"'' | \
 sed "1i $COMMENT_ONLINE" | \
-sed "s/^#/;/g" | \
+sed "s/^#/;/" | \
 sed "1s/Blocklist/RPZ Blocklist/" > "../public/urlhaus-filter-rpz-online.conf"
 
 
 ## Unbound-compatible blocklist
 cat "malware-hosts.txt" | \
-sed 's/^/local-zone: "/g' | \
-sed 's/$/" always_nxdomain/g' | \
+sed 's/^/local-zone: "/' | \
+sed 's/$/" always_nxdomain/' | \
 sed "1i $COMMENT" | \
 sed "1s/Blocklist/Unbound Blocklist/" > "../public/urlhaus-filter-unbound.conf"
 
 cat "malware-hosts-online.txt" | \
-sed 's/^/local-zone: "/g' | \
-sed 's/$/" always_nxdomain/g' | \
+sed 's/^/local-zone: "/' | \
+sed 's/$/" always_nxdomain/' | \
 sed "1i $COMMENT_ONLINE" | \
 sed "1s/Blocklist/Unbound Blocklist/" > "../public/urlhaus-filter-unbound-online.conf"
 
@@ -415,28 +392,37 @@ cat "malware-hosts-online.txt" | \
 sed "1i $COMMENT_ONLINE" | \
 sed "1s/Domains/Names/" > "../public/urlhaus-filter-dnscrypt-blocked-names-online.txt"
 
-# IPv4-based
-cat "malware-domains.txt" | \
-sort | \
-grep -E "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" | \
-sed "1i $COMMENT" | \
-sed "1s/Domains/IPs/" > "../public/urlhaus-filter-dnscrypt-blocked-ips.txt"
+# IPv4/6
+if grep -Eq "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" "malware-domains.txt"; then
+  cat "malware-domains.txt" | \
+  grep -E "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" | \
+  sed -r "s/\[|\]//g" | \
+  sed "1i $COMMENT" | \
+  sed "1s/Domains/IPs/" > "../public/urlhaus-filter-dnscrypt-blocked-ips.txt"
 
-cat "malware-domains-online.txt" | \
-sort | \
-grep -E "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" | \
-sed "1i $COMMENT_ONLINE" | \
-sed "1s/Domains/IPs/" > "../public/urlhaus-filter-dnscrypt-blocked-ips-online.txt"
+  cat "malware-domains-online.txt" | \
+  grep -E "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" | \
+  sed -r "s/\[|\]//g" | \
+  sed "1i $COMMENT_ONLINE" | \
+  sed "1s/Domains/IPs/" > "../public/urlhaus-filter-dnscrypt-blocked-ips-online.txt"
+else
+  echo | \
+  sed "1i $COMMENT" | \
+  sed "1s/Domains/IPs/" > "../public/urlhaus-filter-dnscrypt-blocked-ips.txt"
+  echo | \
+  sed "1i $COMMENT_ONLINE" | \
+  sed "1s/Domains/IPs/" > "../public/urlhaus-filter-dnscrypt-blocked-ips-online.txt"
+fi
 
 
 ## Wildcard subdomain
 cat "malware-domains.txt" | \
-sed "s/^/*./g" | \
+sed "s/^/*./" | \
 sed "1i $COMMENT" | \
 sed "1s/Blocklist/Wildcard Asterisk Blocklist/" > "../public/urlhaus-filter-wildcard.txt"
 
 cat "malware-domains-online.txt" | \
-sed "s/^/*./g" | \
+sed "s/^/*./" | \
 sed "1i $COMMENT" | \
 sed "1s/Blocklist/Wildcard Asterisk Blocklist/" > "../public/urlhaus-filter-wildcard-online.txt"
 
@@ -468,12 +454,12 @@ COMMENT_IE="msFilterList\n$COMMENT\n: Expires=1\n#"
 COMMENT_ONLINE_IE="msFilterList\n$COMMENT_ONLINE\n: Expires=1\n#"
 
 cat "malware-hosts.txt" | \
-sed "s/^/-d /g" | \
+sed "s/^/-d /" | \
 sed "1i $COMMENT_IE" | \
 sed "2s/Domains Blocklist/Hosts Blocklist (IE)/" > "../public/urlhaus-filter.tpl"
 
 cat "malware-hosts-online.txt" | \
-sed "s/^/-d /g" | \
+sed "s/^/-d /" | \
 sed "1i $COMMENT_ONLINE_IE" | \
 sed "2s/Domains Blocklist/Hosts Blocklist (IE)/" > "../public/urlhaus-filter-online.tpl"
 