diff --git a/package.json b/package.json
index 2f0047a7..1ba27753 100644
--- a/package.json
+++ b/package.json
@@ -8,7 +8,7 @@
     "unzipper": "^0.12.3"
   },
   "engines": {
-    "node": ">= 18.12.0"
+    "node": ">= 20.9.0"
   },
   "type": "module"
 }
diff --git a/src/clean_url.js b/src/clean_url.js
new file mode 100644
index 00000000..ad927e49
--- /dev/null
+++ b/src/clean_url.js
@@ -0,0 +1,65 @@
+// Clean URLs read from stdin (one per line) and print the result to stdout.
+//   node clean_url.js           prints "host/path?query" without protocol and leading "www."
+//   node clean_url.js hostname  prints the hostname only
+import { createInterface } from "node:readline"
+
+for await (const line of createInterface({ input: process.stdin })) {
+  // parse hostname from url
+  if (process.argv[2] === 'hostname') {
+    if (URL.canParse(`http://${line}`)) {
+      const { hostname } = new URL(`http://${line}`)
+
+      console.log(hostname)
+    }
+    else {
+      const hostname = line
+        // host
+        .split('/')[0]
+        // exclude credential
+        .replace(/.*@(.+)/, '$1')
+        // exclude port
+        .replace(/:\d+$/, '')
+        // #2
+        .split('?')[0]
+
+      console.log(hostname)
+    }
+  } else {
+    if (URL.canParse(line)) {
+      let url = new URL(line)
+
+      // Decode O365 Safelinks
+      // https://support.microsoft.com/en-us/office/advanced-outlook-com-security-for-microsoft-365-subscribers-882d2243-eab9-4545-a58a-b36fee4a46e2
+      if (url.hostname.endsWith('safelinks.protection.outlook.com')) {
+        const target = url.searchParams.get('url')
+
+        // skip a Safelink whose 'url' parameter is missing or invalid,
+        // otherwise `new URL()` throws and aborts the whole run
+        if (!URL.canParse(target)) {
+          console.error(`skipping malformed safelink: ${line}`)
+          continue
+        }
+
+        url = new URL(target)
+      }
+
+      const outUrl = `${url.host.replace(/^www\./, '')}${url.pathname}${url.search}`
+        // remove trailing slash from domain except path #43
+        .replace(/(^[^\/]*)\/+$/, '$1')
+
+      console.log(outUrl)
+    }
+    else {
+      const outUrl = line
+        // remove protocol
+        .split('/').slice(2).join('/')
+        // remove www
+        .replace(/^www\./, '')
+        // url encode every space, not just the first one #11
+        .replaceAll(' ', '%20')
+        .replace(/(^[^\/]*)\/+$/, '$1')
+
+      console.log(outUrl)
+    }
+  }
+}
diff --git a/src/safelinks.js b/src/safelinks.js
deleted file mode 100644
index d1ddca77..00000000
--- a/src/safelinks.js
+++ /dev/null
@@ -1,9 +0,0 @@
-// Decode O365 Safelinks
-// https://support.microsoft.com/en-us/office/advanced-outlook-com-security-for-microsoft-365-subscribers-882d2243-eab9-4545-a58a-b36fee4a46e2
-import { createInterface } from "node:readline"
-
-for await (const line of createInterface({ input: process.stdin })) {
-  const inputUrl = new URL(`http://${line}`)
-  const outputUrl = new URL(inputUrl.searchParams.get('url'))
-  console.log(`${outputUrl.host}${outputUrl.pathname}${outputUrl.search}`)
-}
diff --git a/src/script.sh b/src/script.sh
index e5a8c0c2..a346c85c 100644
--- a/src/script.sh
+++ b/src/script.sh
@@ -143,14 +143,7 @@ if [ -n "$(file 'phishtank.bz2' | grep 'bzip2 compressed data')" ]; then
   cut -f 2 -d "," | \
   "./$CSVQUOTE" -u | \
   sed 's/"//g' | \
-  cut -f 3- -d "/" | \
-  # Domain must have at least a 'dot'
-  grep -F "." | \
-  sed "s/^www\.//g" | \
-  # url encode space #11
-  sed "s/ /%20/g" | \
-  # remove trailing slash from domain except path #43
-  sed -r "s/(^[^\/]*)\/+$/\1/g" | \
+  node "../src/clean_url.js" | \
   sort -u > "phishtank.txt"
 else
   # cloudflare may impose captcha
@@ -161,11 +154,7 @@ fi
 cat "openphish-raw.txt" | \
 dos2unix | \
 tr "[:upper:]" "[:lower:]" | \
-cut -f 3- -d "/" | \
-grep -F "." | \
-sed "s/^www\.//g" | \
-sed "s/ /%20/g" | \
-sed -r "s/(^[^\/]*)\/+$/\1/g" | \
+node "../src/clean_url.js" | \
 sort -u > "openphish.txt"
 
 gzip -dc "ipthreat.gz" | \
@@ -173,41 +162,19 @@ gzip -dc "ipthreat.gz" | \
 sed "/^#/d" | \
 sed "s/ # .*//g" | \
 tr "[:upper:]" "[:lower:]" | \
-cut -f 3- -d "/" | \
-grep -F "." | \
-sed "s/^www\.//g" | \
-sed "s/ /%20/g" | \
-sed -r "s/(^[^\/]*)\/+$/\1/g" | \
+node "../src/clean_url.js" | \
 sort -u > "ipthreat.txt"
 
 ## Combine all sources
 cat "openphish.txt" "ipthreat.txt" "phishtank.txt" | \
-sort -u > "phishing-temp.txt"
+# remove blank lines
+sed "/^$/d" | \
+sort -u > "phishing.txt"
 
-## Parse O365 safelink
-safelinks="$(cat 'phishing-temp.txt' | grep -P '^(?:[a-z]{3}\d{2}\.)?safelinks\.protection\.outlook\.com' || [ $? = 1 ])"
-if [ -n "$safelinks" ]; then
-  echo "$safelinks" > "safelinks.txt"
-
-  cat "phishing-temp.txt" | \
-  grep -Fx -vf "safelinks.txt" > "phishing.txt"
-
-  cat "safelinks.txt" | \
-  node "../src/safelinks.js" | \
-  sed -r "s/(^[^\/]*)\/+$/\1/g" | \
-  sort -u >> "phishing.txt"
-else
-  cp "phishing-temp.txt" "phishing.txt"
-fi
 
 ## Parse domain and IP address only
 cat "phishing.txt" | \
-cut -f 1 -d "/" | \
-cut -f 1 -d ":" | \
-# #2
-cut -f 1 -d "?" | \
-# #91
-sed -r "s/.*@(.+)/\1/g" | \
+node "../src/clean_url.js" hostname | \
 sort -u > "phishing-domains.txt"
 
 
@@ -341,7 +308,10 @@ sort | \
 sed "1i $COMMENT" > "../public/phishing-filter-domains.txt"
 
 cat "phishing-notop-domains.txt" | \
-grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" > "phishing-notop-hosts.txt"
+# exclude IPv4
+grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" | \
+# exclude IPv6
+grep -vE "^\[" > "phishing-notop-hosts.txt"
 
 ## Hosts file blocklist
 cat "phishing-notop-hosts.txt" | \
@@ -393,11 +363,12 @@ cat "phishing-notop-hosts.txt" | \
 sed "1i $COMMENT" | \
 sed "1s/Domains/Names/" > "../public/phishing-filter-dnscrypt-blocked-names.txt"
 
-# IPv4-based
-if grep -Eq "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" "phishing-notop-domains.txt"; then
+# IPv4/6
+if grep -Eq "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" "phishing-notop-domains.txt"; then
   cat "phishing-notop-domains.txt" | \
   sort | \
-  grep -E "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" | \
+  grep -E "^(([0-9]{1,3}[\.]){3}[0-9]{1,3}$|\[)" | \
+  sed -r "s/\[|\]//g" | \
   sed "1i $COMMENT" | \
   sed "1s/Domains/IPs/" > "../public/phishing-filter-dnscrypt-blocked-ips.txt"
 else