#!/bin/sh
#
# Downloads the PhishTank and OpenPhish feeds and generates the phishing
# blocklists written to the dist/ folder.
#
# Environment:
#   PHISHTANK_API  key embedded in the PhishTank download URL (must be set,
#                  because of `set -u` below).
#
# Shell options:
#   -e  abort on the first failing command
#   -f  disable pathname expansion (globbing)
#   -u  treat expansion of an unset variable as an error
#   -x  trace every command to stderr
#   pipefail  a pipeline fails when any of its stages fails
# NOTE(review): `pipefail` is not available in every /bin/sh; this script
# needs a shell that implements it (busybox ash and bash do).
set -efux -o pipefail
## Detect Musl C library
# Pick the csvquote binary that matches the C library /bin/ls is linked to.
# grep exits 1 when "musl" does not appear in the ldd output; `|| [ "$?" = 1 ]`
# turns that expected miss into success so `set -e` does not abort, while any
# other exit status still fails the assignment.
LIBC="$(ldd /bin/ls | grep 'musl' || [ "$?" = 1 ])"
if [ -z "$LIBC" ]; then
  # NOTE(review): nothing in this script creates /tmp/musl.log; the reason for
  # removing it here is not visible from this file.
  rm -f "/tmp/musl.log"
  # Not Musl
  CSVQUOTE="../utils/csvquote-bin-glibc"
else
  # Musl
  CSVQUOTE="../utils/csvquote-bin-musl"
fi
## Create a temporary working folder
mkdir -p "tmp/"
cd "tmp/"

## Prepare datasets
# The PhishTank URL contains the API key. Pause command tracing around that one
# download so the key is not echoed to stderr, then restore the previous state.
case "$-" in
  *x*)
    xtrace_was_on=1
    set +x
    ;;
  *)
    xtrace_was_on=0
    ;;
esac
curl -L "https://data.phishtank.com/data/$PHISHTANK_API/online-valid.csv.bz2" -o "phishtank.bz2"
if [ "$xtrace_was_on" = 1 ]; then
  set -x
fi

curl -L "https://openphish.com/feed.txt" -o "openphish-raw.txt"
curl -L "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m-umbrella.zip"
curl -L "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m-tranco.zip"

# -k keeps the downloaded archive, -c writes the decompressed data to stdout.
bunzip2 -kc "phishtank.bz2" > "phishtank.csv"
## Parse URLs
# Extract column 2 of the PhishTank CSV and reduce each URL to
# "host[:port]/path" without the scheme and without a leading "www.".
## Workaround for column with double quotes
# NOTE(review): csvquote is an external helper selected earlier via $CSVQUOTE;
# presumably it masks delimiters inside quoted fields so `cut` can split on
# commas, and `-u` undoes the masking. Confirm against the csvquote docs.
"./$CSVQUOTE" < "phishtank.csv" |
  cut -f 2 -d "," |
  "./$CSVQUOTE" -u |
  sed 's/"//g' |
  # Fields 1-2 are the "scheme:" and the empty string between the two slashes
  cut -f 3- -d "/" |
  # Domain must have at least a 'dot'
  grep -F "." |
  sed "s/^www\.//g" > "phishtank.txt"
# OpenPhish feed: normalise line endings, strip the scheme (everything up to
# and including "//"), keep entries containing a dot, drop a leading "www.".
dos2unix < "openphish-raw.txt" |
  cut -d "/" -f 3- |
  grep -F "." |
  sed "s/^www\.//g" > "openphish.txt"
## Combine PhishTank and OpenPhish
# sort reads both lists directly; -u drops entries present in both feeds.
sort -u "phishtank.txt" "openphish.txt" > "phishing.txt"
## Parse domain and IP address only
# Keep what precedes the first "/" (drops the path), then what precedes the
# first ":" (drops the port), and de-duplicate.
cut -f 1 -d "/" "phishing.txt" |
  cut -f 1 -d ":" |
  sort -u > "phishing-domains.txt"
cp "../src/exclude.txt" "."

# Filter shared by both top-1m lists.
# Input (stdin):  comma-separated lines whose second column is a domain.
# Output (stdout): unique, sorted domains that contain a dot, with a leading
#                  "www." removed.
top1m_domains() {
  dos2unix |
    # Parse domains only
    cut -f 2 -d "," |
    grep -F "." |
    # Remove www.
    sed "s/^www\.//g" |
    sort -u
}

## Parse the Umbrella 1 Million
unzip -p "top-1m-umbrella.zip" | top1m_domains > "top-1m-umbrella.txt"

## Parse the Tranco 1 Million
unzip -p "top-1m-tranco.zip" | top1m_domains > "top-1m-tranco.txt"

# Merge Umbrella, Tranco and self-maintained top domains
sort -u "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" > "top-1m-well-known.txt"
## Parse popular domains
# grep match whole line: a phishing host is "popular" only when it equals an
# entry of the well-known list exactly (-x), compared as a fixed string (-F).
grep -Fx -f "top-1m-well-known.txt" "phishing-domains.txt" > "phishing-top-domains.txt"

## Exclude popular domains
# NOTE(review): this match is -F without -x, so a host is dropped whenever it
# merely contains a popular domain as a substring. Confirm that is intended.
grep -F -vf "phishing-top-domains.txt" "phishing-domains.txt" > "phishing-notop-domains.txt"
## Parse phishing URLs from popular domains
# Popular domains are not blocked as a whole; instead every phishing URL that
# contains one of them becomes its own rule of the form "||<url>$all".
grep -F -f "phishing-top-domains.txt" "phishing.txt" |
  sed -e 's/^/||/' -e 's/$/$all/' > "phishing-url-top-domains.txt"
## Merge malware domains and URLs
# Header lines placed at the top of every generated filter list.
CURRENT_TIME="$(date -R -u)"
FIRST_LINE="! Title: Phishing URL Blocklist"
SECOND_LINE="! Updated: $CURRENT_TIME"
THIRD_LINE="! Expires: 1 day (update frequency)"
FOURTH_LINE="! Repo: https://gitlab.com/curben/phishing-filter"
FIFTH_LINE="! License: https://creativecommons.org/licenses/by-sa/4.0/"
SIXTH_LINE="! Source: https://www.phishtank.com/ & https://openphish.com/"
# The separators are a literal backslash followed by "n", not real newlines:
# the value is expanded into separate lines later by sed's insert command and
# by printf.
COMMENT_UBO="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE"
# Adguard Home
# Domain-only rules of the form "||<domain>^".
sed -e 's/^/||/' -e 's/$/^/' "phishing-notop-domains.txt" > "phishing-domains-adguard-home.txt"

# `1 i\TEXT` is the one-line form of sed's insert command (GNU extension): it
# prints the header before the first line and expands the "\n" sequences
# stored in $COMMENT_UBO into line breaks. The second sed retitles line 1.
sort "phishing-domains-adguard-home.txt" |
  sed '1 i\'"$COMMENT_UBO"'' |
  sed "1s/Blocklist/Blocklist (AdGuard Home)/" > "../dist/phishing-filter-agh.txt"
# Main list: domain rules plus the URL rules for popular domains.
# The two inserts both target line 1, so the notice is inserted first and the
# header second; the header therefore ends up above the notice in the output.
# `1 i\TEXT` is the one-line form of sed's insert command (GNU extension) and
# expands the "\n" sequences in TEXT into line breaks.
sort "phishing-domains-adguard-home.txt" "phishing-url-top-domains.txt" |
  sed '1 i\'"\n! BREAKING CHANGE (1 Oct 2020): AdGuard Home should use this blocklist https://gitlab.com/curben/phishing-filter#domain-based-adguard-home\n"'' |
  sed '1 i\'"$COMMENT_UBO"'' > "../dist/phishing-filter.txt"
# Adguard browser extension
# Domain rules of the form "||<domain>$all".
sed -e 's/^/||/' -e 's/$/$all/' "phishing-notop-domains.txt" > "phishing-domains-adguard.txt"

# Merge with the popular-domain URL rules, prepend the header (sed's one-line
# insert expands the "\n" sequences in $COMMENT_UBO) and retitle line 1.
sort "phishing-domains-adguard.txt" "phishing-url-top-domains.txt" |
  sed '1 i\'"$COMMENT_UBO"'' |
  sed "1s/Blocklist/Blocklist (AdGuard)/" > "../dist/phishing-filter-ag.txt"
# Vivaldi
# Domain rules of the form "||<domain>$document".
sed -e 's/^/||/' -e 's/$/$document/' "phishing-notop-domains.txt" > "phishing-domains-vivaldi.txt"

# The popular-domain URL rules end in "$all"; rewrite that trailing option to
# "$document" for this list. In the pattern only the final "$" is an anchor,
# the first one matches a literal dollar sign.
## to be removed
sed 's/$all$/$document/g' "phishing-domains-vivaldi.txt" "phishing-url-top-domains.txt" |
  sort |
  sed '1 i\'"$COMMENT_UBO"'' |
  sed "1s/Blocklist/Blocklist (Vivaldi)/" > "../dist/phishing-filter-vivaldi.txt"
## Domains-only blocklist
# Converts a "!"-prefixed header (lines joined by a literal "\n") into the
# "#"-prefixed variant used by the non-ABP lists.
#   $1      header text with literal "\n" separators
#   stdout  same header, "!" -> "#" at line starts, "URL" -> "Domains" on the
#           first line, lines re-joined with a literal "\n"
# The text is passed through %b rather than used as the printf format, so a
# "%" in the header cannot be misread as a conversion.
# awk + head is a workaround for sed prepend: awk re-appends a literal "\n"
# after every line and head trims the final one.
hash_comment() {
  printf '%b' "$1" |
    sed "s/^!/#/g" |
    sed "1s/URL/Domains/" |
    awk '{printf "%s\\n", $0}' |
    head -c -2
}
COMMENT="$(hash_comment "$COMMENT_UBO")"

sort "phishing-notop-domains.txt" |
  sed '1 i\'"$COMMENT"'' > "../dist/phishing-filter-domains.txt"
## Hosts file blocklist
# Turns the domains-only list on stdin into hosts-file entries on stdout.
# The IPv4 pattern is anchored to the whole line so that only bare addresses
# are dropped; a hostname that merely contains an IP-like run of digits
# (e.g. "1.2.3.4.example.net") is kept.
hosts_entries() {
  # Exclude comment with #
  grep -vE "^#" |
    # Remove IPv4 address
    grep -vE "^([0-9]{1,3}[\.]){3}[0-9]{1,3}$" |
    sed "s/^/0.0.0.0 /g"
}

hosts_entries < "../dist/phishing-filter-domains.txt" |
  # Re-insert comment
  sed '1 i\'"$COMMENT"'' |
  sed "1s/Domains/Hosts/" > "../dist/phishing-filter-hosts.txt"
## Dnsmasq-compatible blocklist
# Rewrites hosts entries on stdin ("0.0.0.0 <domain>") as
# "address=/<domain>/0.0.0.0" on stdout; "#" comment lines are dropped.
hosts_to_dnsmasq() {
  grep -vE "^#" |
    sed "s/^0.0.0.0 /address=\//g" |
    sed "s/$/\/0.0.0.0/g"
}

hosts_to_dnsmasq < "../dist/phishing-filter-hosts.txt" |
  sed '1 i\'"$COMMENT"'' |
  sed "1s/Blocklist/dnsmasq Blocklist/" > "../dist/phishing-filter-dnsmasq.conf"
## BIND-compatible blocklist
# Rewrites hosts entries on stdin ("0.0.0.0 <domain>") as BIND zone statements
# pointing at "null.zone.file"; "#" comment lines are dropped.
hosts_to_bind() {
  grep -vE "^#" |
    sed 's/^0.0.0.0 /zone "/g' |
    sed 's/$/" { type master; notify no; file "null.zone.file"; };/g'
}

hosts_to_bind < "../dist/phishing-filter-hosts.txt" |
  sed '1 i\'"$COMMENT"'' |
  sed "1s/Blocklist/BIND Blocklist/" > "../dist/phishing-filter-bind.conf"
## Unbound-compatible blocklist
# Rewrites hosts entries on stdin ("0.0.0.0 <domain>") as
# 'local-zone: "<domain>" always_nxdomain'; "#" comment lines are dropped.
hosts_to_unbound() {
  grep -vE "^#" |
    sed 's/^0.0.0.0 /local-zone: "/g' |
    sed 's/$/" always_nxdomain/g'
}

hosts_to_unbound < "../dist/phishing-filter-hosts.txt" |
  sed '1 i\'"$COMMENT"'' |
  sed "1s/Blocklist/Unbound Blocklist/" > "../dist/phishing-filter-unbound.conf"
## Clean up artifacts
# Removes the large intermediate files only; the other generated files in the
# working folder are left in place.
rm "phishtank.csv" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "openphish-raw.txt"

# Leave the temporary working folder.
cd ../