From e38bc68ada89b380e691a36ca6a0f1dbb415efe7 Mon Sep 17 00:00:00 2001 From: MDLeom <2809763-curben@users.noreply.gitlab.com> Date: Sun, 12 Jul 2020 08:16:27 +0100 Subject: [PATCH] feat: add OpenPhish data - https://openphish.com/ --- README.md | 18 ++++++++++-------- src/script.sh | 41 +++++++++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 4b89c8e4..ae28f461 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Phishing URL Blocklist -A blocklist of phishing websites, based on the [PhishTank](https://www.phishtank.com/) list. Blocklist is updated twice a day. +A blocklist of phishing websites, based on the [PhishTank](https://www.phishtank.com/) and [OpenPhish](https://openphish.com/) lists. Blocklist is updated twice a day. There are multiple formats available, refer to the appropriate section according to the program used: @@ -195,9 +195,9 @@ This blocklist operates by blocking the **whole** website, instead of specific w If you wish to exclude certain website(s) that you believe is sufficiently well-known, please create an [issue](https://gitlab.com/curben/phishing-filter/issues) or [merge request](https://gitlab.com/curben/phishing-filter/merge_requests). -This blocklist **only** accepts new phishing URLs from [PhishTank](https://www.phishtank.com/). +This blocklist **only** accepts new phishing URLs from [PhishTank](https://www.phishtank.com/) and [OpenPhish](https://openphish.com/). -Please report new phishing URL to the upstream maintainer through https://www.phishtank.com/add_web_phish.php. +Please report new phishing URLs to [PhishTank](https://www.phishtank.com/add_web_phish.php) or [OpenPhish](https://openphish.com/faq.html). ## Cloning @@ -211,14 +211,16 @@ Use shallow clone to get the recent revisions only. 
Getting the last five revisi [Creative Commons Zero v1.0 Universal](LICENSE.md) -[csvquote](https://github.com/dbro/csvquote): [MIT License](https://choosealicense.com/licenses/mit/) +[PhishTank](https://www.phishtank.com/): [CC BY-SA 2.5](https://creativecommons.org/licenses/by-sa/2.5/) + +_PhishTank is either a trademark or a registered trademark of OpenDNS, LLC._ + +[OpenPhish](https://openphish.com/): Available free of charge by OpenPhish [Tranco List](https://tranco-list.eu/): MIT License [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html): Available free of charge by Cisco Umbrella -[PhishTank](https://www.phishtank.com/): [CC BY-SA 2.5](https://creativecommons.org/licenses/by-sa/2.5/) +[csvquote](https://github.com/dbro/csvquote): [MIT License](https://choosealicense.com/licenses/mit/) -PhishTank is either trademark or registered trademark of OpenDNS, LLC. - -This repository is not endorsed by PhishTank/OpenDNS. +This repository is not endorsed by PhishTank/OpenDNS or OpenPhish. diff --git a/src/script.sh b/src/script.sh index b24deb77..b5b8ea0b 100644 --- a/src/script.sh +++ b/src/script.sh @@ -18,9 +18,9 @@ fi mkdir -p "tmp/" cd "tmp/" - ## Prepare datasets curl -L "https://data.phishtank.com/data/$PHISHTANK_API/online-valid.csv.bz2" -o "phishtank.bz2" +curl -L "https://openphish.com/feed.txt" -o "openphish-raw.txt" curl -L "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m-umbrella.zip" curl -L "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m-tranco.zip" @@ -37,14 +37,23 @@ sed 's/"//g' | \ cut -f 3- -d "/" | \ # Domain must have at least a 'dot' grep -F "." | \ -sed "s/^www\.//g" | \ -sort -u > "phishtank.txt" +sed "s/^www\.//g" > "phishtank.txt" + +cat "openphish-raw.txt" | \ +dos2unix | \ +cut -f 3- -d "/" | \ +grep -F "." 
| \ +sed "s/^www\.//g" > "openphish.txt" + +## Combine PhishTank and OpenPhish +cat "phishtank.txt" "openphish.txt" | \ +sort -u > "phishing.txt" ## Parse domain and IP address only -cat "phishtank.txt" | \ +cat "phishing.txt" | \ cut -f 1 -d "/" | \ cut -f 1 -d ":" | \ -sort -u > "phishtank-domains.txt" +sort -u > "phishing-domains.txt" cp "../src/exclude.txt" "." @@ -74,19 +83,19 @@ cat "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" | \ sort -u > "top-1m-well-known.txt" -## Parse popular domains from PhishTank -cat "phishtank-domains.txt" | \ +## Parse popular domains +cat "phishing-domains.txt" | \ # grep match whole line -grep -Fx -f "top-1m-well-known.txt" > "phishtank-top-domains.txt" +grep -Fx -f "top-1m-well-known.txt" > "phishing-top-domains.txt" -## Parse domains from PhishTank excluding popular domains -cat "phishtank-domains.txt" | \ -grep -F -vf "phishtank-top-domains.txt" > "phishing-domains.txt" +## Exclude popular domains +cat "phishing-domains.txt" | \ +grep -F -vf "phishing-top-domains.txt" > "phishing-notop-domains.txt" ## Parse phishing URLs from popular domains -cat "phishtank.txt" | \ -grep -F -f "phishtank-top-domains.txt" | \ +cat "phishing.txt" | \ +grep -F -f "phishing-top-domains.txt" | \ sed "s/^/||/g" | \ sed "s/$/\$all/g" > "phishing-url-top-domains.txt" @@ -103,7 +112,7 @@ COMMENT_UBO="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n # Compatibility with Adguard Home # https://gitlab.com/curben/urlhaus-filter/-/issues/19 -cat "phishing-domains.txt" | \ +cat "phishing-notop-domains.txt" | \ sed "s/^/||/g" | \ sed "s/$/^/g" > "phishing-domains-adguard.txt" @@ -116,7 +125,7 @@ sed '1 i\'"$COMMENT_UBO"'' > "../dist/phishing-filter.txt" # awk + head is a workaround for sed prepend COMMENT=$(printf "$COMMENT_UBO" | sed "s/^!/#/g" | sed "1s/URL/Domains/" | awk '{printf "%s\\n", $0}' | head -c -2) -cat "phishing-domains.txt" | \ +cat "phishing-notop-domains.txt" | \ sort | \ sed '1 i\'"$COMMENT"'' > 
"../dist/phishing-filter-domains.txt" @@ -161,7 +170,7 @@ sed "1s/Blocklist/Unbound Blocklist/" > "../dist/phishing-filter-unbound.conf" ## Clean up artifacts -rm "phishtank.csv" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" +rm "phishtank.csv" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "openphish-raw.txt" cd ../