From 4bf534bdbcb2c0f3f49ea0cfb98aa535c4d20d3c Mon Sep 17 00:00:00 2001 From: MDLeom <2809763-curben@users.noreply.gitlab.com> Date: Fri, 25 Nov 2022 07:19:20 +0000 Subject: [PATCH] feat: add Cloudflare Radar top 1m domains dataset --- .gitlab-ci.yml | 4 ++-- README.md | 43 +++++++++++++++++++++++-------------------- src/script.sh | 41 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 62 insertions(+), 26 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 26cef590..f7be56ba 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,7 +12,7 @@ build_job: stage: build before_script: - - apk update && apk add brotli curl grep xmlstarlet + - apk update && apk add brotli curl grep jq xmlstarlet script: - sh src/script.sh @@ -46,7 +46,7 @@ cloudflare: stage: deploy before_script: - - apk update && apk add curl + - apk update && apk add curl script: - curl -X POST "https://api.cloudflare.com/client/v4/pages/webhooks/deploy_hooks/$CLOUDFLARE_BUILD_HOOK" diff --git a/README.md b/README.md index bd457b86..3f97f765 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,21 @@ # Phishing URL Blocklist - Formats - * [URL-based](#url-based) - * [Domain-based](#domain-based) - * [Hosts-based](#hosts-based) - * [Domain-based (AdGuard Home)](#domain-based-adguard-home) - * [URL-based (AdGuard)](#url-based-adguard) - * [URL-based (Vivaldi)](#url-based-vivaldi) - * [Dnsmasq](#dnsmasq) - * [BIND zone](#bind) - * [RPZ](#response-policy-zone) - * [Unbound](#unbound) - * [dnscrypt-proxy](#dnscrypt-proxy) - * [Tracking Protection List (IE)](#tracking-protection-list-ie) - * [Snort2](#snort2) - * [Snort3](#snort3) - * [Suricata](#suricata) + - [URL-based](#url-based) + - [Domain-based](#domain-based) + - [Hosts-based](#hosts-based) + - [Domain-based (AdGuard Home)](#domain-based-adguard-home) + - [URL-based (AdGuard)](#url-based-adguard) + - [URL-based (Vivaldi)](#url-based-vivaldi) + - [Dnsmasq](#dnsmasq) + - [BIND zone](#bind) + - [RPZ](#response-policy-zone) + - [Unbound](#unbound) + - [dnscrypt-proxy](#dnscrypt-proxy) + - [Tracking Protection List (IE)](#tracking-protection-list-ie) + - [Snort2](#snort2) + - [Snort3](#snort3) + - [Suricata](#suricata) - [Compressed version](#compressed-version) - [Reporting issues](#issues) - [See also](#see-also) @@ -310,7 +310,7 @@ chmod 755 /etc/cron.daily/phishing-filter Configure dnscrypt-proxy to use the blocklist: -``` diff +```diff [blocked_names] + blocked_names_file = '/etc/dnscrypt-proxy/phishing-filter-dnscrypt-blocked-names.txt' @@ -407,7 +407,7 @@ chmod 755 /etc/cron.daily/phishing-filter Configure Snort to use the ruleset: -``` diff +```diff # /etc/snort/snort.lua ips = { @@ -448,7 +448,7 @@ chmod 755 /etc/cron.daily/phishing-filter Configure Suricata to use the ruleset: -``` diff +```diff # /etc/suricata/suricata.yaml rule-files: - local.rules @@ -475,13 +475,13 @@ All filters are also available as gzip- and brotli-compressed. - Gzip: https://malware-filter.gitlab.io/malware-filter/phishing-filter.txt.gz - Brotli: https://malware-filter.gitlab.io/malware-filter/phishing-filter.txt.br -*Snort 2 rule is only available in compressed format in pages.dev due to the platform's 25MB file size limit* +_Snort 2 rule is only available in compressed format in pages.dev due to the platform's 25MB file size limit_ ## Issues This blocklist operates by blocking the **whole** website, instead of specific webpages; exceptions are made on popular websites (e.g. `https://docs.google.com/`), in which webpages are specified instead (e.g. `https://docs.google.com/phishing-page`). Phishing webpages are only listed in [URL-based](#url-based) filter, popular websites are excluded from other filters. -*Popular* websites are as listed in the [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html) (top 1M domains + subdomains), [Tranco List](https://tranco-list.eu/) (top 1M domains) and this [custom list](src/exclude.txt). +_Popular_ websites are as listed in the [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html) (top 1M domains + subdomains), [Tranco List](https://tranco-list.eu/) (top 1M domains), [Cloudflare Radar](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/) (top 1M domains) and this [custom list](src/exclude.txt). If you wish to exclude certain website(s) that you believe is sufficiently well-known, please create an [issue](https://gitlab.com/malware-filter/phishing-filter/issues) or [merge request](https://gitlab.com/malware-filter/phishing-filter/merge_requests). @@ -504,6 +504,7 @@ Optional variables: - `PHISHTANK_API`: Recommended if you intend to run [script.sh](src/script.sh) >5 times daily. Register an account at [phishtank.org](https://phishtank.org/developer_info.php) to generate an application key. - `CLOUDFLARE_BUILD_HOOK`: Deploy to Cloudflare Pages. - `NETLIFY_SITE_ID`: Deploy to Netlify. +- `CF_API`: Include Cloudflare Radar [domains ranking](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/). [Guide](https://developers.cloudflare.com/radar/get-started/first-request/) to create an API token. ## License @@ -525,4 +526,6 @@ _PhishTank is either trademark or registered trademark of Cisco Systems, Inc._ [phishunt.io](https://phishunt.io/): All rights reserved by [Daniel López](https://twitter.com/0xDanielLopez) +[Cloudflare Radar](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/): Available to free Cloudflare account + This repository is not endorsed by PhishTank/OpenDNS and OpenPhish. diff --git a/src/script.sh b/src/script.sh index 183e6dfa..4cb34454 100644 --- a/src/script.sh +++ b/src/script.sh @@ -1,11 +1,11 @@ #!/bin/sh -# works best on busybox sh +# works best on busybox ash set -efx -o pipefail alias curl="curl -L" -alias rm="rm -f" +alias rm="rm -rf" ## Use GNU grep, busybox grep is too slow . "/etc/os-release" @@ -66,6 +66,33 @@ curl "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m-tranco.zip" bunzip2 -kc "phishtank.bz2" > "phishtank.csv" +## Cloudflare Radar +if [ -n "$CF_API" ]; then + mkdir -p "cf/" + # Get the latest domain ranking buckets + curl -X GET "https://api.cloudflare.com/client/v4/radar/datasets?limit=5&offset=0&datasetType=RANKING_BUCKET&format=json" \ + -H "Authorization: Bearer $CF_API" -o "cf/datasets.json" + # Get the top 1m bucket's dataset ID + DATASET_ID=$(jq ".result.datasets[] | select(.meta.top==1000000) | .id" "cf/datasets.json") + # Get the dataset download url + curl --request POST \ + --url "https://api.cloudflare.com/client/v4/radar/datasets/download" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer $CF_API" \ + --data "{ \"datasetId\": $DATASET_ID }" \ + -o "cf/dataset-url.json" + DATASET_URL=$(jq ".result.dataset.url" "cf/dataset-url.json" | sed 's/"//g') + curl -L "$DATASET_URL" -o "cf/top-1m-radar.zip" + + ## Parse the Radar 1 Million + unzip -p "cf/top-1m-radar.zip" | \ + dos2unix | \ + tr "[:upper:]" "[:lower:]" | \ + grep -F "." | \ + sed "s/^www\.//g" | \ + sort -u > "top-1m-radar.txt" +fi + ## Parse URLs cat "phishtank.csv" | \ @@ -146,10 +173,16 @@ sort -u > "top-1m-tranco.txt" # ## Append new line https://unix.stackexchange.com/a/31955 # sed '$a\' > "oisd-exclude.txt" -# Merge Umbrella, Traco and self-maintained top domains +# Merge Umbrella, Tranco, Radar and self-maintained top domains cat "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" | \ sort -u > "top-1m-well-known.txt" +if [ -n "$CF_API" ] && [ -f "top-1m-radar.txt" ]; then + cat "top-1m-radar.txt" >> "top-1m-well-known.txt" + # sort in-place + sort "top-1m-well-known.txt" -u -o "top-1m-well-known.txt" +fi + ## Parse popular domains cat "phishing-domains.txt" | \ @@ -374,7 +407,7 @@ sed "2s/Domains Blocklist/Hosts Blocklist (IE)/" > "../public/phishing-filter.tp ## Clean up artifacts -rm "phishtank.csv" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "openphish-raw.txt" "phishunt.csv" +rm "phishtank.csv" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "openphish-raw.txt" "phishunt.csv" "cf/" "top-1m-radar.txt" cd ../