diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6d0c1b79..fc5bf293 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,7 +12,7 @@ build_job: stage: build before_script: - - apk update && apk add brotli curl grep + - apk update && apk add brotli curl grep jq script: - sh src/script.sh @@ -43,7 +43,7 @@ cloudflare: stage: deploy before_script: - - apk update && apk add curl + - apk update && apk add curl script: - curl -X POST "https://api.cloudflare.com/client/v4/pages/webhooks/deploy_hooks/$CLOUDFLARE_BUILD_HOOK" diff --git a/README.md b/README.md index cf6205d8..b16a638e 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,21 @@ # Malicious URL Blocklist - Formats - * [URL-based](#url-based) - * [Domain-based](#domain-based) - * [Hosts-based](#hosts-based) - * [Domain-based (AdGuard Home)](#domain-based-adguard-home) - * [URL-based (AdGuard)](#url-based-adguard) - * [URL-based (Vivaldi)](#url-based-vivaldi) - * [Dnsmasq](#dnsmasq) - * [BIND zone](#bind) - * [RPZ](#response-policy-zone) - * [Unbound](#unbound) - * [dnscrypt-proxy](#dnscrypt-proxy) - * [Tracking Protection List (IE)](#tracking-protection-list-ie) - * [Snort2](#snort2) - * [Snort3](#snort3) - * [Suricata](#suricata) + - [URL-based](#url-based) + - [Domain-based](#domain-based) + - [Hosts-based](#hosts-based) + - [Domain-based (AdGuard Home)](#domain-based-adguard-home) + - [URL-based (AdGuard)](#url-based-adguard) + - [URL-based (Vivaldi)](#url-based-vivaldi) + - [Dnsmasq](#dnsmasq) + - [BIND zone](#bind) + - [RPZ](#response-policy-zone) + - [Unbound](#unbound) + - [dnscrypt-proxy](#dnscrypt-proxy) + - [Tracking Protection List (IE)](#tracking-protection-list-ie) + - [Snort2](#snort2) + - [Snort3](#snort3) + - [Suricata](#suricata) - [Compressed version](#compressed-version) - [Reporting issues](#issues) - [Cloning](#cloning) @@ -479,7 +479,7 @@ chmod 755 /etc/cron.daily/urlhaus-filter Configure dnscrypt-proxy to use the blocklist: -``` diff +```diff [blocked_names] + blocked_names_file = 
'/etc/dnscrypt-proxy/urlhaus-filter-dnscrypt-blocked-names.txt' @@ -530,7 +530,6 @@ Lite version (online domains only): - ## Tracking Protection List (IE) This blocklist includes domains only. Supported in Internet Explorer 9+. @@ -616,7 +615,7 @@ chmod 755 /etc/cron.daily/urlhaus-filter Configure Snort to use the ruleset: -``` diff +```diff # /etc/snort/snort.lua ips = { @@ -657,7 +656,7 @@ chmod 755 /etc/cron.daily/urlhaus-filter Configure Suricata to use the ruleset: -``` diff +```diff # /etc/suricata/suricata.yaml rule-files: - local.rules @@ -697,7 +696,7 @@ All filters are also available as gzip- and brotli-compressed. This blocklist operates by blocking the **whole** website, instead of specific webpages; exceptions are made on popular websites (e.g. `https://docs.google.com/`), in which webpages are specified instead (e.g. `https://docs.google.com/malware-page`). Malicious webpages are only listed in the [URL-based](#url-based) filter, popular websites are excluded from other filters. -*Popular* websites are as listed in the [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html) (top 1M domains + subdomains), [Tranco List](https://tranco-list.eu/) (top 1M domains) and this [custom list](src/exclude.txt). +_Popular_ websites are as listed in the [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html) (top 1M domains + subdomains), [Tranco List](https://tranco-list.eu/) (top 1M domains), [Cloudflare Radar](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/) (top 1M domains) and this [custom list](src/exclude.txt). If you wish to exclude certain website(s) that you believe is sufficiently well-known, please create an [issue](https://gitlab.com/malware-filter/urlhaus-filter/issues) or [merge request](https://gitlab.com/malware-filter/urlhaus-filter/merge_requests). 
If the website is quite obscure but you still want to visit it, you can add a new line `||legitsite.com^$badfilter` to "My filters" tab of uBO; use a subdomain if relevant, `||sub.legitsite.com^$badfilter`. @@ -723,6 +722,7 @@ Optional variables: - `CLOUDFLARE_BUILD_HOOK`: Deploy to Cloudflare Pages. - `NETLIFY_SITE_ID`: Deploy to Netlify. +- `CF_API`: Include Cloudflare Radar [domains ranking](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/). [Guide](https://developers.cloudflare.com/radar/get-started/first-request/) to create an API token. ## License @@ -734,4 +734,6 @@ Optional variables: [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html): Available free of charge by Cisco Umbrella +[Cloudflare Radar](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/): Available to free Cloudflare accounts + This repository is not endorsed by Abuse.ch. diff --git a/src/script.sh b/src/script.sh index 2c1f6b0b..508497e5 100644 --- a/src/script.sh +++ b/src/script.sh @@ -1,6 +1,6 @@ #!/bin/sh -# works best on busybox sh +# works best on busybox ash set -efux -o pipefail @@ -35,6 +35,33 @@ curl -L "https://urlhaus.abuse.ch/downloads/csv/" -o "urlhaus.zip" curl -L "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m-umbrella.zip" curl -L "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m-tranco.zip" +## Cloudflare Radar +if [ -n "$CF_API" ]; then + mkdir -p "cf/" + # Get the latest domain ranking buckets + curl -X GET "https://api.cloudflare.com/client/v4/radar/datasets?limit=5&offset=0&datasetType=RANKING_BUCKET&format=json" \ + -H "Authorization: Bearer $CF_API" -o "cf/datasets.json" + # Get the top 1m bucket's dataset ID + DATASET_ID=$(jq ".result.datasets[] | select(.meta.top==1000000) | .id" "cf/datasets.json") + # Get the dataset download url + curl --request POST \ + --url "https://api.cloudflare.com/client/v4/radar/datasets/download" \ + --header 
"Content-Type: application/json" \ + --header "Authorization: Bearer $CF_API" \ + --data "{ \"datasetId\": $DATASET_ID }" \ + -o "cf/dataset-url.json" + DATASET_URL=$(jq ".result.dataset.url" "cf/dataset-url.json" | sed 's/"//g') + curl -L "$DATASET_URL" -o "cf/top-1m-radar.zip" + + ## Parse the Radar 1 Million + unzip -p "cf/top-1m-radar.zip" | \ + dos2unix | \ + tr "[:upper:]" "[:lower:]" | \ + grep -F "." | \ + sed "s/^www\.//g" | \ + sort -u > "top-1m-radar.txt" +fi + cp -f "../src/exclude.txt" "." ## Prepare URLhaus.csv @@ -108,6 +135,11 @@ sort -u > "top-1m-tranco.txt" cat "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" | \ sort -u > "top-1m-well-known.txt" +if [ -n "$CF_API" ] && [ -f "top-1m-radar.txt" ]; then + cat "top-1m-radar.txt" >> "top-1m-well-known.txt" + # sort in-place + sort "top-1m-well-known.txt" -u -o "top-1m-well-known.txt" +fi ## Parse popular domains from URLhaus cat "urlhaus-domains.txt" | \ @@ -422,7 +454,7 @@ sed "2s/Domains Blocklist/Hosts Blocklist (IE)/" > "../public/urlhaus-filter-onl ## Clean up artifacts -rm -f "URLhaus.csv" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" +rm -rf "URLhaus.csv" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "cf/" "top-1m-radar.txt" cd ../