From 433a87ce1130037fe79fbebbe1184ca2f4aed54c Mon Sep 17 00:00:00 2001 From: Ming Di Leom <2809763-curben@users.noreply.gitlab.com> Date: Sun, 20 Nov 2022 01:37:09 +0000 Subject: [PATCH] feat: add Cloudflare Radar top 1m domains dataset --- README.md | 41 ++++++++++++++++++++++------------------- src/script.sh | 47 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 61 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index bcfd780..02990c4 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,21 @@ # PUP Domains Blocklist - Formats - * [URL-based](#url-based) - * [Domain-based](#domain-based) - * [Hosts-based](#hosts-based) - * [Domain-based (AdGuard Home)](#domain-based-adguard-home) - * [URL-based (AdGuard)](#url-based-adguard) - * [URL-based (Vivaldi)](#url-based-vivaldi) - * [Dnsmasq](#dnsmasq) - * [BIND zone](#bind) - * [RPZ](#response-policy-zone) - * [Unbound](#unbound) - * [dnscrypt-proxy](#dnscrypt-proxy) - * [Tracking Protection List (IE)](#tracking-protection-list-ie) - * [Snort2](#snort2) - * [Snort3](#snort3) - * [Suricata](#suricata) + - [URL-based](#url-based) + - [Domain-based](#domain-based) + - [Hosts-based](#hosts-based) + - [Domain-based (AdGuard Home)](#domain-based-adguard-home) + - [URL-based (AdGuard)](#url-based-adguard) + - [URL-based (Vivaldi)](#url-based-vivaldi) + - [Dnsmasq](#dnsmasq) + - [BIND zone](#bind) + - [RPZ](#response-policy-zone) + - [Unbound](#unbound) + - [dnscrypt-proxy](#dnscrypt-proxy) + - [Tracking Protection List (IE)](#tracking-protection-list-ie) + - [Snort2](#snort2) + - [Snort3](#snort3) + - [Suricata](#suricata) - [Compressed version](#compressed-version) - [Reporting issues](#issues) - [FAQ and Guides](#faq-and-guides) @@ -306,7 +306,7 @@ chmod 755 /etc/cron.daily/pup-filter Configure dnscrypt-proxy to use the blocklist: -``` diff +```diff [blocked_names] + blocked_names_file = '/etc/dnscrypt-proxy/pup-filter-dnscrypt-blocked-names.txt' ``` @@ -393,7 +393,7 @@ chmod 755 
/etc/cron.daily/pup-filter Configure Snort to use the ruleset: -``` diff +```diff # /etc/snort/snort.lua ips = { @@ -434,7 +434,7 @@ chmod 755 /etc/cron.daily/pup-filter Configure Suricata to use the ruleset: -``` diff +```diff # /etc/suricata/suricata.yaml rule-files: - local.rules @@ -465,7 +465,7 @@ All filters are also available as gzip- and brotli-compressed. This blocklist operates by blocking the **whole** website, popular websites are excluded from the filters. -*Popular* websites are as listed in the [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html) (top 1M domains + subdomains), [Tranco List](https://tranco-list.eu/) (top 1M domains) and this [custom list](src/exclude.txt). +_Popular_ websites are as listed in the [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html) (top 1M domains + subdomains), [Tranco List](https://tranco-list.eu/) (top 1M domains), [Cloudflare Radar](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/) (top 1M domains) and this [custom list](src/exclude.txt). If you wish to exclude certain website(s) that you believe is sufficiently well-known, please create an [issue](https://gitlab.com/malware-filter/pup-filter/issues) or [merge request](https://gitlab.com/malware-filter/pup-filter/merge_requests). @@ -481,6 +481,7 @@ Optional variables: - `CLOUDFLARE_BUILD_HOOK`: Deploy to Cloudflare Pages. - `NETLIFY_SITE_ID`: Deploy to Netlify. +- `CF_API`: Include Cloudflare Radar [domains ranking](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/). [Guide](https://developers.cloudflare.com/radar/get-started/first-request/) to create an API token. 
## License @@ -493,3 +494,5 @@ filters: Derived from [malware-discoverer](https://github.com/zhouhanc/malware-d [Tranco List](https://tranco-list.eu/): MIT License [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html): Available free of charge by Cisco Umbrella + +[Cloudflare Radar](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/): Available with a free Cloudflare account diff --git a/src/script.sh b/src/script.sh index cd1e670..daa73bd 100644 --- a/src/script.sh +++ b/src/script.sh @@ -1,10 +1,10 @@ #!/bin/sh -# works best on busybox sh +# works best on busybox ash set -efux -o pipefail -alias rm="rm -f" +alias rm="rm -rf" ## Use GNU grep, busybox grep is too slow . "/etc/os-release" @@ -20,8 +20,7 @@ fi ## Fallback to busybox dos2unix -if ! command -v dos2unix &> /dev/null -then +if ! command -v dos2unix &> /dev/null; then alias dos2unix="busybox dos2unix" fi @@ -35,6 +34,34 @@ curl -L "https://zhouhanc.github.io/malware-discoverer/blocklist.csv.zip" -o "so curl -L "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m-umbrella.zip" curl -L "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m-tranco.zip" +## Cloudflare Radar (optional: skipped when CF_API is unset or empty) +if [ -n "${CF_API:-}" ]; then + mkdir -p "cf/" + # Get the latest domain ranking buckets (NOTE: with "set -x" active, the bearer token is echoed into the build log) + curl -X GET "https://api.cloudflare.com/client/v4/radar/datasets?limit=5&offset=0&datasetType=RANKING_BUCKET&format=json" \ + -H "Authorization: Bearer $CF_API" -o "cf/datasets.json" + # Get the top 1m bucket's dataset ID + DATASET_ID=$(jq ".result.datasets[] | select(.meta.top==1000000) | .id" "cf/datasets.json") + # Get the dataset download url + curl --request POST \ + --url "https://api.cloudflare.com/client/v4/radar/datasets/download" \ + --header "Content-Type: application/json" \ + --header "Authorization: Bearer $CF_API" \ + --data "{ \"datasetId\": $DATASET_ID }" \ + -o "cf/dataset-url.json" + DATASET_URL=$(jq -r ".result.dataset.url" "cf/dataset-url.json")
+ curl -L "$DATASET_URL" -o "cf/top-1m-radar.zip" + + ## Parse the Radar 1 Million + unzip -p "cf/top-1m-radar.zip" | \ + dos2unix | \ + tr "[:upper:]" "[:lower:]" | \ + grep -F "." | \ + sed "s/^www\.//g" | \ + sort -u > "top-1m-radar.txt" +fi + + ## Parse URLs unzip -p "source.zip" | \ dos2unix | \ @@ -58,10 +85,8 @@ sort -u > "top-1m-umbrella.txt" unzip -p "top-1m-tranco.zip" | \ dos2unix | \ tr "[:upper:]" "[:lower:]" | \ -# Parse domains only cut -f 2 -d "," | \ grep -F "." | \ -# Remove www. sed "s/^www\.//g" | \ sort -u > "top-1m-tranco.txt" @@ -75,10 +100,16 @@ cp "../src/exclude.txt" "." # ## Append new line https://unix.stackexchange.com/a/31955 # sed '$a\' > "oisd-exclude.txt" -# Merge Umbrella, Traco and self-maintained top domains +# Merge Umbrella, Tranco, Radar and self-maintained top domains cat "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" | \ sort -u > "top-1m-well-known.txt" +if [ -n "${CF_API:-}" ] && [ -f "top-1m-radar.txt" ]; then + cat "top-1m-radar.txt" >> "top-1m-well-known.txt" + # sort in-place, de-duplicating (options precede the operand for POSIX sort) + sort -u -o "top-1m-well-known.txt" "top-1m-well-known.txt" +fi + ## Exclude popular domains cat "source-domains.txt" | \ @@ -239,7 +270,7 @@ sed -i "1s/Blocklist/Suricata Ruleset/" "../public/pup-filter-suricata.rules" ## Clean up artifacts -rm "source.zip" "source-domains.txt" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" +rm "source.zip" "source-domains.txt" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "cf/" "top-1m-radar.txt" cd ../