From 433a87ce1130037fe79fbebbe1184ca2f4aed54c Mon Sep 17 00:00:00 2001
From: Ming Di Leom <2809763-curben@users.noreply.gitlab.com>
Date: Sun, 20 Nov 2022 01:37:09 +0000
Subject: [PATCH] feat: add Cloudflare Radar top 1m domains dataset

---
 README.md     | 41 ++++++++++++++++++++++-------------------
 src/script.sh | 47 +++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 61 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index bcfd780..02990c4 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,21 @@
 # PUP Domains Blocklist
 
 - Formats
-  * [URL-based](#url-based)
-  * [Domain-based](#domain-based)
-  * [Hosts-based](#hosts-based)
-  * [Domain-based (AdGuard Home)](#domain-based-adguard-home)
-  * [URL-based (AdGuard)](#url-based-adguard)
-  * [URL-based (Vivaldi)](#url-based-vivaldi)
-  * [Dnsmasq](#dnsmasq)
-  * [BIND zone](#bind)
-  * [RPZ](#response-policy-zone)
-  * [Unbound](#unbound)
-  * [dnscrypt-proxy](#dnscrypt-proxy)
-  * [Tracking Protection List (IE)](#tracking-protection-list-ie)
-  * [Snort2](#snort2)
-  * [Snort3](#snort3)
-  * [Suricata](#suricata)
+  - [URL-based](#url-based)
+  - [Domain-based](#domain-based)
+  - [Hosts-based](#hosts-based)
+  - [Domain-based (AdGuard Home)](#domain-based-adguard-home)
+  - [URL-based (AdGuard)](#url-based-adguard)
+  - [URL-based (Vivaldi)](#url-based-vivaldi)
+  - [Dnsmasq](#dnsmasq)
+  - [BIND zone](#bind)
+  - [RPZ](#response-policy-zone)
+  - [Unbound](#unbound)
+  - [dnscrypt-proxy](#dnscrypt-proxy)
+  - [Tracking Protection List (IE)](#tracking-protection-list-ie)
+  - [Snort2](#snort2)
+  - [Snort3](#snort3)
+  - [Suricata](#suricata)
 - [Compressed version](#compressed-version)
 - [Reporting issues](#issues)
 - [FAQ and Guides](#faq-and-guides)
@@ -306,7 +306,7 @@ chmod 755 /etc/cron.daily/pup-filter
 
 Configure dnscrypt-proxy to use the blocklist:
 
-``` diff
+```diff
 [blocked_names]
 +  blocked_names_file = '/etc/dnscrypt-proxy/pup-filter-dnscrypt-blocked-names.txt'
 ```
@@ -393,7 +393,7 @@ chmod 755 /etc/cron.daily/pup-filter
 
 Configure Snort to use the ruleset:
 
-``` diff
+```diff
 # /etc/snort/snort.lua
 ips =
 {
@@ -434,7 +434,7 @@ chmod 755 /etc/cron.daily/pup-filter
 
 Configure Suricata to use the ruleset:
 
-``` diff
+```diff
 # /etc/suricata/suricata.yaml
 rule-files:
   - local.rules
@@ -465,7 +465,7 @@ All filters are also available as gzip- and brotli-compressed.
 
 This blocklist operates by blocking the **whole** website, popular websites are excluded from the filters.
 
-*Popular* websites are as listed in the [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html) (top 1M domains + subdomains), [Tranco List](https://tranco-list.eu/) (top 1M domains) and this [custom list](src/exclude.txt).
+_Popular_ websites are as listed in the [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html) (top 1M domains + subdomains), [Tranco List](https://tranco-list.eu/) (top 1M domains), [Cloudflare Radar](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/) (top 1M domains) and this [custom list](src/exclude.txt).
 
 If you wish to exclude certain website(s) that you believe is sufficiently well-known, please create an [issue](https://gitlab.com/malware-filter/pup-filter/issues) or [merge request](https://gitlab.com/malware-filter/pup-filter/merge_requests).
 
@@ -481,6 +481,7 @@ Optional variables:
 
 - `CLOUDFLARE_BUILD_HOOK`: Deploy to Cloudflare Pages.
 - `NETLIFY_SITE_ID`: Deploy to Netlify.
+- `CF_API`: Cloudflare API token. When set, the Cloudflare Radar [domain ranking](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/) is added to the list of popular domains. See this [guide](https://developers.cloudflare.com/radar/get-started/first-request/) to create an API token.
 
 ## License
 
@@ -493,3 +494,5 @@ filters: Derived from [malware-discoverer](https://github.com/zhouhanc/malware-d
 [Tranco List](https://tranco-list.eu/): MIT License
 
 [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html): Available free of charge by Cisco Umbrella
+
+[Cloudflare Radar](https://developers.cloudflare.com/radar/investigate/domain-ranking-datasets/): Available with a free Cloudflare account
diff --git a/src/script.sh b/src/script.sh
index cd1e670..daa73bd 100644
--- a/src/script.sh
+++ b/src/script.sh
@@ -1,10 +1,10 @@
 #!/bin/sh
 
-# works best on busybox sh
+# works best on busybox ash
 
 set -efux -o pipefail
 
-alias rm="rm -f"
+alias rm="rm -rf"
 
 ## Use GNU grep, busybox grep is too slow
 . "/etc/os-release"
@@ -20,8 +20,7 @@ fi
 
 
 ## Fallback to busybox dos2unix
-if ! command -v dos2unix &> /dev/null
-then
+if ! command -v dos2unix > /dev/null 2>&1; then
   alias dos2unix="busybox dos2unix"
 fi
 
@@ -35,6 +34,34 @@ curl -L "https://zhouhanc.github.io/malware-discoverer/blocklist.csv.zip" -o "so
 curl -L "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m-umbrella.zip"
 curl -L "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m-tranco.zip"
 
+## Cloudflare Radar (optional: skipped when CF_API is unset or empty)
+if [ -n "${CF_API:-}" ]; then
+  mkdir -p "cf/"
+  # Get the latest domain ranking buckets. NOTE(review): if xtrace ("set -x") is still on here, the token is printed in the trace
+  curl -X GET "https://api.cloudflare.com/client/v4/radar/datasets?limit=5&offset=0&datasetType=RANKING_BUCKET&format=json" \
+    -H "Authorization: Bearer $CF_API" -o "cf/datasets.json"
+  # Get the top 1m bucket's dataset ID (left JSON-encoded so it can be spliced into the request body below)
+  DATASET_ID=$(jq ".result.datasets[] | select(.meta.top==1000000) | .id" "cf/datasets.json")
+  # Get the dataset download url
+  curl --request POST \
+    --url "https://api.cloudflare.com/client/v4/radar/datasets/download" \
+    --header "Content-Type: application/json" \
+    --header "Authorization: Bearer $CF_API" \
+    --data "{ \"datasetId\": $DATASET_ID }" \
+    -o "cf/dataset-url.json"
+  DATASET_URL=$(jq -r ".result.dataset.url" "cf/dataset-url.json")
+  curl -L "$DATASET_URL" -o "cf/top-1m-radar.zip"
+
+  ## Parse the Radar 1 Million
+  unzip -p "cf/top-1m-radar.zip" | \
+  dos2unix | \
+  tr "[:upper:]" "[:lower:]" | \
+  grep -F "." | \
+  sed "s/^www\.//g" | \
+  sort -u > "top-1m-radar.txt"
+fi
+
+
 ## Parse URLs
 unzip -p "source.zip" | \
 dos2unix | \
@@ -58,10 +85,8 @@ sort -u > "top-1m-umbrella.txt"
 unzip -p "top-1m-tranco.zip" | \
 dos2unix | \
 tr "[:upper:]" "[:lower:]" | \
-# Parse domains only
 cut -f 2 -d "," | \
 grep -F "." | \
-# Remove www.
 sed "s/^www\.//g" | \
 sort -u > "top-1m-tranco.txt"
 
@@ -75,10 +100,16 @@ cp "../src/exclude.txt" "."
 # ## Append new line https://unix.stackexchange.com/a/31955
 # sed '$a\' > "oisd-exclude.txt"
 
-# Merge Umbrella, Traco and self-maintained top domains
+# Merge Umbrella, Tranco, Radar and self-maintained top domains
 cat "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" | \
 sort -u > "top-1m-well-known.txt"
 
+if [ -n "${CF_API:-}" ] && [ -f "top-1m-radar.txt" ]; then
+  cat "top-1m-radar.txt" >> "top-1m-well-known.txt"
+  # Re-sort and de-duplicate in place (sort -o may name one of its input files)
+  sort -u -o "top-1m-well-known.txt" "top-1m-well-known.txt"
+fi
+
 
 ## Exclude popular domains
 cat "source-domains.txt" | \
@@ -239,7 +270,7 @@ sed -i "1s/Blocklist/Suricata Ruleset/" "../public/pup-filter-suricata.rules"
 
 
 ## Clean up artifacts
-rm "source.zip" "source-domains.txt" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt"
+rm "source.zip" "source-domains.txt" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "cf/" "top-1m-radar.txt"
 
 
 cd ../