fix: add oisd exclusion list

- https://oisd.nl/excludes.php
This commit is contained in:
Ming Di Leom 2021-05-03 11:31:23 +00:00
parent 7185e61203
commit 05f68f7ac5
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
2 changed files with 12 additions and 4 deletions

View File

@ -49,4 +49,4 @@ s3-eu-north-1.amazonaws.com
s3-sa-east-1.amazonaws.com
s3-me-south-1.amazonaws.com
s3-us-gov-east-1.amazonaws.com
s3-us-gov-west-1.amazonaws.com
s3-us-gov-west-1.amazonaws.com

View File

@ -11,7 +11,7 @@ cd "tmp/"
# Fetch the raw inputs using relative output paths; -L makes curl follow redirects.
# Malware-discoverer blocklist archive -> source.zip
curl -L "https://zhouhanc.github.io/malware-discoverer/blocklist.csv.zip" -o "source.zip"
# Umbrella top-1m archive -> top-1m-umbrella.zip
curl -L "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m-umbrella.zip"
# Tranco top-1m archive -> top-1m-tranco.zip
curl -L "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m-tranco.zip"
# oisd exclusion page -> oisd-exclude.html (parsed later with xmlstarlet)
curl -L "https://oisd.nl/excludes.php" -o "oisd-exclude.html"
## Parse URLs
unzip -p "source.zip" | \
@ -45,8 +45,16 @@ sort -u > "top-1m-tranco.txt"
cp "../src/exclude.txt" "."
## Parse oisd exclusion list
# Extract the text of every <a> element from the downloaded page into
# "oisd-exclude.txt".
#  1. `format --recover --html` normalises the HTML so it can be queried
#     (https://stackoverflow.com/a/47600828); its stderr is discarded on purpose.
#  2. `select ... --value-of '//a'` prints the text content of each anchor.
#  3. sed appends a trailing newline when the last line lacks one
#     (https://unix.stackexchange.com/a/31955).
# The file is fed via input redirection instead of `cat file |`.
xmlstarlet format --recover --html < "oisd-exclude.html" 2>/dev/null |
  xmlstarlet select --html --template --value-of '//a' |
  sed -e '$a\' > "oisd-exclude.txt"
# Merge Umbrella, Tranco and self-maintained top domains
cat "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" | \
cat "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" "oisd-exclude.txt" | \
sort -u > "top-1m-well-known.txt"
@ -193,7 +201,7 @@ sed -i "1s/Blocklist/Suricata Ruleset/" "../dist/pup-filter-suricata.rules"
## Clean up artifacts
rm "source.zip" "source-domains.txt" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt"
rm "source.zip" "source-domains.txt" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "oisd-exclude.html" "oisd-exclude.txt"
cd ../