From 667a55b51cce96863c327ef0978c5af5ff8fbb4a Mon Sep 17 00:00:00 2001
From: curben <2809763-curben@users.noreply.gitlab.com>
Date: Thu, 2 Apr 2020 22:46:28 +0100
Subject: [PATCH] fix: revert to Umbrella list

- Tranco doesn't include subdomain
- Closes #12
- Closes #11
- Closes #10
- Closes #9
- Revert bb9e8ca7cf3009f51290e96ac0731bda8d181c48
---
 script.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/script.sh b/script.sh
index 5b36dc38..17017613 100644
--- a/script.sh
+++ b/script.sh
@@ -8,7 +8,7 @@ mkdir -p "tmp/" && cd "tmp/"
 
 ## Prepare datasets
 curl -L "https://urlhaus.abuse.ch/downloads/csv/" -o "urlhaus.zip"
-curl -L "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m.csv.zip"
+curl -L "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m.csv.zip"
 
 cp "../src/exclude.txt" "."
 
@@ -49,7 +49,7 @@ cut -f 1 -d ":" | \
 sort -u > "urlhaus-domains-online.txt"
 
 
-## Parse the Tranco 1 Million
+## Parse the Umbrella 1 Million
 unzip -p "top-1m.csv.zip" | \
 dos2unix | \
 # Parse domains only