From 74192f7e915ba26b515a310e49ed33fb6a29bfca Mon Sep 17 00:00:00 2001
From: MDLeom <2809763-curben@users.noreply.gitlab.com>
Date: Thu, 18 Mar 2021 08:43:53 +0000
Subject: [PATCH] build: lowercase all links

---
Reviewer notes (this region is not part of the commit message or the diff):

* Hunks 1-3 insert `tr "[:upper:]" "[:lower:]"` directly after `dos2unix`
  in the pipelines that read "urlhaus.zip", "top-1m-umbrella.zip" and
  "top-1m-tranco.zip", so each stream is lowercased before the later
  `sed` / `cut` / `grep` stages read it.
* Hunk 4 adds a pipeline that filters "urlhaus-online.txt" with the
  fixed-string patterns in "urlhaus-top-domains.txt" (`grep -F -f`) and
  writes the matches to "malware-url-top-domains-raw-online.txt".
* NOTE(review): in hunk 1 `tr` runs before any field is selected, so it
  folds every character of each line, not only host names. Confirm that
  folding case-sensitive URL paths is intended.
* NOTE(review): the last hunk header declares 6 old / 9 new lines, but
  only 5 old / 8 new lines are present in this copy; the final context
  line (and any mail signature) appears to be cut off. Restore it from
  the original commit before applying.

 script.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/script.sh b/script.sh
index a423fed1..82a47801 100644
--- a/script.sh
+++ b/script.sh
@@ -18,6 +18,7 @@ cp "../src/exclude.txt" "."
 unzip -p "urlhaus.zip" | \
 # Convert DOS to Unix line ending
 dos2unix | \
+tr "[:upper:]" "[:lower:]" | \
 # Remove comment
 sed "/^#/d" > "URLhaus.csv"
 
@@ -61,6 +62,7 @@ sort -u > "urlhaus-domains-online.txt"
 ## Parse the Umbrella 1 Million
 unzip -p "top-1m-umbrella.zip" | \
 dos2unix | \
+tr "[:upper:]" "[:lower:]" | \
 # Parse domains only
 cut -f 2 -d "," | \
 grep -F "." | \
@@ -71,6 +73,7 @@ sort -u > "top-1m-umbrella.txt"
 ## Parse the Tranco 1 Million
 unzip -p "top-1m-tranco.zip" | \
 dos2unix | \
+tr "[:upper:]" "[:lower:]" | \
 # Parse domains only
 cut -f 2 -d "," | \
 grep -F "." | \
@@ -107,6 +110,9 @@ grep -F -f "urlhaus-top-domains.txt" | \
 sed "s/^/||/g" | \
 sed "s/$/\$all/g" > "malware-url-top-domains-online.txt"
 
+cat "urlhaus-online.txt" | \
+grep -F -f "urlhaus-top-domains.txt" > "malware-url-top-domains-raw-online.txt"
+
 ## Merge malware domains and URLs
 CURRENT_TIME="$(date -R -u)"