build: lowercase all links

This commit is contained in:
MDLeom 2021-03-18 08:43:53 +00:00
parent 90cba6fd10
commit 74192f7e91
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
1 changed file with 6 additions and 0 deletions

View File

@@ -18,6 +18,7 @@ cp "../src/exclude.txt" "."
# Stream the contents of urlhaus.zip to stdout, normalise it, and save the
# result as URLhaus.csv.
unzip -p "urlhaus.zip" | \
# Convert DOS (CRLF) line endings to Unix (LF)
dos2unix | \
# Lowercase the entire stream.
# NOTE(review): tr acts on every character, so URL paths and query strings
# are lowercased along with hostnames - confirm this is intended.
tr "[:upper:]" "[:lower:]" | \
# Delete comment lines (lines that start with "#")
sed "/^#/d" > "URLhaus.csv"
@@ -61,6 +62,7 @@ sort -u > "urlhaus-domains-online.txt"
## Parse the Umbrella 1 Million
unzip -p "top-1m-umbrella.zip" | \
dos2unix | \
tr "[:upper:]" "[:lower:]" | \
# Parse domains only
cut -f 2 -d "," | \
grep -F "." | \
@@ -71,6 +73,7 @@ sort -u > "top-1m-umbrella.txt"
## Parse the Tranco 1 Million
unzip -p "top-1m-tranco.zip" | \
dos2unix | \
tr "[:upper:]" "[:lower:]" | \
# Parse domains only
cut -f 2 -d "," | \
grep -F "." | \
@@ -107,6 +110,9 @@ grep -F -f "urlhaus-top-domains.txt" | \
sed "s/^/||/g" | \
sed "s/$/\$all/g" > "malware-url-top-domains-online.txt"
# Keep only the lines of urlhaus-online.txt that contain at least one entry
# of urlhaus-top-domains.txt as a fixed string (-F, patterns read via -f),
# and write them unmodified to malware-url-top-domains-raw-online.txt.
# The input file is handed to grep directly instead of being piped through
# cat; with a single file operand grep does not prefix lines with the name.
grep -F -f "urlhaus-top-domains.txt" "urlhaus-online.txt" \
  > "malware-url-top-domains-raw-online.txt"
## Merge malware domains and URLs
# Capture the current time in UTC (-u), formatted as an RFC 5322 e-mail
# date (-R). Where CURRENT_TIME is consumed is outside this excerpt.
CURRENT_TIME="$(date -R -u)"