build: lowercase all links
This commit is contained in:
parent
90cba6fd10
commit
74192f7e91
|
@ -18,6 +18,7 @@ cp "../src/exclude.txt" "."
|
|||
unzip -p "urlhaus.zip" | \
|
||||
# Convert DOS (CRLF) line endings to Unix (LF) line endings
|
||||
dos2unix | \
|
||||
tr "[:upper:]" "[:lower:]" | \
|
||||
# Remove comment lines (lines starting with "#")
|
||||
sed "/^#/d" > "URLhaus.csv"
|
||||
|
||||
|
@ -61,6 +62,7 @@ sort -u > "urlhaus-domains-online.txt"
|
|||
## Parse the Umbrella 1 Million
|
||||
unzip -p "top-1m-umbrella.zip" | \
|
||||
dos2unix | \
|
||||
tr "[:upper:]" "[:lower:]" | \
|
||||
# Keep only the domain column (second comma-separated field)
|
||||
cut -f 2 -d "," | \
|
||||
grep -F "." | \
|
||||
|
@ -71,6 +73,7 @@ sort -u > "top-1m-umbrella.txt"
|
|||
## Parse the Tranco 1 Million
|
||||
unzip -p "top-1m-tranco.zip" | \
|
||||
dos2unix | \
|
||||
tr "[:upper:]" "[:lower:]" | \
|
||||
# Keep only the domain column (second comma-separated field)
|
||||
cut -f 2 -d "," | \
|
||||
grep -F "." | \
|
||||
|
@ -107,6 +110,9 @@ grep -F -f "urlhaus-top-domains.txt" | \
|
|||
sed "s/^/||/g" | \
|
||||
sed "s/$/\$all/g" > "malware-url-top-domains-online.txt"
|
||||
|
||||
cat "urlhaus-online.txt" | \
|
||||
grep -F -f "urlhaus-top-domains.txt" > "malware-url-top-domains-raw-online.txt"
|
||||
|
||||
|
||||
## Merge malware domains and URLs
|
||||
CURRENT_TIME="$(date -R -u)"
|
||||
|
|
Loading…
Reference in New Issue