fix: use simple URL list

we no longer care about the status of the URL
bb817d9838
curben 2019-05-27 15:59:08 +09:30
parent 196c66f2a1
commit 9a5fdb2be6
6 changed files with 194766 additions and 194679 deletions

File diff suppressed because it is too large

src/URLhaus.txt  Normal file  (194747 additions)

File diff suppressed because it is too large


@@ -4,21 +4,7 @@ set -e -x
 ## Parse domains from URLhaus excluding popular domains
-cat URLhaus.csv | \
-# Convert DOS to Unix line ending
-dos2unix | \
-# Parse online URLs only
-#grep '"online"' | \
-# Parse domains and IP address only
-cut -f 6 -d '"' | \
-cut -f 3 -d '/' | \
-cut -f 1 -d ':' | \
-# Remove www
-# Only matches domains that start with www
-# Not examplewww.com
-sed 's/^www\.//g' | \
-# Sort and remove duplicates
-sort -u | \
+cat URLhaus.txt | \
 # Exclude Umbrella Top 1M and well-known domains
 # grep inverse match whole line
 grep -Fx -vf urlhaus-top-domains.txt > malware-domains.txt
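
For reference, grep -Fx -vf drops every input line that exactly matches a line of the pattern file (-F fixed strings, -x whole line, -v invert). A minimal sketch with made-up domains and the assumed exclusion file:

$ cat urlhaus-top-domains.txt
example.com
$ printf 'badsite.xyz\nexample.com\nexample.com.evil.xyz\n' | grep -Fx -vf urlhaus-top-domains.txt
badsite.xyz
example.com.evil.xyz

Because of -x, example.com.evil.xyz is kept: it only contains a popular domain as a substring, it is not one.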


@@ -4,20 +4,6 @@ set -e -x
 ## Parse malware URLs from popular URLhaus domains
-cat URLhaus.csv | \
-# Convert DOS to Unix line ending
-dos2unix | \
-# Parse online URLs only
-#grep '"online"' | \
-# Parse URLs
-cut -f 6 -d '"' | \
-cut -f 3- -d '/' | \
-cut -f 1- -d ':' | \
-# Remove www
-# Only matches domains that start with www
-# Not examplewww.com
-sed 's/^www\.//g' | \
-# Sort and remove duplicates
-sort -u | \
+cat URLhaus.txt | \
 # Parse URLs from popular domains only
 grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt
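
Without -x, grep -F -f matches any line that merely contains one of the patterns as a substring, which is what keeps entries hosted on popular domains. A minimal sketch with made-up entries and the assumed pattern file:

$ cat urlhaus-top-domains.txt
example.com
$ printf 'example.com/malware.exe\nbadsite.xyz/payload.bin\n' | grep -F -f urlhaus-top-domains.txt
example.com/malware.exe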


@@ -3,10 +3,24 @@
 set -e -x
 # Download URLhaus database
 wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv
+wget https://urlhaus.abuse.ch/downloads/text/ -O ../src/URLhaus.txt
 # Download Cisco Umbrella 1 Million
 wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip
 cp ../src/URLhaus.csv .
 cp ../src/exclude.txt .
+## Clean up URLhaus.txt
+cat ../src/URLhaus.txt | \
+# Remove comment
+sed '/^#/ d' | \
+# Convert DOS to Unix line ending
+dos2unix | \
+# Remove http(s)://
+cut -f 3 -d '/' | \
+# Remove www
+# Only matches domains that start with www
+# Not examplewww.com
+sed 's/^www\.//g' | \
+# Sort and remove duplicates
+sort -u > URLhaus.txt
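
The new clean-up step reduces the plain-text URL list (comment header lines starting with #, one URL per line) to bare, deduplicated hostnames. A rough sketch of the transformation, using made-up URLs rather than real URLhaus data:

$ cat sample.txt
# URLhaus plain-text URL list (illustrative header comment)
https://www.example.com/malware.exe
http://198.51.100.7/files/payload.bin
$ cat sample.txt | sed '/^#/ d' | dos2unix | cut -f 3 -d '/' | sed 's/^www\.//g' | sort -u
198.51.100.7
example.com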


@@ -4,21 +4,7 @@ set -e -x
 ## Parse popular domains from URLhaus
-cat URLhaus.csv | \
-# Convert DOS to Unix line ending
-dos2unix | \
-# Parse online URLs only
-#grep '"online"' | \
-# Parse domains and IP address only
-cut -f 6 -d '"' | \
-cut -f 3 -d '/' | \
-cut -f 1 -d ':' | \
-# Remove www
-# Only matches domains that start with www
-# Not examplewww.com
-sed 's/^www\.//g' | \
-# Sort and remove duplicates
-sort -u | \
+cat URLhaus.txt | \
 # Exclude Umbrella Top 1M and well-known domains
 # grep match whole line
 grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt
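
Here grep -Fx -f (without -v) keeps only the cleaned URLhaus hostnames that also appear, as whole lines, in top-1m-well-known.txt, i.e. the intersection of the two lists. A minimal sketch with made-up domains and an assumed pattern file:

$ cat top-1m-well-known.txt
example.com
example.org
$ printf 'badsite.xyz\nexample.com\n' | grep -Fx -f top-1m-well-known.txt
example.com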