fix: merge duplicate operations

remove simple comments
2019-05-28 12:32:08 +09:30 · 2019-05-28 12:32:08 +09:30 · d4bef6923f
parent 63442cd576
commit d4bef6923f
6 changed files with 13 additions and 19 deletions
--- a/utils/malware-domains.sh
+++ b/utils/malware-domains.sh
@@ -4,7 +4,5 @@ set -e -x

 ## Parse domains from URLhaus excluding popular domains

-cat urlhaus.txt | \
-# Exclude Umbrella Top 1M and well-known domains
-# grep inverse match whole line
-grep -Fx -vf urlhaus-top-domains.txt > malware-domains.txt
+cat urlhaus-domains.txt | \
+grep -F -vf urlhaus-top-domains.txt > malware-domains.txt
--- a/utils/malware-url-top-domains.sh
+++ b/utils/malware-url-top-domains.sh
@@ -5,5 +5,4 @@ set -e -x
 ## Parse malware URLs from popular URLhaus domains

 cat urlhaus.txt | \
-# Parse URLs from popular domains only
 grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt
--- a/utils/prerequisites.sh
+++ b/utils/prerequisites.sh
@@ -17,12 +17,14 @@ dos2unix | \
 # Remove comment
 sed 's/^#.*//g' | \
 # Remove http(s)://
-cut -f 3 -d '/' | \
-# Remove port number
-cut -f 1 -d ':' | \
-# Remove www
-# Only matches domains that start with www
-# Not examplewww.com
+cut -f 3- -d '/' | \
+# Remove www.
 sed 's/^www\.//g' | \
-# Sort and remove duplicates
 sort -u > urlhaus.txt
+
+## Parse domain and IP address only
+cat urlhaus.txt | \
+cut -f 1 -d '/' | \
+cut -f 1 -d ':' | \
+# Sort and remove duplicates
+sort -u > urlhaus-domains.txt
--- a/utils/umbrella-top-1m.sh
+++ b/utils/umbrella-top-1m.sh
@@ -14,11 +14,8 @@ dos2unix | \
 cut -f 2 -d ',' | \
 # Domain must have at least a 'dot'
 grep -F '.' | \
-# Remove www
-# Only matches domains that start with www
-# Not examplewww.com
+# Remove www.
 sed 's/^www\.//g' | \
-# Remove duplicates
 sort -u > top-1m.txt

 # Merge Umbrella and self-maintained top domains
--- a/utils/urlhaus-filter.sh
+++ b/utils/urlhaus-filter.sh
@@ -15,7 +15,6 @@ SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/"
 COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE"

 cat malware-domains.txt malware-url-top-domains.txt | \
-# Sort alphabetically
 sort | \
 # Append header comment to the filter list
 sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt
--- a/utils/urlhaus-top-domains.sh
+++ b/utils/urlhaus-top-domains.sh
@@ -4,7 +4,6 @@ set -e -x

 ## Parse popular domains from URLhaus

-cat urlhaus.txt | \
-# Exclude Umbrella Top 1M and well-known domains
+cat urlhaus-domains.txt | \
 # grep match whole line
 grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt