From d4bef6923f8e25b211fe87e87e0df03ae74b87e5 Mon Sep 17 00:00:00 2001 From: curben Date: Tue, 28 May 2019 12:32:08 +0930 Subject: [PATCH] fix: merge duplicate operations remove simple comments --- utils/malware-domains.sh | 6 ++---- utils/malware-url-top-domains.sh | 1 - utils/prerequisites.sh | 16 +++++++++------- utils/umbrella-top-1m.sh | 5 +---- utils/urlhaus-filter.sh | 1 - utils/urlhaus-top-domains.sh | 3 +-- 6 files changed, 13 insertions(+), 19 deletions(-) diff --git a/utils/malware-domains.sh b/utils/malware-domains.sh index cd8fc7be..f27a6c32 100644 --- a/utils/malware-domains.sh +++ b/utils/malware-domains.sh @@ -4,7 +4,5 @@ set -e -x ## Parse domains from URLhaus excluding popular domains -cat urlhaus.txt | \ -# Exclude Umbrella Top 1M and well-known domains -# grep inverse match whole line -grep -Fx -vf urlhaus-top-domains.txt > malware-domains.txt +cat urlhaus-domains.txt | \ +grep -F -vf urlhaus-top-domains.txt > malware-domains.txt diff --git a/utils/malware-url-top-domains.sh b/utils/malware-url-top-domains.sh index c31eb109..f0400ede 100644 --- a/utils/malware-url-top-domains.sh +++ b/utils/malware-url-top-domains.sh @@ -5,5 +5,4 @@ set -e -x ## Parse malware URLs from popular URLhaus domains cat urlhaus.txt | \ -# Parse URLs from popular domains only grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt diff --git a/utils/prerequisites.sh b/utils/prerequisites.sh index 9e0e29a6..88478779 100644 --- a/utils/prerequisites.sh +++ b/utils/prerequisites.sh @@ -17,12 +17,14 @@ dos2unix | \ # Remove comment sed 's/^#.*//g' | \ # Remove http(s):// -cut -f 3 -d '/' | \ -# Remove port number -cut -f 1 -d ':' | \ -# Remove www -# Only matches domains that start with www -# Not examplewww.com +cut -f 3- -d '/' | \ +# Remove www. sed 's/^www\.//g' | \ -# Sort and remove duplicates sort -u > urlhaus.txt + +## Parse domain and IP address only +cat urlhaus.txt | \ +cut -f 1 -d '/' | \ +cut -f 1 -d ':' | \ +# Sort and remove duplicates +sort -u > urlhaus-domains.txt \ No newline at end of file diff --git a/utils/umbrella-top-1m.sh b/utils/umbrella-top-1m.sh index 2fe8867c..69630f58 100644 --- a/utils/umbrella-top-1m.sh +++ b/utils/umbrella-top-1m.sh @@ -14,11 +14,8 @@ dos2unix | \ cut -f 2 -d ',' | \ # Domain must have at least a 'dot' grep -F '.' | \ -# Remove www -# Only matches domains that start with www -# Not examplewww.com +# Remove www. sed 's/^www\.//g' | \ -# Remove duplicates sort -u > top-1m.txt # Merge Umbrella and self-maintained top domains diff --git a/utils/urlhaus-filter.sh b/utils/urlhaus-filter.sh index 5dccd08f..d40fc3bd 100644 --- a/utils/urlhaus-filter.sh +++ b/utils/urlhaus-filter.sh @@ -15,7 +15,6 @@ SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/" COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE" cat malware-domains.txt malware-url-top-domains.txt | \ -# Sort alphabetically sort | \ # Append header comment to the filter list sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt diff --git a/utils/urlhaus-top-domains.sh b/utils/urlhaus-top-domains.sh index a75d80ac..254184cd 100644 --- a/utils/urlhaus-top-domains.sh +++ b/utils/urlhaus-top-domains.sh @@ -4,7 +4,6 @@ set -e -x ## Parse popular domains from URLhaus -cat urlhaus.txt | \ -# Exclude Umbrella Top 1M and well-known domains +cat urlhaus-domains.txt | \ # grep match whole line grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt