From 013267e310dcdfaa6eb67e2bd0e1d3f0b3d11a53 Mon Sep 17 00:00:00 2001 From: curben Date: Sun, 12 May 2019 12:29:52 +0930 Subject: [PATCH 1/3] perf: grep using urlhaus-top-domains.txt instead of much larger top-1m.txt --- .gitlab-ci.yml | 6 +++--- utils/malware-domains.sh | 2 +- utils/urlhaus-filter.sh | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a6d30d10..828963df 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -42,12 +42,12 @@ deploy: # Process the Umbrella Top 1M - sh ../utils/umbrella-top-1m.sh - # Parse domains from URLhaus excluding popular domains - - sh ../utils/malware-domains.sh - # Parse popular domains that also appear in URLhaus - sh ../utils/urlhaus-top-domains.sh + # Parse domains from URLhaus excluding popular domains + - sh ../utils/malware-domains.sh + # Parse malware URLs from popular domains - sh ../utils/malware-url-top-domains.sh diff --git a/utils/malware-domains.sh b/utils/malware-domains.sh index d4632fc8..2606aa25 100644 --- a/utils/malware-domains.sh +++ b/utils/malware-domains.sh @@ -19,4 +19,4 @@ sed -e 's/^www\.//g' | \ sort -u | \ # Exclude Umbrella Top 1M and well-known domains # grep inverse match whole line -grep -Fx -vf top-1m-well-known.txt > malware-domains.txt +grep -Fx -vf urlhaus-top-domains.txt > malware-domains.txt diff --git a/utils/urlhaus-filter.sh b/utils/urlhaus-filter.sh index 8e2c87f7..b7462310 100644 --- a/utils/urlhaus-filter.sh +++ b/utils/urlhaus-filter.sh @@ -3,7 +3,6 @@ ## Merge malware-domains.txt malware-url-top-domains.txt, ## and append a header to instruct uBO to grab the filter daily. - CURRENT_TIME="$(date -R -u)" FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist" SECOND_LINE="! Updated: $CURRENT_TIME" From 6e1a6b4c582fd88d5e84cf6a0ba07d57fa68e6ae Mon Sep 17 00:00:00 2001 From: curben Date: Sun, 12 May 2019 12:40:44 +0930 Subject: [PATCH 2/3] style: fix typo in comment --- utils/urlhaus-top-domains.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/urlhaus-top-domains.sh b/utils/urlhaus-top-domains.sh index e8d3484d..b8e9af98 100644 --- a/utils/urlhaus-top-domains.sh +++ b/utils/urlhaus-top-domains.sh @@ -18,5 +18,5 @@ sed -e 's/^www\.//g' | \ # Sort and remove duplicates sort -u | \ # Exclude Umbrella Top 1M and well-known domains -# grep inverse match whole line +# grep match whole line grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt From 43fdf9893f573f8c1869a26242639dccb60bd234 Mon Sep 17 00:00:00 2001 From: curben Date: Sun, 12 May 2019 12:45:23 +0930 Subject: [PATCH 3/3] docs: clarify malware-url-top-domains.sh --- utils/malware-url-top-domains.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/malware-url-top-domains.sh b/utils/malware-url-top-domains.sh index 445f55e1..6df3efe4 100644 --- a/utils/malware-url-top-domains.sh +++ b/utils/malware-url-top-domains.sh @@ -17,5 +17,5 @@ cut -f 1- -d ':' | \ sed -e 's/^www\.//g' | \ # Sort and remove duplicates sort -u | \ -# Include URLs from popular domains +# Parse URLs from popular domains only grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt