From e4dc980c96ad8b31049b6c92287a43615727d4a4 Mon Sep 17 00:00:00 2001
From: curben <curben@users.noreply.gitlab.com>
Date: Thu, 11 Oct 2018 13:50:48 +1030
Subject: [PATCH] Match whole line for faster search

Use Unix line endings as standard
---
 utils/script.sh          | 13 +++++++++----
 utils/umbrella-top-1m.sh |  8 +++++++-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/utils/script.sh b/utils/script.sh
index 55dc353e..e6183e7a 100755
--- a/utils/script.sh
+++ b/utils/script.sh
@@ -13,17 +13,22 @@ COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE"
 # Download the database dump
 wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv
 
-# Parse domains and IP address only
 cat ../src/URLhaus.csv | \
+# Convert DOS to Unix line ending (per line, no need to slurp the file)
+sed -e 's/\r$//' | \
+# Parse online URLs only
 grep '"online"' | \
+# Parse domains and IP address only
 cut -f 6 -d '"' | \
 cut -f 3 -d '/' | \
 cut -f 1 -d ':' | \
+# Remove leading www. (anchored per line so the first line is covered too)
+sed -e 's/^www\.//' | \
 # Sort and remove duplicates
 sort -u | \
-# Exclude Umbrella Top 1M
-grep -vf ../src/top-1m.txt | \
+# Exclude Umbrella Top 1M. -F fixed strings, -x whole line, -v invert match
+grep -Fx -vf ../src/top-1m.txt | \
 # Exclude false positive
-grep -vf ../src/exclude.txt | \
+grep -Fx -vf ../src/exclude.txt | \
 # Append header comment to the filter list
 sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt
diff --git a/utils/umbrella-top-1m.sh b/utils/umbrella-top-1m.sh
index 09f73c5b..e04fff51 100755
--- a/utils/umbrella-top-1m.sh
+++ b/utils/umbrella-top-1m.sh
@@ -9,8 +9,14 @@ wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m
 
 # Decompress the zip and write output to stdout
 unzip -p top-1m.csv.zip | \
+# Convert DOS to Unix line ending (per line, no need to slurp the file)
+sed -e 's/\r$//' | \
 # Parse domains only
-cut -f 2 -d ',' > ../src/top-1m.txt
+cut -f 2 -d ',' | \
+# Remove leading www. (anchored per line so the first line is covered too)
+sed -e 's/^www\.//' | \
+# Remove duplicates
+sort -u > ../src/top-1m.txt
 
 # Remove downloaded zip file
 rm top-1m.csv.zip