From e4dc980c96ad8b31049b6c92287a43615727d4a4 Mon Sep 17 00:00:00 2001
From: curben
Date: Thu, 11 Oct 2018 13:50:48 +1030
Subject: [PATCH] Match whole line for faster search

Use unix line ending as standard
---
Reviewer notes (this section is ignored by git am):
- Line structure of the patch restored; hunk line counts are unchanged.
- CR stripping and "www." removal now run per line ('\r$' and '^www\.')
  instead of on the whole stream with 'sed -z'. The stream-wide pattern
  '\nwww\.' needs a preceding newline, so it never stripped the prefix
  from the very first entry, which then could not match the whole-line
  (grep -Fx) filters.
- The post-image blob ids on the "index" lines predate these edits.

 utils/script.sh          | 13 +++++++++----
 utils/umbrella-top-1m.sh |  8 +++++++-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/utils/script.sh b/utils/script.sh
index 55dc353e..e6183e7a 100755
--- a/utils/script.sh
+++ b/utils/script.sh
@@ -13,17 +13,22 @@ COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE"
 
 # Download the database dump
 wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv
 
-# Parse domains and IP address only
 cat ../src/URLhaus.csv | \
+# Convert DOS to Unix line ending (strip the CR at the end of each line)
+sed -e 's/\r$//' | \
+# Parse online URLs only
 grep '"online"' | \
+# Parse domains and IP address only
 cut -f 6 -d '"' | \
 cut -f 3 -d '/' | \
 cut -f 1 -d ':' | \
+# Remove leading www. from every line, including the first one
+sed -e 's/^www\.//' | \
 # Sort and remove duplicates
 sort -u | \
-# Exclude Umbrella Top 1M
-grep -vf ../src/top-1m.txt | \
+# Exclude Umbrella Top 1M. grep inverse match whole line
+grep -Fx -vf ../src/top-1m.txt | \
 # Exclude false positive
-grep -vf ../src/exclude.txt | \
+grep -Fx -vf ../src/exclude.txt | \
 # Append header comment to the filter list
 sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt
diff --git a/utils/umbrella-top-1m.sh b/utils/umbrella-top-1m.sh
index 09f73c5b..e04fff51 100755
--- a/utils/umbrella-top-1m.sh
+++ b/utils/umbrella-top-1m.sh
@@ -9,8 +9,14 @@ wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m
 
 # Decompress the zip and write output to stdout
 unzip -p top-1m.csv.zip | \
+# Convert DOS to Unix line ending (strip the CR at the end of each line)
+sed -e 's/\r$//' | \
 # Parse domains only
-cut -f 2 -d ',' > ../src/top-1m.txt
+cut -f 2 -d ',' | \
+# Remove leading www. from every line, including the first one
+sed -e 's/^www\.//' | \
+# Remove duplicates
+sort -u > ../src/top-1m.txt
 
 # Remove downloaded zip file
 rm top-1m.csv.zip