Use dos2unix instead of sed

Add sed workaround for matching newlines: https://stackoverflow.com/a/1252191
2018-10-11 14:15:59 +10:30 · 2018-10-11 14:15:59 +10:30 · 88d6447fe0
parent b76378f607
commit 88d6447fe0
2 changed files with 10 additions and 6 deletions
--- a/utils/script.sh
+++ b/utils/script.sh
@@ -15,15 +15,17 @@ wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv

 cat ../src/URLhaus.csv | \
 # Convert DOS to Unix line ending
-sed -z -e 's/\r\n/\n/g' | \
+dos2unix | \
 # Parse online URLs only
 grep '"online"' | \
 # Parse domains and IP address only
 cut -f 6 -d '"' | \
 cut -f 3 -d '/' | \
 cut -f 1 -d ':' | \
-# Remove www.
-sed -z -e 's/\nwww\./\n/g' | \
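+# Remove the leading "www." prefix
+# Only matches domains that start with "www." (not examplewww.com)
+# NOTE: the match needs a preceding \n, so the first input line is not stripped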
+sed ':a;N;$!ba;s/\nwww\./\n/g' | \
 # Sort and remove duplicates
 sort -u | \
 # Exclude Umbrella Top 1M. grep inverse match whole line
--- a/utils/umbrella-top-1m.sh
+++ b/utils/umbrella-top-1m.sh
@@ -10,11 +10,13 @@ wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m
 # Decompress the zip and write output to stdout
 unzip -p top-1m.csv.zip | \
 # Convert DOS to Unix line ending
-sed -z -e 's/\r\n/\n/g' | \
+dos2unix | \
 # Parse domains only
 cut -f 2 -d ',' | \
-# Remove www.
-sed -z -e 's/\nwww\./\n/g' | \
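+# Remove the leading "www." prefix
+# Only matches domains that start with "www." (not examplewww.com)
+# NOTE: the match needs a preceding \n, so the first input line is not stripped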
+sed ':a;N;$!ba;s/\nwww\./\n/g' | \
 # Remove duplicates
 sort -u > ../src/top-1m.txt