Match whole line for faster search
Use Unix line endings as standard
This commit is contained in:
parent
64de7976cc
commit
e4dc980c96
|
@ -13,17 +13,22 @@ COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE"
|
|||
# Download the database dump
|
||||
wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv
|
||||
|
||||
# Parse domains and IP address only
|
||||
cat ../src/URLhaus.csv | \
|
||||
# Convert DOS line endings to Unix line endings
|
||||
sed -z -e 's/\r\n/\n/g' | \
|
||||
# Parse online URLs only
|
||||
grep '"online"' | \
|
||||
# Parse domains and IP addresses only
|
||||
cut -f 6 -d '"' | \
|
||||
cut -f 3 -d '/' | \
|
||||
cut -f 1 -d ':' | \
|
||||
# Remove www.
|
||||
sed -z -e 's/\nwww\./\n/g' | \
|
||||
# Sort and remove duplicates
|
||||
sort -u | \
|
||||
# Exclude Umbrella Top 1M
|
||||
grep -vf ../src/top-1m.txt | \
|
||||
# Exclude Umbrella Top 1M. grep inverse-matches whole lines as fixed strings
|
||||
grep -Fx -vf ../src/top-1m.txt | \
|
||||
# Exclude false positives
|
||||
grep -vf ../src/exclude.txt | \
|
||||
grep -Fx -vf ../src/exclude.txt | \
|
||||
# Append header comment to the filter list
|
||||
sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt
|
||||
|
|
|
@ -9,8 +9,14 @@ wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m
|
|||
|
||||
# Decompress the zip and write output to stdout
|
||||
unzip -p top-1m.csv.zip | \
|
||||
# Convert DOS line endings to Unix line endings
|
||||
sed -z -e 's/\r\n/\n/g' | \
|
||||
# Parse domains only
|
||||
cut -f 2 -d ',' > ../src/top-1m.txt
|
||||
cut -f 2 -d ',' | \
|
||||
# Remove www.
|
||||
sed -z -e 's/\nwww\./\n/g' | \
|
||||
# Remove duplicates
|
||||
sort -u > ../src/top-1m.txt
|
||||
|
||||
# Remove downloaded zip file
|
||||
rm top-1m.csv.zip
|
||||
|
|
Loading…
Reference in New Issue