urlhaus-filter/utils/script.sh

#!/bin/sh

# Download the URLhaus database dump and process it to be uBO-compatible

# Metadata header of the generated filter list. Every *_LINE variable holds
# one "! Key: value" comment line.

# Generation timestamp: UTC (-u), RFC 5322 e-mail date format (-R)
CURRENT_TIME="$(date -u -R)"

FIRST_LINE='! Title: abuse.ch URLhaus Malicious URL Blocklist'
SECOND_LINE="! Updated: ${CURRENT_TIME}"
THIRD_LINE='! Expires: 1 day (update frequency)'
FOURTH_LINE='! Repo: https://gitlab.com/curben/urlhaus-filter'
FIFTH_LINE='! License: https://creativecommons.org/publicdomain/zero/1.0/'
SIXTH_LINE='! Source: https://urlhaus.abuse.ch/api/'

# The lines are joined with the literal two-character sequence "\n", not with
# real newlines; the step that writes the header is what expands them.
COMMENT="${FIRST_LINE}\n${SECOND_LINE}\n${THIRD_LINE}\n${FOURTH_LINE}\n${FIFTH_LINE}\n${SIXTH_LINE}"

# Download the database dump.
# Stop on failure: wget -O truncates the target file as soon as it starts, so
# after a failed download the later steps would read an empty or partial dump
# and overwrite the published filter list with it.
if ! wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv; then
  printf '%s\n' 'Failed to download the URLhaus database dump' >&2
  exit 1
fi

# Build ../urlhaus-filter.txt: the header comment followed by the sorted,
# de-duplicated hosts of every "online" entry in the dump, minus any host
# listed in top-1m.txt or exclude.txt.
{
  # Prepend the header comment to the filter list.
  # COMMENT stores its line breaks as literal "\n" sequences, which printf's
  # %b expands. The header is printed unconditionally: inserting it with
  # sed '1 i\...' would emit nothing when no host survives the filters (an
  # empty stream has no line 1) and depends on GNU sed's one-line form.
  printf '%b\n' "$COMMENT"

  # Convert DOS to Unix line ending
  dos2unix < ../src/URLhaus.csv |
    # Keep online URLs only
    grep '"online"' |
    # Keep the domain or IP address only:
    #   6th field when splitting on double quotes (presumably the URL column
    #   of the CSV; verify against the dump format),
    #   3rd field when splitting on "/" (the host[:port] part of scheme://...),
    #   1st field when splitting on ":" (drops the port)
    cut -f 6 -d '"' |
    cut -f 3 -d '/' |
    cut -f 1 -d ':' |
    # Remove www
    # Only matches domains that start with www
    # Not examplewww.com
    sed -e 's/^www\.//g' |
    # Sort and remove duplicates
    sort -u |
    # Exclude Umbrella Top 1M. grep inverse match whole line
    grep -Fx -vf ../src/top-1m.txt |
    # Exclude false positive
    grep -Fx -vf ../src/exclude.txt
} > ../urlhaus-filter.txt

# Clean up: delete the Umbrella Top 1M list read by the exclusion step
rm -- ../src/top-1m.txt
Initial commit 2018-10-09 06:18:46 +00:00			`#!/bin/sh`

			`# Download the URLhaus database dump and process it to be uBO-compatible`

			`CURRENT_TIME="$(date -R -u)"`
			`FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist"`
			`SECOND_LINE="! Updated: $CURRENT_TIME"`
Add filter update frequency to 1 day Filter is updated twice a day 2018-10-11 04:18:21 +00:00			`THIRD_LINE="! Expires: 1 day (update frequency)"`
Update repo link 2018-10-12 01:09:37 +00:00			`FOURTH_LINE="! Repo: https://gitlab.com/curben/urlhaus-filter"`
Add filter update frequency to 1 day Filter is updated twice a day 2018-10-11 04:18:21 +00:00			`FIFTH_LINE="! License: https://creativecommons.org/publicdomain/zero/1.0/"`
			`SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/"`
Fix header comment 2018-10-11 04:41:33 +00:00			`COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE"`
Initial commit 2018-10-09 06:18:46 +00:00
			`# Download the database dump`
Reorganise files 2018-10-10 06:44:49 +00:00			`wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv`
Initial commit 2018-10-09 06:18:46 +00:00
Fix path 2018-10-11 00:42:35 +00:00			`cat ../src/URLhaus.csv \| \`
Match whole line for faster search Use unix line ending as standard 2018-10-11 03:20:48 +00:00			`# Convert DOS to Unix line ending`
Use dos2unix instead of sed Add sed workaround for matching new line https://stackoverflow.com/a/1252191 2018-10-11 03:45:59 +00:00			`dos2unix \| \`
Match whole line for faster search Use unix line ending as standard 2018-10-11 03:20:48 +00:00			`# Parse online URLs only`
Initial commit 2018-10-09 06:18:46 +00:00			`grep '"online"' \| \`
Match whole line for faster search Use unix line ending as standard 2018-10-11 03:20:48 +00:00			`# Parse domains and IP address only`
Initial commit 2018-10-09 06:18:46 +00:00			`cut -f 6 -d '"' \| \`
			`cut -f 3 -d '/' \| \`
			`cut -f 1 -d ':' \| \`
Use dos2unix instead of sed Add sed workaround for matching new line https://stackoverflow.com/a/1252191 2018-10-11 03:45:59 +00:00			`# Remove www`
			`# Only matches domains that start with www`
			`# Not examplewww.com`
Use simpler sed syntax for matching beginning of a line 2018-10-11 04:10:18 +00:00			`sed -e 's/^www\.//g' \| \`
Initial commit 2018-10-09 06:18:46 +00:00			`# Sort and remove duplicates`
			`sort -u \| \`
Match whole line for faster search Use unix line ending as standard 2018-10-11 03:20:48 +00:00			`# Exclude Umbrella Top 1M. grep inverse match whole line`
			`grep -Fx -vf ../src/top-1m.txt \| \`
Initial commit 2018-10-09 06:18:46 +00:00			`# Exclude false positive`
Match whole line for faster search Use unix line ending as standard 2018-10-11 03:20:48 +00:00			`grep -Fx -vf ../src/exclude.txt \| \`
Initial commit 2018-10-09 06:18:46 +00:00			`# Append header comment to the filter list`
Reorganise files 2018-10-10 06:44:49 +00:00			`sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt`
Remove top-1m.txt: the dataset is not in the public domain and may be subject to a copyright claim by Umbrella/Cisco. 2018-10-22 03:10:22 +00:00
			`# Remove downloaded dataset`
			`rm ../src/top-1m.txt`