#!/bin/sh

# Download the URLhaus database dump and process it to be uBO-compatible
#
# Inputs (must already exist): ../src/top-1m.txt, ../src/exclude.txt
# Outputs: ../src/URLhaus.csv (raw download), ../urlhaus-filter.txt (result)
# Tools: wget, dos2unix, grep, cut, sed, sort, and a date(1) that accepts -R
#
# Every path is relative to the current working directory.

# Stop at the first failed command or unset variable, so that a failed
# download never ends up replacing the published filter list.
set -eu

URLHAUS_URL="https://urlhaus.abuse.ch/downloads/csv/"
URLHAUS_CSV="../src/URLhaus.csv"
TOP_1M="../src/top-1m.txt"
EXCLUDE="../src/exclude.txt"
OUTPUT="../urlhaus-filter.txt"
# The list is built here and only moved over $OUTPUT once it is complete
TMP_OUTPUT="$OUTPUT.tmp"

# Print a message to stderr and abort
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove the partial output on any exit path (a no-op after a successful mv)
trap 'rm -f -- "$TMP_OUTPUT"' EXIT

# grep -f on a missing file would fail in the middle of the pipeline;
# catch that before doing any work
for required in "$TOP_1M" "$EXCLUDE"; do
  [ -r "$required" ] || die "Missing or unreadable file: $required"
done

# Header comment of the filter list
CURRENT_TIME="$(date -R -u)"
FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist"
SECOND_LINE="! Updated: $CURRENT_TIME"
THIRD_LINE="! Expires: 1 day (update frequency)"
FOURTH_LINE="! Repo: https://gitlab.com/curben/urlhaus-filter"
FIFTH_LINE="! License: https://creativecommons.org/publicdomain/zero/1.0/"
SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/"
# Real newlines (not a literal \n), so no tool has to expand escapes later
COMMENT="$FIRST_LINE
$SECOND_LINE
$THIRD_LINE
$FOURTH_LINE
$FIFTH_LINE
$SIXTH_LINE"

# Download the database dump
wget "$URLHAUS_URL" -O "$URLHAUS_CSV"

{
  # Write the header first and unconditionally, so it is present even when
  # no domain survives the filters below
  printf '%s\n' "$COMMENT"

  # Convert DOS to Unix line ending
  dos2unix < "$URLHAUS_CSV" |
    # Parse online URLs only
    grep '"online"' |
    # Parse domains and IP address only:
    # 6th '"'-delimited field, then the part after "scheme://",
    # then drop any ":port" suffix
    cut -f 6 -d '"' |
    cut -f 3 -d '/' |
    cut -f 1 -d ':' |
    # Remove www
    # Only matches domains that start with www
    # Not examplewww.com
    sed -e 's/^www\.//g' |
    # Sort and remove duplicates
    sort -u |
    # Exclude Umbrella Top 1M. grep inverse match whole line
    grep -Fx -vf "$TOP_1M" |
    # Exclude false positive
    # grep exits 1 when it selects no line; that is an empty list, not an
    # error. Any other non-zero status still aborts the script.
    grep -Fx -vf "$EXCLUDE" || [ "$?" -eq 1 ]
} > "$TMP_OUTPUT"

# Publish the finished list
mv -- "$TMP_OUTPUT" "$OUTPUT"

# Remove downloaded dataset
# NOTE(review): this deletes top-1m.txt, which this script does not download
# (presumably fetched by an earlier step), while URLhaus.csv is kept.
# Confirm that this is the intended file.
rm -- "$TOP_1M"