urlhaus-filter/utils/prerequisites.sh

30 lines
659 B
Bash

#!/bin/sh
# Download the raw inputs for the filter build: the URLhaus plain-text URL
# list and the Cisco Umbrella top-1m archive, then copy the exclusion list
# into the current directory. All paths are relative to the current working
# directory.
# -e aborts on the first failing command; -x traces every command.
set -e -x

# fetch URL DEST
# Download URL into DEST.
# wget -O truncates its output file before the transfer starts, so the data
# is written to "DEST.part" first and only renamed over DEST once wget has
# succeeded. A failed download therefore leaves any existing DEST untouched
# and returns non-zero.
fetch() {
  if ! wget "$1" -O "$2.part"; then
    rm -f -- "$2.part"
    return 1
  fi
  mv -- "$2.part" "$2"
}

# Download URLhaus database
fetch https://urlhaus.abuse.ch/downloads/text/ ../src/URLhaus.txt
# Download Cisco Umbrella 1 Million
fetch https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip top-1m.csv.zip
cp ../src/exclude.txt .
## Clean up URLhaus.txt
# clean_urlhaus: read the raw URLhaus text list on stdin and write the
# cleaned, de-duplicated entries (scheme and leading "www." stripped) to
# stdout.
clean_urlhaus() {
  # Convert DOS to Unix line ending
  dos2unix |
    # Delete comment lines outright; merely blanking them would leave an
    # empty entry in the output
    sed '/^#/d' |
    # Remove http(s):// (lines without any "/" are passed through unchanged)
    cut -f 3- -d '/' |
    # Remove www., then drop any line that ended up empty
    sed -e 's/^www\.//' -e '/^$/d' |
    # Sort and remove duplicates
    sort -u
}
# Redirect instead of `cat |` so that a missing input file makes this
# command fail rather than silently producing an empty list.
clean_urlhaus < ../src/URLhaus.txt > urlhaus.txt
## Parse domain and IP address only
# extract_hosts: read "host[:port][/path]" entries on stdin and write the
# unique host part (domain or IP address) of each entry to stdout.
extract_hosts() {
  # Keep everything before the first "/" (drops the path)
  cut -f 1 -d '/' |
    # Keep everything before the first ":" (drops the port)
    cut -f 1 -d ':' |
    # Drop blank entries so the host list never contains an empty line
    sed '/^$/d' |
    # Sort and remove duplicates
    sort -u
}
extract_hosts < urlhaus.txt > urlhaus-domains.txt