#!/bin/sh
#
# Download the URLhaus URL list and the Cisco Umbrella top-1M archive, then
# derive two sorted, de-duplicated lists from the URLhaus data:
#
#   urlhaus.txt          URLs with the scheme ("http://", "https://") and a
#                        leading "www." removed
#   urlhaus-domains.txt  host part only (domain or IP; no path, no port)
#
# All paths are relative to the current working directory: inputs are read
# from / written to ../src, outputs are written to the current directory.

# Abort on the first failing command and trace every command as it runs.
set -e -x

# POSIX sh has no portable `pipefail`, so a missing dos2unix in the middle of
# the pipeline below would be masked by the exit status of the final `sort`
# and silently produce empty lists. Fail early instead.
if ! command -v dos2unix > /dev/null; then
  printf '%s\n' 'error: dos2unix is required but was not found in PATH' >&2
  exit 1
fi

# Download URLhaus database
wget https://urlhaus.abuse.ch/downloads/text/ -O ../src/URLhaus.txt

# Download Cisco Umbrella 1 Million
# (the archive is only fetched here; it is not unpacked in this section)
wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip

# Copy the exclusion list into the current directory
cp ../src/exclude.txt .

## Clean up URLhaus.txt
#  1. dos2unix      convert DOS line endings to Unix
#  2. sed /^#/d     delete comment lines outright (blanking them instead would
#                   leave an empty entry at the top of the sorted output)
#  3. cut -f 3-     remove "http(s)://" by keeping everything after the
#                   second "/"
#  4. sed           remove a leading "www." and drop lines that ended up empty
#  5. sort -u       sort and de-duplicate
dos2unix < ../src/URLhaus.txt \
  | sed -e '/^#/d' \
  | cut -f 3- -d '/' \
  | sed -e 's/^www\.//' -e '/^$/d' \
  | sort -u > urlhaus.txt

## Parse domain and IP address only
#  Keep the text before the first "/" (drops the path), then the text before
#  the first ":" (drops the port).
cut -f 1 -d '/' < urlhaus.txt \
  | cut -f 1 -d ':' \
  | sort -u > urlhaus-domains.txt