#!/bin/sh set -e -x ## Parse domains from URLhaus excluding popular domains cat URLhaus.csv | \ # Convert DOS to Unix line ending dos2unix | \ # Parse online URLs only grep '"online"' | \ # Parse domains and IP address only cut -f 6 -d '"' | \ cut -f 3 -d '/' | \ cut -f 1 -d ':' | \ # Remove www # Only matches domains that start with www # Not examplewww.com sed 's/^www\.//g' | \ # Sort and remove duplicates sort -u | \ # Exclude Umbrella Top 1M and well-known domains # grep inverse match whole line grep -Fx -vf urlhaus-top-domains.txt > malware-domains.txt