#!/bin/sh set -e -x ## Parse popular domains from URLhaus cat URLhaus.csv | \ # Convert DOS to Unix line ending dos2unix | \ # Parse online URLs only #grep '"online"' | \ # Parse domains and IP address only cut -f 6 -d '"' | \ cut -f 3 -d '/' | \ cut -f 1 -d ':' | \ # Remove www # Only matches domains that start with www # Not examplewww.com sed 's/^www\.//g' | \ # Sort and remove duplicates sort -u | \ # Exclude Umbrella Top 1M and well-known domains # grep match whole line grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt