#!/bin/sh
## Parse popular domains from URLhaus
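# Assumed CSV layout (inferred from the field positions used below, not
# verified against the current upstream export):
#   "id","dateadded","url","url_status",...
# The pipeline extracts the hostname of every online URL and keeps only
# those that also appear in top-1m-well-known.txt.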
cat URLhaus.csv | \
# Convert DOS to Unix line ending
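# (presumably the upstream CSV ships with CRLF line endings; dos2unix strips
# the carriage returns so the later text processing sees clean Unix lines)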
dos2unix | \
# Parse online URLs only
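# (the surrounding double quotes match the quoted CSV status field, which
# avoids false hits on URLs that merely contain the word "online")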
grep '"online"' | \
# Parse domains and IP address only
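# Worked example (hypothetical record, for illustration only):
#   "1","2019-05-11 08:00:00","http://www.example.com:8080/bad.exe","online",...
#   cut -f 6 -d '"'  ->  http://www.example.com:8080/bad.exe   (url column)
#   cut -f 3 -d '/'  ->  www.example.com:8080                  (host and port)
#   cut -f 1 -d ':'  ->  www.example.com                       (port dropped)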
cut -f 6 -d '"' | \
cut -f 3 -d '/' | \
cut -f 1 -d ':' | \
# Remove www
# Only matches domains that start with www
# Not examplewww.com
sed -e 's/^www\.//g' | \
# Sort and remove duplicates
sort -u | \
# Keep only domains that also appear in the Umbrella Top 1M and well-known domains list
# grep match whole line
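# -F treats each list entry as a fixed string and -x requires the whole line
# to match, so "example.com" cannot partially match "notexample.com"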
grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt
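# urlhaus-top-domains.txt now holds the URLhaus hosts that also appear in the
# popular/well-known list; a quick sanity check could be (illustrative only):
#   wc -l urlhaus-top-domains.txt && head urlhaus-top-domains.txt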