#!/bin/sh
## Parse popular domains from URLhaus
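# Assumed CSV layout (inferred from the field positions used below, not
# verified against the current upstream export):
#   "id","dateadded","url","url_status",...
# The pipeline extracts the hostname of every online URL and keeps only
# those that also appear in top-1m-well-known.txt.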
cat URLhaus.csv | \
# Convert DOS to Unix line ending
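# (presumably the upstream CSV ships with CRLF line endings; dos2unix strips
# the carriage returns so the later text processing sees clean Unix lines)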
dos2unix | \
# Parse online URLs only
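# (the surrounding double quotes match the quoted CSV status field, which
# avoids false hits on URLs that merely contain the word "online")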
grep '"online"' | \
# Parse domains and IP address only
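# Worked example (hypothetical record, for illustration only):
#   "1","2019-05-11 08:00:00","http://www.example.com:8080/bad.exe","online",...
#   cut -f 6 -d '"'  ->  http://www.example.com:8080/bad.exe   (url column)
#   cut -f 3 -d '/'  ->  www.example.com:8080                  (host and port)
#   cut -f 1 -d ':'  ->  www.example.com                       (port dropped)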
cut -f 6 -d '"' | \
cut -f 3 -d '/' | \
cut -f 1 -d ':' | \
# Remove www
# Only matches domains that start with www
# Not examplewww.com
sed -e 's/^www\.//g' | \
# Sort and remove duplicates
sort -u | \
# Keep only domains that also appear in the Umbrella Top 1M and well-known domains list
# grep match whole line
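# -F treats each list entry as a fixed string and -x requires the whole line
# to match, so "example.com" cannot partially match "notexample.com"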
grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt
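# urlhaus-top-domains.txt now holds the URLhaus hosts that also appear in the
# popular/well-known list; a quick sanity check could be (illustrative only):
#   wc -l urlhaus-top-domains.txt && head urlhaus-top-domains.txt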