#!/bin/sh

## Parse the Cisco Umbrella 1 Million
## More info:
## https://s3-us-west-1.amazonaws.com/umbrella-static/index.html

# Build top-1m.txt: the de-duplicated domain list extracted from
# top-1m.csv.zip in the current directory.
# Exits with status 1, without replacing an existing top-1m.txt, when the
# archive is not readable or when no domains come out of the pipeline.

if [ ! -r top-1m.csv.zip ]; then
  echo 'error: top-1m.csv.zip is missing or not readable' >&2
  exit 1
fi

# Decompress the zip and write output to stdout
unzip -p top-1m.csv.zip |
  # Convert DOS to Unix line ending
  dos2unix |
  # Parse domains only (second comma-separated field)
  cut -f 2 -d ',' |
  # Remove a leading "www." only; the pattern is anchored, so
  # examplewww.com is left untouched
  sed 's/^www\.//' |
  # Domain must have at least a 'dot'. This filter runs after the www
  # removal so an entry such as www.com cannot slip through as a bare "com".
  grep -F '.' |
  # Remove duplicates; write to a scratch file so a failed run never
  # clobbers the previous top-1m.txt
  sort -u > top-1m.txt.tmp

# The pipeline's exit status is only that of the final sort, so a failure in
# an earlier stage is detected here through an empty result.
if [ ! -s top-1m.txt.tmp ]; then
  rm -f top-1m.txt.tmp
  echo 'error: no domains extracted from top-1m.csv.zip' >&2
  exit 1
fi

mv -f top-1m.txt.tmp top-1m.txt || exit 1

# Merge Umbrella and self-maintained top domains into
# top-1m-well-known.txt (sorted, duplicates removed).
# sort reads both lists directly, so a missing or unreadable input makes the
# merge fail instead of being silently skipped. -o lets sort open the output
# file itself rather than having the shell truncate it before sort starts.
if ! sort -u -o top-1m-well-known.txt top-1m.txt exclude.txt; then
  echo 'error: could not merge top-1m.txt and exclude.txt' >&2
  exit 1
fi