#!/bin/sh

## Parse the Cisco Umbrella 1 Million
## More info:
## https://s3-us-west-1.amazonaws.com/umbrella-static/index.html

# Build top-1m.txt: the sorted, de-duplicated list of domains found in the
# Cisco Umbrella archive top-1m.csv.zip (read from the current directory).

# Stop on the first failing command or unset variable. `pipefail` is left out
# on purpose: it is not available in every /bin/sh.
set -eu

# Without this guard a missing archive only makes unzip complain on stderr,
# while the rest of the pipeline still runs and replaces top-1m.txt with an
# empty file.
if [ ! -r top-1m.csv.zip ]; then
  printf '%s\n' 'top-1m.csv.zip is missing or not readable' >&2
  exit 1
fi

# Decompress the zip and write output to stdout
unzip -p top-1m.csv.zip |
  # Convert DOS to Unix line ending
  dos2unix |
  # Parse domains only (second comma-separated field)
  cut -d ',' -f 2 |
  # Domain must have at least a 'dot'
  grep -F '.' |
  # Remove a leading "www." label.
  # Only matches domains that start with www, not examplewww.com.
  # The captured remainder must itself contain a dot, so an entry such as
  # www.com is kept as is instead of being reduced to the dotless "com".
  sed -e 's/^www\.\(.*\.\)/\1/' |
  # Remove duplicates
  sort -u > top-1m.txt
# Merge Umbrella and self-maintained top domains into one de-duplicated list.
# sort reads both lists itself, so no cat is needed. With the files passed as
# operands, an unreadable list makes sort exit non-zero instead of being
# silently left out of the merge (a failing cat was hidden by the pipeline).
sort -u top-1m.txt exclude.txt > top-1m-well-known.txt