fix: skip tranco if download fails

This commit is contained in:
MDLeom 2025-03-07 23:42:22 +00:00
parent 7e8139510d
commit b94d832896
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
1 changed file with 17 additions and 10 deletions

View File

@ -12,7 +12,7 @@ if [ -n "$BASH_VERSION" ]; then
shopt -s expand_aliases
fi
# Make curl follow HTTP redirects by default (-L).
alias curl="curl -L"
# Redefines the curl alias from the previous line, so this is the definition
# that stays in effect for the rest of the script.
# NOTE(review): -i makes curl include the HTTP response headers in its output,
# so they would end up inside any downloaded file -- confirm this is intended.
alias curl="curl -iL"
# Make every rm call recursive and forced.
alias rm="rm -rf"
## Use GNU grep, busybox grep is not as performant
@ -150,15 +150,22 @@ sed "s/^www\.//g" | \
sort -u > "top-1m-umbrella.txt"
## Parse the Tranco 1 Million
# Turn the Tranco archive into a sorted, de-duplicated list of lower-case
# domains written to top-1m-tranco.txt. This pipeline runs unconditionally,
# whether or not top-1m-tranco.zip is a valid archive.
# NOTE(review): piping unzip's stdout onwards presumably relies on an alias
# such as `unzip -p` defined elsewhere in the script -- confirm.
unzip "top-1m-tranco.zip" | \
dos2unix | \
tr "[:upper:]" "[:lower:]" | \
# Keep only the second comma-separated field (the domain)
cut -f 2 -d "," | \
# Keep only entries that contain a literal dot
grep -F "." | \
# Strip a leading "www." from each entry
sed "s/^www\.//g" | \
sort -u > "top-1m-tranco.txt"
# The Tranco download is unreliable, so only parse the archive when `file`
# identifies it as a zip. Otherwise leave a placeholder list in its place so
# the later merge step still finds top-1m-tranco.txt.
if ! file "top-1m-tranco.zip" | grep -q "Zip archive data"; then
  echo "top-1m-tranco.zip is not a zip, skipping it..."
  touch "top-1m-tranco.txt"
else
  # Pipeline stages, in order: normalise line endings, lower-case everything,
  # keep the domain field (2nd comma-separated column), keep entries that
  # contain a dot, strip a leading "www.", then sort and de-duplicate.
  unzip "top-1m-tranco.zip" |
    dos2unix |
    tr "[:upper:]" "[:lower:]" |
    cut -f 2 -d "," |
    grep -F "." |
    sed "s/^www\.//g" |
    sort -u > "top-1m-tranco.txt"
fi
# Merge Umbrella, Tranco, Radar and self-maintained top domains
cat "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" | \