diff --git a/src/script.sh b/src/script.sh
index bf46bc0d..50dbabe2 100644
--- a/src/script.sh
+++ b/src/script.sh
@@ -12,7 +12,9 @@ if [ -n "$BASH_VERSION" ]; then
   shopt -s expand_aliases
 fi
 
-alias curl="curl -L"
+# NOTE(review): -i makes curl emit the HTTP response headers along with the
+# body; confirm no download in this script saves that output as file content.
+alias curl="curl -iL"
 alias rm="rm -rf"
 
 ## Use GNU grep, busybox grep is not as performant
@@ -150,15 +152,22 @@ sed "s/^www\.//g" | \
 sort -u > "top-1m-umbrella.txt"
 
 ## Parse the Tranco 1 Million
-unzip "top-1m-tranco.zip" | \
-dos2unix | \
-tr "[:upper:]" "[:lower:]" | \
-# Parse domains only
-cut -f 2 -d "," | \
-grep -F "." | \
-# Remove www.
-sed "s/^www\.//g" | \
-sort -u > "top-1m-tranco.txt"
+# The Tranco download is unreliable; only parse it when the file really is a zip
+if file "top-1m-tranco.zip" | grep -q "Zip archive data"; then
+  unzip "top-1m-tranco.zip" | \
+  dos2unix | \
+  tr "[:upper:]" "[:lower:]" | \
+  # Parse domains only
+  cut -f 2 -d "," | \
+  grep -F "." | \
+  # Remove www.
+  sed "s/^www\.//g" | \
+  sort -u > "top-1m-tranco.txt"
+else
+  # Make sure the output exists so the merge step below can still read it
+  echo "top-1m-tranco.zip is not a zip, skipping it..."
+  touch "top-1m-tranco.txt"
+fi
 
 # Merge Umbrella, Tranco, Radar and self-maintained top domains
 cat "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" | \