fix: skip tranco if download fails

This commit is contained in:
MDLeom 2025-03-08 01:23:59 +00:00
parent f70b9f20b2
commit 70bf08a46d
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
3 changed files with 18 additions and 12 deletions

View File

@ -14,7 +14,7 @@ jobs:
- name: Install Dependencies
run: |
apk update
apk add brotli curl git grep jq zstd
apk add brotli curl file grep jq zstd
- name: Build
env:
CF_API: ${{ secrets.CF_API }}

View File

@ -12,7 +12,7 @@ build_job:
stage: build
before_script:
- apk update && apk add brotli curl grep jq zstd
- apk update && apk add brotli curl file grep jq zstd
script:
- sh src/script.sh

View File

@ -72,7 +72,7 @@ cd "tmp/"
## Prepare datasets
curl "https://urlhaus.abuse.ch/downloads/csv/" -o "urlhaus.zip"
curl "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m-umbrella.zip"
curl "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m-tranco.zip"
curl "https://tranco-list.eu/download/daily/top-1m.csv.zip" -o "top-1m-tranco.zip"
## Cloudflare Radar
if [ -n "$CF_API" ]; then
@ -161,15 +161,21 @@ sed "s/^www\.//g" | \
sort -u > "top-1m-umbrella.txt"
## Parse the Tranco 1 Million
unzip "top-1m-tranco.zip" | \
dos2unix | \
tr "[:upper:]" "[:lower:]" | \
# Parse domains only
cut -f 2 -d "," | \
grep -F "." | \
# Remove www.
sed "s/^www\.//g" | \
sort -u > "top-1m-tranco.txt"
# Parse the Tranco list only when the download really is a zip archive;
# otherwise create an empty list so the merge step that reads
# "top-1m-tranco.txt" still finds the file.
if [ -n "$(file 'top-1m-tranco.zip' | grep 'Zip archive data')" ]; then
  # -p writes the archived CSV to stdout; without it unzip extracts to disk
  # and only its progress messages would reach the pipeline.
  unzip -p "top-1m-tranco.zip" | \
  dos2unix | \
  tr "[:upper:]" "[:lower:]" | \
  # Parse domains only (second CSV column)
  cut -f 2 -d "," | \
  grep -F "." | \
  # Remove www.
  sed "s/^www\.//g" | \
  sort -u > "top-1m-tranco.txt"
else
  # The Tranco download is unreliable and may not return a zip archive
  echo "top-1m-tranco.zip is not a zip, skipping it..."
  touch "top-1m-tranco.txt"
fi
# Merge Umbrella and self-maintained top domains
cat "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" | \