# Patch summary. The line structure of this copy has been collapsed; the hunks below are kept verbatim.
# - .github/workflows/pages.yml: the `apk add` package list replaces `git` with `file`.
#   NOTE(review): `git` is removed here, not merely supplemented -- confirm the Pages job no longer needs it.
# - .gitlab-ci.yml: adds `file` to the `apk add` list in `before_script`.
# - src/script.sh: fetches the Tranco list from /download/daily/top-1m.csv.zip instead of /top-1m.csv.zip.
# - src/script.sh: parses top-1m-tranco.zip only when `file` reports "Zip archive data"; otherwise it
#   prints a skip message and touches top-1m-tranco.txt, which the following `cat` merge reads.
# NOTE(review): `unzip` is shown without `-p` in both the removed and the added lines while its output
#   is piped onward; verify the pipeline really receives the archive contents -- TODO confirm.
diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml index 9b672857..d00653e4 100644 --- a/.github/workflows/pages.yml +++ b/.github/workflows/pages.yml @@ -14,7 +14,7 @@ jobs: - name: Install Dependencies run: | apk update - apk add brotli curl git grep jq zstd + apk add brotli curl file grep jq zstd - name: Build env: CF_API: ${{ secrets.CF_API }} diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7e4c5de4..f8a16d98 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,7 +12,7 @@ build_job: stage: build before_script: - - apk update && apk add brotli curl grep jq zstd + - apk update && apk add brotli curl file grep jq zstd script: - sh src/script.sh diff --git a/src/script.sh b/src/script.sh index 80fd9526..b7d718c2 100644 --- a/src/script.sh +++ b/src/script.sh @@ -72,7 +72,7 @@ cd "tmp/" ## Prepare datasets curl "https://urlhaus.abuse.ch/downloads/csv/" -o "urlhaus.zip" curl "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m-umbrella.zip" -curl "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m-tranco.zip" +curl "https://tranco-list.eu/download/daily/top-1m.csv.zip" -o "top-1m-tranco.zip" ## Cloudflare Radar if [ -n "$CF_API" ]; then @@ -161,15 +161,21 @@ sed "s/^www\.//g" | \ sort -u > "top-1m-umbrella.txt" ## Parse the Tranco 1 Million -unzip "top-1m-tranco.zip" | \ -dos2unix | \ -tr "[:upper:]" "[:lower:]" | \ -# Parse domains only -cut -f 2 -d "," | \ -grep -F "." | \ -# Remove www. -sed "s/^www\.//g" | \ -sort -u > "top-1m-tranco.txt" +if [ -n "$(file 'top-1m-tranco.zip' | grep 'Zip archive data')" ]; then + unzip "top-1m-tranco.zip" | \ + dos2unix | \ + tr "[:upper:]" "[:lower:]" | \ + # Parse domains only + cut -f 2 -d "," | \ + grep -F "." | \ + # Remove www. + sed "s/^www\.//g" | \ + sort -u > "top-1m-tranco.txt" +else + # tranco has unreliable download + echo "top-1m-tranco.zip is not a zip, skipping it..." 
+ touch "top-1m-tranco.txt" +fi # Merge Umbrella and self-maintained top domains cat "top-1m-umbrella.txt" "top-1m-tranco.txt" "exclude.txt" | \