fix: skip phishtank if download fails

This commit is contained in:
MDLeom 2025-03-16 07:37:17 +00:00
parent 56d67d2a41
commit 993bb958f5
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
1 changed file with 23 additions and 16 deletions

View File

@ -103,7 +103,6 @@ curl "https://lists.ipthreat.net/file/ipthreat-lists/phishing/phishing-threat-0.
# Fetch the Umbrella top-1m list (zipped CSV) and save it as "top-1m-umbrella.zip".
curl "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m-umbrella.zip"
# Fetch the daily Tranco top-1m list (zipped CSV) and save it as "top-1m-tranco.zip".
curl "https://tranco-list.eu/download/daily/top-1m.csv.zip" -o "top-1m-tranco.zip"
# Decompress "phishtank.bz2" into "phishtank.csv":
# -k keeps the archive on disk, -c writes the decompressed data to stdout.
# NOTE(review): this runs unconditionally, so a download that is not a valid
# bzip2 archive makes bunzip2 fail here.
bunzip2 -kc "phishtank.bz2" > "phishtank.csv"
## Cloudflare Radar
if [ -n "$CF_API" ]; then
@ -134,19 +133,27 @@ fi
## Parse URLs
# Turn "phishtank.csv" into "phishtank.txt": one lowercased entry per line,
# taken from the second comma-separated column of the CSV.
cat "phishtank.csv" | \
tr "[:upper:]" "[:lower:]" | \
## Workaround for column with double quotes
# The helper at ./$CSVQUOTE is run before the column split and again with -u
# afterwards (presumably to protect, then restore, delimiters inside quoted
# fields; confirm against the helper's documentation).
"./$CSVQUOTE" | \
# Keep only the 2nd comma-separated field
cut -f 2 -d "," | \
"./$CSVQUOTE" -u | \
# Drop every double-quote character
sed 's/"//g' | \
# Keep everything from the 3rd '/'-separated field on, i.e. drop a leading
# "scheme://" from a URL
cut -f 3- -d "/" | \
# Domain must have at least a 'dot'
grep -F "." | \
# Strip a leading "www."
sed "s/^www\.//g" | \
# url encode space #11
sed "s/ /%20/g" > "phishtank.txt"
# PhishTank feed: parse the download only when `file` identifies it as a bzip2
# archive. Otherwise just make sure "phishtank.txt" exists and move on.
if [ -z "$(file 'phishtank.bz2' | grep 'bzip2 compressed data')" ]; then
  # cloudflare may impose captcha, so the download is not always the real dump
  echo "phishtank.bz2 is not a bzip2, skipping it..."
  touch "phishtank.txt"
else
  # -k keeps the archive, -c streams the decompressed CSV to stdout
  bunzip2 -kc "phishtank.bz2" > "phishtank.csv"

  # Pipeline stages, in order:
  #   1. lowercase everything
  #   2. run ./$CSVQUOTE (workaround for column with double quotes)
  #   3. keep the 2nd comma-separated column
  #   4. run ./$CSVQUOTE -u
  #   5. drop all double quotes
  #   6. keep from the 3rd '/'-separated field on (drops a "scheme://" prefix)
  #   7. keep only entries containing a dot (a domain needs at least one)
  #   8. strip a leading "www."
  #   9. url-encode spaces (#11)
  tr "[:upper:]" "[:lower:]" < "phishtank.csv" \
    | "./$CSVQUOTE" \
    | cut -f 2 -d "," \
    | "./$CSVQUOTE" -u \
    | sed 's/"//g' \
    | cut -f 3- -d "/" \
    | grep -F "." \
    | sed "s/^www\.//g" \
    | sed "s/ /%20/g" > "phishtank.txt"
fi
cat "openphish-raw.txt" | \
dos2unix | \
@ -167,7 +174,7 @@ sed "s/^www\.//g" | \
sed "s/ /%20/g" > "ipthreat.txt"
## Combine all sources
cat "openphish.txt" "ipthreat.txt" "phishtank" | \
cat "openphish.txt" "ipthreat.txt" "phishtank.txt" | \
sort -u > "phishing.txt"
## Parse domain and IP address only
@ -206,7 +213,7 @@ if [ -n "$(file 'top-1m-tranco.zip' | grep 'Zip archive data')" ]; then
sed "s/^www\.//g" | \
sort -u > "top-1m-tranco.txt"
else
# tranco has unreliable download
# cloudflare may impose captcha
echo "top-1m-tranco.zip is not a zip, skipping it..."
touch "top-1m-tranco.txt"
fi