feat: remove phishtank source

Reason: frequent interference from Cloudflare CAPTCHA when fetching the PhishTank feed.
This commit is contained in:
MDLeom 2024-03-07 10:09:32 +00:00
parent 969e3cb94f
commit b3f6e90b9a
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
3 changed files with 5 additions and 50 deletions

View File

@ -37,18 +37,6 @@ check_grep() {
check_grep
## Detect Musl C library
LIBC="$(ldd /bin/ls | grep 'musl' || [ $? = 1 ])"
if [ -z "$LIBC" ]; then
rm "/tmp/musl.log"
# Not Musl
CSVQUOTE="../utils/csvquote-bin-glibc"
else
# Musl
CSVQUOTE="../utils/csvquote-bin-musl"
fi
## Fallback to busybox's dos2unix if installed
if ! command -v dos2unix &> /dev/null
then
@ -66,29 +54,12 @@ fi
mkdir -p "tmp/"
cd "tmp/"
USER_AGENT="phishtank/malware-filter"
if [ -n "$GITLAB_USER_LOGIN" ]; then
USER_AGENT="phishtank/$GITLAB_USER_LOGIN"
elif [ -n "$GITHUB_REPOSITORY_OWNER" ]; then
USER_AGENT="phishtank/$GITHUB_REPOSITORY_OWNER"
fi
## Prepare datasets
if [ -n "$PHISHTANK_API" ]; then
curl --user-agent "$USER_AGENT" \
"https://data.phishtank.com/data/$PHISHTANK_API/online-valid.csv.bz2" -o "phishtank.bz2"
else
curl --user-agent "$USER_AGENT" \
"https://data.phishtank.com/data/online-valid.csv.bz2" -o "phishtank.bz2"
fi
curl "https://openphish.com/feed.txt" -o "openphish-raw.txt"
curl "https://github.com/0xDanielLopez/TweetFeed/raw/master/week.csv" -o "phishunt.csv"
curl "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m-umbrella.zip"
curl "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m-tranco.zip"
bunzip2 -kc "phishtank.bz2" > "phishtank.csv"
## Cloudflare Radar
if [ -n "$CF_API" ]; then
mkdir -p "cf/"
@ -118,26 +89,14 @@ fi
## Parse URLs
cat "phishtank.csv" | \
cat "openphish-raw.txt" | \
dos2unix | \
tr "[:upper:]" "[:lower:]" | \
## Workaround for column with double quotes
"./$CSVQUOTE" | \
cut -f 2 -d "," | \
"./$CSVQUOTE" -u | \
sed 's/"//g' | \
cut -f 3- -d "/" | \
# Domain must have at least a 'dot'
grep -F "." | \
sed "s/^www\.//g" | \
# url encode space #11
sed "s/ /%20/g" > "phishtank.txt"
cat "openphish-raw.txt" | \
dos2unix | \
tr "[:upper:]" "[:lower:]" | \
cut -f 3- -d "/" | \
grep -F "." | \
sed "s/^www\.//g" | \
sed "s/ /%20/g" > "openphish.txt"
cat "phishunt.csv" | \
@ -149,11 +108,7 @@ grep -F "." | \
sed "s/^www\.//g" > "phishunt.txt"
## Combine all sources
cat "phishtank.txt" \
"openphish.txt" | \
# #43 #45
# "phishunt.txt" | \
sort -u > "phishing.txt"
sort -u "openphish.txt" > "phishing.txt"
## Parse domain and IP address only
cat "phishing.txt" | \
@ -276,7 +231,7 @@ SECOND_LINE="! Updated: $CURRENT_TIME"
THIRD_LINE="! Expires: 1 day (update frequency)"
FOURTH_LINE="! Homepage: https://gitlab.com/malware-filter/phishing-filter"
FIFTH_LINE="! License: https://gitlab.com/malware-filter/phishing-filter#license"
SIXTH_LINE="! Sources: phishtank.com, openphish.com, phishunt.io"
SIXTH_LINE="! Sources: openphish.com, phishunt.io"
COMMENT_UBO="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE"
mkdir -p "../public/"
@ -465,7 +420,7 @@ sed "2s/Domains Blocklist/Hosts Blocklist (IE)/" > "../public/phishing-filter.tp
## Clean up artifacts
rm "phishtank.csv" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "openphish-raw.txt" "phishunt.csv" "cf/" "top-1m-radar.txt"
rm "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "openphish-raw.txt" "phishunt.csv" "cf/" "top-1m-radar.txt"
cd ../

Binary file not shown.

Binary file not shown.