feat: remove phishtank source
Reason: frequent interference from Cloudflare CAPTCHA.
This commit is contained in:
		
							parent
							
								
									969e3cb94f
								
							
						
					
					
						commit
						b3f6e90b9a
					
				|  | @ -37,18 +37,6 @@ check_grep() { | |||
| check_grep | ||||
| 
 | ||||
| 
 | ||||
| ## Detect Musl C library | ||||
| LIBC="$(ldd /bin/ls | grep 'musl' || [ $? = 1 ])" | ||||
| if [ -z "$LIBC" ]; then | ||||
|   rm "/tmp/musl.log" | ||||
|   # Not Musl | ||||
|   CSVQUOTE="../utils/csvquote-bin-glibc" | ||||
| else | ||||
|   # Musl | ||||
|   CSVQUOTE="../utils/csvquote-bin-musl" | ||||
| fi | ||||
| 
 | ||||
| 
 | ||||
| ## Fallback to busybox's dos2unix if installed | ||||
| if ! command -v dos2unix &> /dev/null | ||||
| then | ||||
|  | @ -66,29 +54,12 @@ fi | |||
| mkdir -p "tmp/" | ||||
| cd "tmp/" | ||||
| 
 | ||||
| USER_AGENT="phishtank/malware-filter" | ||||
| if [ -n "$GITLAB_USER_LOGIN" ]; then | ||||
|   USER_AGENT="phishtank/$GITLAB_USER_LOGIN" | ||||
| elif [ -n "$GITHUB_REPOSITORY_OWNER" ]; then | ||||
|   USER_AGENT="phishtank/$GITHUB_REPOSITORY_OWNER" | ||||
| fi | ||||
| 
 | ||||
| ## Prepare datasets | ||||
| if [ -n "$PHISHTANK_API" ]; then | ||||
|   curl --user-agent "$USER_AGENT" \ | ||||
|   "https://data.phishtank.com/data/$PHISHTANK_API/online-valid.csv.bz2" -o "phishtank.bz2" | ||||
| else | ||||
|   curl --user-agent "$USER_AGENT" \ | ||||
|   "https://data.phishtank.com/data/online-valid.csv.bz2" -o "phishtank.bz2" | ||||
| fi | ||||
| 
 | ||||
| curl "https://openphish.com/feed.txt" -o "openphish-raw.txt" | ||||
| curl "https://github.com/0xDanielLopez/TweetFeed/raw/master/week.csv" -o "phishunt.csv" | ||||
| curl "https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip" -o "top-1m-umbrella.zip" | ||||
| curl "https://tranco-list.eu/top-1m.csv.zip" -o "top-1m-tranco.zip" | ||||
| 
 | ||||
| bunzip2 -kc "phishtank.bz2" > "phishtank.csv" | ||||
| 
 | ||||
| ## Cloudflare Radar | ||||
| if [ -n "$CF_API" ]; then | ||||
|   mkdir -p "cf/" | ||||
|  | @ -118,26 +89,14 @@ fi | |||
| 
 | ||||
| 
 | ||||
| ## Parse URLs | ||||
| cat "phishtank.csv" | \ | ||||
| cat "openphish-raw.txt" | \ | ||||
| dos2unix | \ | ||||
| tr "[:upper:]" "[:lower:]" | \ | ||||
| ## Workaround for column with double quotes | ||||
| "./$CSVQUOTE" | \ | ||||
| cut -f 2 -d "," | \ | ||||
| "./$CSVQUOTE" -u | \ | ||||
| sed 's/"//g' | \ | ||||
| cut -f 3- -d "/" | \ | ||||
| # Domain must have at least a 'dot' | ||||
| grep -F "." | \ | ||||
| sed "s/^www\.//g" | \ | ||||
| # url encode space #11 | ||||
| sed "s/ /%20/g" > "phishtank.txt" | ||||
| 
 | ||||
| cat "openphish-raw.txt" | \ | ||||
| dos2unix | \ | ||||
| tr "[:upper:]" "[:lower:]" | \ | ||||
| cut -f 3- -d "/" | \ | ||||
| grep -F "." | \ | ||||
| sed "s/^www\.//g" | \ | ||||
| sed "s/ /%20/g" > "openphish.txt" | ||||
| 
 | ||||
| cat "phishunt.csv" | \ | ||||
|  | @ -149,11 +108,7 @@ grep -F "." | \ | |||
| sed "s/^www\.//g" > "phishunt.txt" | ||||
| 
 | ||||
| ## Combine all sources | ||||
| cat "phishtank.txt" \ | ||||
|   "openphish.txt" | \ | ||||
|   # #43 #45 | ||||
|   # "phishunt.txt" | \ | ||||
| sort -u > "phishing.txt" | ||||
| sort -u "openphish.txt" > "phishing.txt" | ||||
| 
 | ||||
| ## Parse domain and IP address only | ||||
| cat "phishing.txt" | \ | ||||
|  | @ -276,7 +231,7 @@ SECOND_LINE="! Updated: $CURRENT_TIME" | |||
| THIRD_LINE="! Expires: 1 day (update frequency)" | ||||
| FOURTH_LINE="! Homepage: https://gitlab.com/malware-filter/phishing-filter" | ||||
| FIFTH_LINE="! License: https://gitlab.com/malware-filter/phishing-filter#license" | ||||
| SIXTH_LINE="! Sources: phishtank.com, openphish.com, phishunt.io" | ||||
| SIXTH_LINE="! Sources: openphish.com, phishunt.io" | ||||
| COMMENT_UBO="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE" | ||||
| 
 | ||||
| mkdir -p "../public/" | ||||
|  | @ -465,7 +420,7 @@ sed "2s/Domains Blocklist/Hosts Blocklist (IE)/" > "../public/phishing-filter.tp | |||
| 
 | ||||
| 
 | ||||
| ## Clean up artifacts | ||||
| rm "phishtank.csv" "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "openphish-raw.txt" "phishunt.csv" "cf/" "top-1m-radar.txt" | ||||
| rm "top-1m-umbrella.zip" "top-1m-umbrella.txt" "top-1m-tranco.txt" "openphish-raw.txt" "phishunt.csv" "cf/" "top-1m-radar.txt" | ||||
| 
 | ||||
| 
 | ||||
| cd ../ | ||||
|  |  | |||
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							
		Loading…
	
		Reference in New Issue