fix: use simple URL list

we no longer care about the status of the URL
bb817d9838
curben 2019-05-27 15:59:08 +09:30
parent 196c66f2a1
commit 9a5fdb2be6
6 changed files with 194766 additions and 194679 deletions

File diff suppressed because it is too large

src/URLhaus.txt  Normal file  (194747 additions)

File diff suppressed because it is too large


@@ -4,21 +4,7 @@ set -e -x
 ## Parse domains from URLhaus excluding popular domains
-cat URLhaus.csv | \
-# Convert DOS to Unix line ending
-dos2unix | \
-# Parse online URLs only
-#grep '"online"' | \
-# Parse domains and IP address only
-cut -f 6 -d '"' | \
-cut -f 3 -d '/' | \
-cut -f 1 -d ':' | \
-# Remove www
-# Only matches domains that start with www
-# Not examplewww.com
-sed 's/^www\.//g' | \
-# Sort and remove duplicates
-sort -u | \
+cat URLhaus.txt | \
 # Exclude Umbrella Top 1M and well-known domains
 # grep inverse match whole line
 grep -Fx -vf urlhaus-top-domains.txt > malware-domains.txt
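
For reference, grep -Fx -vf drops every input line that exactly matches a line of the pattern file (-F fixed strings, -x whole line, -v invert). A minimal sketch with made-up domains and the assumed exclusion file:

$ cat urlhaus-top-domains.txt
example.com
$ printf 'badsite.xyz\nexample.com\nexample.com.evil.xyz\n' | grep -Fx -vf urlhaus-top-domains.txt
badsite.xyz
example.com.evil.xyz

Because of -x, example.com.evil.xyz is kept: it only contains a popular domain as a substring, it is not one.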


@@ -4,20 +4,6 @@ set -e -x
 ## Parse malware URLs from popular URLhaus domains
-cat URLhaus.csv | \
-# Convert DOS to Unix line ending
-dos2unix | \
-# Parse online URLs only
-#grep '"online"' | \
-# Parse URLs
-cut -f 6 -d '"' | \
-cut -f 3- -d '/' | \
-cut -f 1- -d ':' | \
-# Remove www
-# Only matches domains that start with www
-# Not examplewww.com
-sed 's/^www\.//g' | \
-# Sort and remove duplicates
-sort -u | \
+cat URLhaus.txt | \
 # Parse URLs from popular domains only
 grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt
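
Without -x, grep -F -f matches any line that merely contains one of the patterns as a substring, which is what keeps entries hosted on popular domains. A minimal sketch with made-up entries and the assumed pattern file:

$ cat urlhaus-top-domains.txt
example.com
$ printf 'example.com/malware.exe\nbadsite.xyz/payload.bin\n' | grep -F -f urlhaus-top-domains.txt
example.com/malware.exe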


@@ -3,10 +3,24 @@
 set -e -x
 # Download URLhaus database
 wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv
+wget https://urlhaus.abuse.ch/downloads/text/ -O ../src/URLhaus.txt
 # Download Cisco Umbrella 1 Million
 wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip
 cp ../src/URLhaus.csv .
 cp ../src/exclude.txt .
+## Clean up URLhaus.txt
+cat ../src/URLhaus.txt | \
+# Remove comment
+sed '/^#/ d' | \
+# Convert DOS to Unix line ending
+dos2unix | \
+# Remove http(s)://
+cut -f 3 -d '/' | \
+# Remove www
+# Only matches domains that start with www
+# Not examplewww.com
+sed 's/^www\.//g' | \
+# Sort and remove duplicates
+sort -u > URLhaus.txt
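
The new clean-up step reduces the plain-text URL list (comment header lines starting with #, one URL per line) to bare, deduplicated hostnames. A rough sketch of the transformation, using made-up URLs rather than real URLhaus data:

$ cat sample.txt
# URLhaus plain-text URL list (illustrative header comment)
https://www.example.com/malware.exe
http://198.51.100.7/files/payload.bin
$ cat sample.txt | sed '/^#/ d' | dos2unix | cut -f 3 -d '/' | sed 's/^www\.//g' | sort -u
198.51.100.7
example.com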


@@ -4,21 +4,7 @@ set -e -x
 ## Parse popular domains from URLhaus
-cat URLhaus.csv | \
-# Convert DOS to Unix line ending
-dos2unix | \
-# Parse online URLs only
-#grep '"online"' | \
-# Parse domains and IP address only
-cut -f 6 -d '"' | \
-cut -f 3 -d '/' | \
-cut -f 1 -d ':' | \
-# Remove www
-# Only matches domains that start with www
-# Not examplewww.com
-sed 's/^www\.//g' | \
-# Sort and remove duplicates
-sort -u | \
+cat URLhaus.txt | \
 # Exclude Umbrella Top 1M and well-known domains
 # grep match whole line
 grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt
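
Here grep -Fx -f (without -v) keeps only the cleaned URLhaus hostnames that also appear, as whole lines, in top-1m-well-known.txt, i.e. the intersection of the two lists. A minimal sketch with made-up domains and an assumed pattern file:

$ cat top-1m-well-known.txt
example.com
example.org
$ printf 'badsite.xyz\nexample.com\n' | grep -Fx -f top-1m-well-known.txt
example.com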