fix: run dos2unix before text processing

rename urlhaus.txt in tmp/
This commit is contained in:
curben 2019-05-28 09:59:02 +09:30
parent 3c1384b95b
commit f9e1cb84ce
4 changed files with 6 additions and 6 deletions

View File

@ -4,7 +4,7 @@ set -e -x
## Parse domains from URLhaus excluding popular domains
cat URLhaus.txt | \
cat urlhaus.txt | \
# Exclude Umbrella Top 1M and well-known domains
# grep inverse match whole line
grep -Fx -vf urlhaus-top-domains.txt > malware-domains.txt

View File

@ -4,6 +4,6 @@ set -e -x
## Parse malware URLs from popular URLhaus domains
cat URLhaus.txt | \
cat urlhaus.txt | \
# Parse URLs from popular domains only
grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt

View File

@ -12,10 +12,10 @@ cp ../src/exclude.txt .
## Clean up URLhaus.txt
cat ../src/URLhaus.txt | \
# Remove comment
sed '/^#/ d' | \
# Convert DOS to Unix line ending
dos2unix | \
# Remove comment
sed '/^#/ d' | \
# Remove http(s)://
cut -f 3 -d '/' | \
# Remove www
@ -23,4 +23,4 @@ cut -f 3 -d '/' | \
# Not examplewww.com
sed 's/^www\.//g' | \
# Sort and remove duplicates
sort -u > URLhaus.txt
sort -u > urlhaus.txt

View File

@ -4,7 +4,7 @@ set -e -x
## Parse popular domains from URLhaus
cat URLhaus.txt | \
cat urlhaus.txt | \
# Keep only domains listed in Umbrella Top 1M and well-known domains
# grep match whole line
grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt