Merge branch 'perf' into 'master'

perf: grep using smaller file

See merge request curben/urlhaus-filter!3
This commit is contained in:
curben 2019-05-12 03:19:38 +00:00
commit e258ffff3e
5 changed files with 6 additions and 7 deletions

View File

@@ -42,12 +42,12 @@ deploy:
# Process the Umbrella Top 1M
- sh ../utils/umbrella-top-1m.sh
# Parse domains from URLhaus excluding popular domains
- sh ../utils/malware-domains.sh
# Parse popular domains that also appear in URLhaus
- sh ../utils/urlhaus-top-domains.sh
# Parse domains from URLhaus excluding popular domains
- sh ../utils/malware-domains.sh
# Parse malware URLs from popular domains
- sh ../utils/malware-url-top-domains.sh

View File

@@ -19,4 +19,4 @@ sed -e 's/^www\.//g' | \
sort -u | \
# Exclude Umbrella Top 1M and well-known domains
# grep inverse match whole line
grep -Fx -vf top-1m-well-known.txt > malware-domains.txt
grep -Fx -vf urlhaus-top-domains.txt > malware-domains.txt

View File

@@ -17,5 +17,5 @@ cut -f 1- -d ':' | \
sed -e 's/^www\.//g' | \
# Sort and remove duplicates
sort -u | \
# Include URLs from popular domains
# Parse URLs from popular domains only
grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt

View File

@@ -3,7 +3,6 @@
## Merge malware-domains.txt malware-url-top-domains.txt,
## and append a header to instruct uBO to grab the filter daily.
CURRENT_TIME="$(date -R -u)"
FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist"
SECOND_LINE="! Updated: $CURRENT_TIME"

View File

@@ -18,5 +18,5 @@ sed -e 's/^www\.//g' | \
# Sort and remove duplicates
sort -u | \
# Exclude Umbrella Top 1M and well-known domains
# grep inverse match whole line
# grep match whole line
grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt