feat: include full URL for popular domains

This commit is contained in:
curben 2019-05-11 18:49:25 +09:30
parent 067cc0440a
commit 5beecca906
10 changed files with 123 additions and 60 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
tmp/*

View File

@ -1,4 +1,4 @@
image: alpine:latest # Use latest version of Alpine Linux docker image
image: alpine:latest # Use the latest version of Alpine Linux docker image
before_script:
# Install dependencies
@ -33,21 +33,10 @@ deploy:
# Change to the downloaded repo directory
- cd build/
# Give execute permission to scripts
- cd utils/
- chmod 700 umbrella-top-1m.sh script.sh commit.sh
# Download Umbrella Popularity List
- ./umbrella-top-1m.sh
# Download database dump and process it
- ./script.sh
# Commit the changes
- ./commit.sh
# Execute script.sh
- sh utils/script.sh
# Push the commit
- cd ../
- git push
only:

5
utils/commit.sh Executable file → Normal file
View File

@ -1,6 +1,9 @@
#!/bin/sh
# Commit the filter update
## Commit the filter update
## GitLab CI does not permit shell variables in .gitlab-ci.yml.
## This file is a workaround for that.
CURRENT_TIME="$(date -R -u)"
git commit -a -m "Filter updated: $CURRENT_TIME"

22
utils/malware-domains.sh Normal file
View File

@ -0,0 +1,22 @@
#!/bin/sh
## Parse domains from URLhaus, excluding popular domains.
## Inputs  (working dir): URLhaus.csv, top-1m-well-known.txt
## Output  (working dir): malware-domains.txt — one host (domain or IP) per line

# Convert DOS to Unix line endings; read the CSV directly (no useless cat)
dos2unix < URLhaus.csv | \
# Keep online URLs only
grep '"online"' | \
# Extract the URL column (field 6 of the "-quoted CSV) ...
cut -f 6 -d '"' | \
# ... then keep the host part only: strip scheme/path, then a :port suffix
cut -f 3 -d '/' | \
cut -f 1 -d ':' | \
# Drop a leading "www." prefix only
# (anchored at start of line, so examplewww.com is untouched)
sed -e 's/^www\.//g' | \
# Sort and remove duplicates
sort -u | \
# Exclude Umbrella Top 1M and well-known domains
# (-F fixed strings, -x whole-line match, -v invert)
grep -Fx -vf top-1m-well-known.txt > malware-domains.txt

View File

@ -0,0 +1,21 @@
#!/bin/sh
## Parse malware URLs hosted on popular URLhaus domains.
## Inputs  (working dir): URLhaus.csv, urlhaus-top-domains.txt
## Output  (working dir): malware-url-top-domains.txt

# Convert DOS to Unix line endings; read the CSV directly (no useless cat)
dos2unix < URLhaus.csv | \
# Keep online URLs only
grep '"online"' | \
# Extract the URL column (field 6 of the "-quoted CSV) ...
cut -f 6 -d '"' | \
# ... then strip the scheme, keeping host, optional port, and path.
# (The original also piped through `cut -f 1- -d ':'`, which selects
# every field and is therefore a no-op; removed.)
cut -f 3- -d '/' | \
# Drop a leading "www." prefix only
# (anchored at start of line, so examplewww.com is untouched)
sed -e 's/^www\.//g' | \
# Sort and remove duplicates
sort -u | \
# Keep only URLs containing a popular domain
# (-F fixed strings; no -x, so this is a per-line substring match)
grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt

10
utils/prerequisites.sh Normal file
View File

@ -0,0 +1,10 @@
#!/bin/sh
## Fetch the datasets required by the other utils scripts.
## NOTE(review): relative paths (../src/) assume this runs from a working
## directory one level below the repo root — confirm against the caller.
# Download URLhaus database dump (CSV) into the source tree
wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv
# Download Cisco Umbrella 1 Million popularity list (zip archive)
wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip
# Copy inputs into the current working directory for the pipeline scripts
cp ../src/URLhaus.csv .
cp ../src/exclude.txt .

47
utils/script.sh Executable file → Normal file
View File

@ -1,40 +1,15 @@
#!/bin/sh
# Download the URLhaus database dump and process it to be uBO-compatible
mkdir tmp/
cd tmp/
CURRENT_TIME="$(date -R -u)"
FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist"
SECOND_LINE="! Updated: $CURRENT_TIME"
THIRD_LINE="! Expires: 1 day (update frequency)"
FOURTH_LINE="! Repo: https://gitlab.com/curben/urlhaus-filter"
FIFTH_LINE="! License: https://creativecommons.org/publicdomain/zero/1.0/"
SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/"
COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE"
sh ../utils/prerequisites.sh
sh ../utils/umbrella-top-1m.sh
sh ../utils/malware-domains.sh
sh ../utils/urlhaus-top-domains.sh
sh ../utils/malware-url-top-domains.sh
sh ../utils/urlhaus-filter.sh
sh ../utils/commit.sh
# Download the database dump
wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv
cat ../src/URLhaus.csv | \
# Convert DOS to Unix line ending
dos2unix | \
# Parse online URLs only
grep '"online"' | \
# Parse domains and IP address only
cut -f 6 -d '"' | \
cut -f 3 -d '/' | \
cut -f 1 -d ':' | \
# Remove www
# Only matches domains that start with www
# Not examplewww.com
sed -e 's/^www\.//g' | \
# Sort and remove duplicates
sort -u | \
# Exclude Umbrella Top 1M. grep inverse match whole line
grep -Fx -vf ../src/top-1m.txt | \
# Exclude false positive
grep -Fx -vf ../src/exclude.txt | \
# Append header comment to the filter list
sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt
# Remove downloaded dataset
rm ../src/top-1m.txt
cd ../
rm -r tmp/

18
utils/umbrella-top-1m.sh Executable file → Normal file
View File

@ -1,11 +1,8 @@
#!/bin/sh
# Download the Cisco Umbrella 1 Million
# More info:
# https://s3-us-west-1.amazonaws.com/umbrella-static/index.html
# Download the list
wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip
## Parse the Cisco Umbrella 1 Million
## More info:
## https://s3-us-west-1.amazonaws.com/umbrella-static/index.html
# Decompress the zip and write output to stdout
unzip -p top-1m.csv.zip | \
@ -13,12 +10,15 @@ unzip -p top-1m.csv.zip | \
dos2unix | \
# Parse domains only
cut -f 2 -d ',' | \
# Domain must have at least a 'dot'
grep -F '.' | \
# Remove www
# Only matches domains that start with www
# Not examplewww.com
sed -e 's/^www\.//g' | \
# Remove duplicates
sort -u > ../src/top-1m.txt
sort -u > top-1m.txt
# Remove downloaded zip file
rm top-1m.csv.zip
# Merge Umbrella and self-maintained top domains
cat top-1m.txt exclude.txt | \
sort -u > top-1m-well-known.txt

20
utils/urlhaus-filter.sh Normal file
View File

@ -0,0 +1,20 @@
#!/bin/sh
## Merge malware-domains.txt and malware-url-top-domains.txt,
## then prepend a metadata header so uBO re-fetches the filter daily.
# RFC 2822 timestamp in UTC (date -R -u)
CURRENT_TIME="$(date -R -u)"
FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist"
SECOND_LINE="! Updated: $CURRENT_TIME"
THIRD_LINE="! Expires: 1 day (update frequency)"
FOURTH_LINE="! Repo: https://gitlab.com/curben/urlhaus-filter"
FIFTH_LINE="! License: https://creativecommons.org/publicdomain/zero/1.0/"
SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/"
# The literal "\n" separators are turned into real newlines by the
# sed "1 i\" insert below (GNU sed behavior)
COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE"
cat malware-domains.txt malware-url-top-domains.txt | \
# Sort alphabetically
sort | \
# Insert the header comment before line 1 of the filter list
sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt

View File

@ -0,0 +1,22 @@
#!/bin/sh
## Parse popular domains from URLhaus — i.e. URLhaus hosts that also
## appear in the combined Umbrella/well-known list.
## Inputs  (working dir): URLhaus.csv, top-1m-well-known.txt
## Output  (working dir): urlhaus-top-domains.txt
cat URLhaus.csv | \
# Convert DOS to Unix line ending
dos2unix | \
# Parse online URLs only
grep '"online"' | \
# Parse domains and IP address only:
# URL column (field 6), then host, then strip any :port suffix
cut -f 6 -d '"' | \
cut -f 3 -d '/' | \
cut -f 1 -d ':' | \
# Remove www
# Only matches domains that start with www
# Not examplewww.com
sed -e 's/^www\.//g' | \
# Sort and remove duplicates
sort -u | \
# Keep only domains present in the Umbrella Top 1M / well-known list
# (-F fixed strings, -x whole-line match; note there is NO -v here,
# so this selects matches — it does not exclude them)
grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt