diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..1fb9ef57 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +tmp/* diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 14796915..6f1ad485 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -image: alpine:latest # Use latest version of Alpine Linux docker image +image: alpine:latest # Use the latest version of Alpine Linux docker image before_script: # Install dependencies @@ -33,21 +33,10 @@ deploy: # Change to the downloaded repo directory - cd build/ - # Give execute permission to scripts - - cd utils/ - - chmod 700 umbrella-top-1m.sh script.sh commit.sh - - # Download Umbrella Popularity List - - ./umbrella-top-1m.sh - - # Download database dump and process it - - ./script.sh - - # Commit the changes - - ./commit.sh + # Execute script.sh + - sh utils/script.sh # Push the commit - - cd ../ - git push only: diff --git a/utils/commit.sh b/utils/commit.sh old mode 100755 new mode 100644 index 7fe7a801..033936a5 --- a/utils/commit.sh +++ b/utils/commit.sh @@ -1,6 +1,9 @@ #!/bin/sh -# Commit the filter update +## Commit the filter update + +## GitLab CI does not permit shell variable in .gitlab-ci.yml. +## This file is a workaround for that. 
CURRENT_TIME="$(date -R -u)" git commit -a -m "Filter updated: $CURRENT_TIME" diff --git a/utils/malware-domains.sh b/utils/malware-domains.sh new file mode 100644 index 00000000..d4632fc8 --- /dev/null +++ b/utils/malware-domains.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +## Parse domains from URLhaus excluding popular domains + +cat URLhaus.csv | \ +# Convert DOS to Unix line ending +dos2unix | \ +# Parse online URLs only +grep '"online"' | \ +# Parse domains and IP address only +cut -f 6 -d '"' | \ +cut -f 3 -d '/' | \ +cut -f 1 -d ':' | \ +# Remove www +# Only matches domains that start with www +# Not examplewww.com +sed -e 's/^www\.//g' | \ +# Sort and remove duplicates +sort -u | \ +# Exclude Umbrella Top 1M and well-known domains +# grep inverse match whole line +grep -Fx -vf top-1m-well-known.txt > malware-domains.txt diff --git a/utils/malware-url-top-domains.sh b/utils/malware-url-top-domains.sh new file mode 100644 index 00000000..445f55e1 --- /dev/null +++ b/utils/malware-url-top-domains.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +## Parse malware URLs from popular URLhaus domains + +cat URLhaus.csv | \ +# Convert DOS to Unix line ending +dos2unix | \ +# Parse online URLs only +grep '"online"' | \ +# Parse URLs +cut -f 6 -d '"' | \ +cut -f 3- -d '/' | \ +cut -f 1- -d ':' | \ +# Remove www +# Only matches domains that start with www +# Not examplewww.com +sed -e 's/^www\.//g' | \ +# Sort and remove duplicates +sort -u | \ +# Include URLs from popular domains +grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt diff --git a/utils/prerequisites.sh b/utils/prerequisites.sh new file mode 100644 index 00000000..dca0f2c0 --- /dev/null +++ b/utils/prerequisites.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +# Download URLhaus database +wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv + +# Download Cisco Umbrella 1 Million +wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip + +cp ../src/URLhaus.csv . +cp ../src/exclude.txt . 
diff --git a/utils/script.sh b/utils/script.sh old mode 100755 new mode 100644 index 2ebe574e..d571d5ab --- a/utils/script.sh +++ b/utils/script.sh @@ -1,40 +1,15 @@ #!/bin/sh -# Download the URLhaus database dump and process it to be uBO-compatible +mkdir tmp/ +cd tmp/ -CURRENT_TIME="$(date -R -u)" -FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist" -SECOND_LINE="! Updated: $CURRENT_TIME" -THIRD_LINE="! Expires: 1 day (update frequency)" -FOURTH_LINE="! Repo: https://gitlab.com/curben/urlhaus-filter" -FIFTH_LINE="! License: https://creativecommons.org/publicdomain/zero/1.0/" -SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/" -COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE" +sh ../utils/prerequisites.sh +sh ../utils/umbrella-top-1m.sh +sh ../utils/malware-domains.sh +sh ../utils/urlhaus-top-domains.sh +sh ../utils/malware-url-top-domains.sh +sh ../utils/urlhaus-filter.sh +sh ../utils/commit.sh -# Download the database dump -wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv - -cat ../src/URLhaus.csv | \ -# Convert DOS to Unix line ending -dos2unix | \ -# Parse online URLs only -grep '"online"' | \ -# Parse domains and IP address only -cut -f 6 -d '"' | \ -cut -f 3 -d '/' | \ -cut -f 1 -d ':' | \ -# Remove www -# Only matches domains that start with www -# Not examplewww.com -sed -e 's/^www\.//g' | \ -# Sort and remove duplicates -sort -u | \ -# Exclude Umbrella Top 1M. 
grep inverse match whole line -grep -Fx -vf ../src/top-1m.txt | \ -# Exclude false positive -grep -Fx -vf ../src/exclude.txt | \ -# Append header comment to the filter list -sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt - -# Remove downloaded dataset -rm ../src/top-1m.txt +cd ../ +rm -r tmp/ diff --git a/utils/umbrella-top-1m.sh b/utils/umbrella-top-1m.sh old mode 100755 new mode 100644 index 59b8f598..751f61b4 --- a/utils/umbrella-top-1m.sh +++ b/utils/umbrella-top-1m.sh @@ -1,11 +1,8 @@ #!/bin/sh -# Download the Cisco Umbrella 1 Million -# More info: -# https://s3-us-west-1.amazonaws.com/umbrella-static/index.html - -# Download the list -wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip +## Parse the Cisco Umbrella 1 Million +## More info: +## https://s3-us-west-1.amazonaws.com/umbrella-static/index.html # Decompress the zip and write output to stdout unzip -p top-1m.csv.zip | \ @@ -13,12 +10,15 @@ unzip -p top-1m.csv.zip | \ dos2unix | \ # Parse domains only cut -f 2 -d ',' | \ +# Domain must have at least a 'dot' +grep -F '.' | \ # Remove www # Only matches domains that start with www # Not examplewww.com sed -e 's/^www\.//g' | \ # Remove duplicates -sort -u > ../src/top-1m.txt +sort -u > top-1m.txt -# Remove downloaded zip file -rm top-1m.csv.zip +# Merge Umbrella and self-maintained top domains +cat top-1m.txt exclude.txt | \ +sort -u > top-1m-well-known.txt diff --git a/utils/urlhaus-filter.sh b/utils/urlhaus-filter.sh new file mode 100644 index 00000000..8e2c87f7 --- /dev/null +++ b/utils/urlhaus-filter.sh @@ -0,0 +1,20 @@ +#!/bin/sh + +## Merge malware-domains.txt malware-url-top-domains.txt, +## and append a header to instruct uBO to grab the filter daily. + + +CURRENT_TIME="$(date -R -u)" +FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist" +SECOND_LINE="! Updated: $CURRENT_TIME" +THIRD_LINE="! Expires: 1 day (update frequency)" +FOURTH_LINE="! 
Repo: https://gitlab.com/curben/urlhaus-filter" +FIFTH_LINE="! License: https://creativecommons.org/publicdomain/zero/1.0/" +SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/" +COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE" + +cat malware-domains.txt malware-url-top-domains.txt | \ +# Sort alphabetically +sort | \ +# Prepend header comment to the filter list +sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt diff --git a/utils/urlhaus-top-domains.sh b/utils/urlhaus-top-domains.sh new file mode 100644 index 00000000..e8d3484d --- /dev/null +++ b/utils/urlhaus-top-domains.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +## Parse popular domains from URLhaus + +cat URLhaus.csv | \ +# Convert DOS to Unix line ending +dos2unix | \ +# Parse online URLs only +grep '"online"' | \ +# Parse domains and IP address only +cut -f 6 -d '"' | \ +cut -f 3 -d '/' | \ +cut -f 1 -d ':' | \ +# Remove www +# Only matches domains that start with www +# Not examplewww.com +sed -e 's/^www\.//g' | \ +# Sort and remove duplicates +sort -u | \ +# Include only Umbrella Top 1M and well-known domains +# grep match whole line (no -v, so matching lines are kept) +grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt