diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..1fb9ef57 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +tmp/* diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 14796915..6f1ad485 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,4 +1,4 @@ -image: alpine:latest # Use latest version of Alpine Linux docker image +image: alpine:latest # Use the latest version of Alpine Linux docker image before_script: # Install dependencies @@ -33,21 +33,10 @@ deploy: # Change to the downloaded repo directory - cd build/ - # Give execute permission to scripts - - cd utils/ - - chmod 700 umbrella-top-1m.sh script.sh commit.sh - - # Download Umbrella Popularity List - - ./umbrella-top-1m.sh - - # Download database dump and process it - - ./script.sh - - # Commit the changes - - ./commit.sh + # Execute script.sh + - sh utils/script.sh # Push the commit - - cd ../ - git push only: diff --git a/README.md b/README.md index 467a7a87..3babc3c5 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,6 @@ Mirrors: - https://glcdn.githack.com/curben/urlhaus-filter/raw/master/urlhaus-filter.txt - https://cdn.staticaly.com/gl/curben/urlhaus-filter/raw/master/urlhaus-filter.txt -## Description - -Following URL categories are removed from the database dump: - -- Offline URLs -- Well-known domains from the [Umbrella Popularity List](https://s3-us-west-1.amazonaws.com/umbrella-static/index.html). -- False positives ([exclude.txt](src/exclude.txt)) - -Database dump is saved as [URLhaus.csv](src/URLhaus.csv), get processed by [script.sh](utils/script.sh) and output as [urlhaus-filter.txt](urlhaus-filter.txt). - ## Compatibility This filter is only tested with uBO. [FilterLists](https://filterlists.com/) shows it is compatible with the following software: @@ -40,11 +30,13 @@ This filter is only tested with uBO. 
[FilterLists](https://filterlists.com/) sho - [Samsung Knox](https://www.samsungknox.com/) - [uMatrix](https://github.com/gorhill/uMatrix) +Note that some of the software above is host-based only, meaning it cannot block malware URLs hosted by well-known domains (e.g. amazonaws.com, docs.google.com, dropbox.com). For best compatibility, use uBO or its fork NanoAdblocker. + ## Issues -Report any false positive by creating an [issue](https://gitlab.com/curben/urlhaus-filter/issues). +Report any false positive by creating an [issue](https://gitlab.com/curben/urlhaus-filter/issues) or [merge request](https://gitlab.com/curben/urlhaus-filter/merge_requests). -This filter **only** accepts malware URLs from the [URLhaus](https://urlhaus.abuse.ch/). +This filter **only** accepts malware URLs from [URLhaus](https://urlhaus.abuse.ch/). Please report new malware URL to the upstream maintainer through https://urlhaus.abuse.ch/api/#submit. @@ -54,7 +46,7 @@ This repo is not endorsed by Abuse.sh. Since the filter is updated frequently, cloning the repo would become slower over time as the revision grows. -Use shallow clone to get the recent revisions only. Getting the last five revisions is sufficient for a valid MR. +Use shallow clone to get the recent revisions only. Getting the last five revisions should be sufficient for a valid MR. `git clone --depth 5 https://gitlab.com/curben/urlhaus-filter.git` diff --git a/utils/commit.sh b/utils/commit.sh old mode 100755 new mode 100644 index 7fe7a801..033936a5 --- a/utils/commit.sh +++ b/utils/commit.sh @@ -1,6 +1,9 @@ #!/bin/sh -# Commit the filter update +## Commit the filter update + +## GitLab CI does not permit shell variables in .gitlab-ci.yml. +## This file is a workaround for that.
CURRENT_TIME="$(date -R -u)" git commit -a -m "Filter updated: $CURRENT_TIME" diff --git a/utils/malware-domains.sh b/utils/malware-domains.sh new file mode 100644 index 00000000..d4632fc8 --- /dev/null +++ b/utils/malware-domains.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +## Parse domains from URLhaus excluding popular domains + +cat URLhaus.csv | \ +# Convert DOS to Unix line ending +dos2unix | \ +# Parse online URLs only +grep '"online"' | \ +# Parse domains and IP address only +cut -f 6 -d '"' | \ +cut -f 3 -d '/' | \ +cut -f 1 -d ':' | \ +# Remove www +# Only matches domains that start with www +# Not examplewww.com +sed -e 's/^www\.//g' | \ +# Sort and remove duplicates +sort -u | \ +# Exclude Umbrella Top 1M and well-known domains +# grep inverse match whole line +grep -Fx -vf top-1m-well-known.txt > malware-domains.txt diff --git a/utils/malware-url-top-domains.sh b/utils/malware-url-top-domains.sh new file mode 100644 index 00000000..445f55e1 --- /dev/null +++ b/utils/malware-url-top-domains.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +## Parse malware URLs from popular URLhaus domains + +cat URLhaus.csv | \ +# Convert DOS to Unix line ending +dos2unix | \ +# Parse online URLs only +grep '"online"' | \ +# Parse URLs +cut -f 6 -d '"' | \ +cut -f 3- -d '/' | \ +cut -f 1- -d ':' | \ +# Remove www +# Only matches domains that start with www +# Not examplewww.com +sed -e 's/^www\.//g' | \ +# Sort and remove duplicates +sort -u | \ +# Include URLs from popular domains +grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt diff --git a/utils/prerequisites.sh b/utils/prerequisites.sh new file mode 100644 index 00000000..dca0f2c0 --- /dev/null +++ b/utils/prerequisites.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +# Download URLhaus database +wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv + +# Download Cisco Umbrella 1 Million +wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip + +cp ../src/URLhaus.csv . +cp ../src/exclude.txt . 
diff --git a/utils/script.sh b/utils/script.sh old mode 100755 new mode 100644 index 2ebe574e..d571d5ab --- a/utils/script.sh +++ b/utils/script.sh @@ -1,40 +1,15 @@ #!/bin/sh -# Download the URLhaus database dump and process it to be uBO-compatible +mkdir tmp/ +cd tmp/ -CURRENT_TIME="$(date -R -u)" -FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist" -SECOND_LINE="! Updated: $CURRENT_TIME" -THIRD_LINE="! Expires: 1 day (update frequency)" -FOURTH_LINE="! Repo: https://gitlab.com/curben/urlhaus-filter" -FIFTH_LINE="! License: https://creativecommons.org/publicdomain/zero/1.0/" -SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/" -COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE" +sh ../utils/prerequisites.sh +sh ../utils/umbrella-top-1m.sh +sh ../utils/malware-domains.sh +sh ../utils/urlhaus-top-domains.sh +sh ../utils/malware-url-top-domains.sh +sh ../utils/urlhaus-filter.sh +sh ../utils/commit.sh -# Download the database dump -wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv - -cat ../src/URLhaus.csv | \ -# Convert DOS to Unix line ending -dos2unix | \ -# Parse online URLs only -grep '"online"' | \ -# Parse domains and IP address only -cut -f 6 -d '"' | \ -cut -f 3 -d '/' | \ -cut -f 1 -d ':' | \ -# Remove www -# Only matches domains that start with www -# Not examplewww.com -sed -e 's/^www\.//g' | \ -# Sort and remove duplicates -sort -u | \ -# Exclude Umbrella Top 1M. 
grep inverse match whole line -grep -Fx -vf ../src/top-1m.txt | \ -# Exclude false positive -grep -Fx -vf ../src/exclude.txt | \ -# Append header comment to the filter list -sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt - -# Remove downloaded dataset -rm ../src/top-1m.txt +cd ../ +rm -r tmp/ diff --git a/utils/umbrella-top-1m.sh b/utils/umbrella-top-1m.sh old mode 100755 new mode 100644 index 59b8f598..751f61b4 --- a/utils/umbrella-top-1m.sh +++ b/utils/umbrella-top-1m.sh @@ -1,11 +1,8 @@ #!/bin/sh -# Download the Cisco Umbrella 1 Million -# More info: -# https://s3-us-west-1.amazonaws.com/umbrella-static/index.html - -# Download the list -wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip +## Parse the Cisco Umbrella 1 Million +## More info: +## https://s3-us-west-1.amazonaws.com/umbrella-static/index.html # Decompress the zip and write output to stdout unzip -p top-1m.csv.zip | \ @@ -13,12 +10,15 @@ unzip -p top-1m.csv.zip | \ dos2unix | \ # Parse domains only cut -f 2 -d ',' | \ +# Domain must have at least a 'dot' +grep -F '.' | \ # Remove www # Only matches domains that start with www # Not examplewww.com sed -e 's/^www\.//g' | \ # Remove duplicates -sort -u > ../src/top-1m.txt +sort -u > top-1m.txt -# Remove downloaded zip file -rm top-1m.csv.zip +# Merge Umbrella and self-maintained top domains +cat top-1m.txt exclude.txt | \ +sort -u > top-1m-well-known.txt diff --git a/utils/urlhaus-filter.sh b/utils/urlhaus-filter.sh new file mode 100644 index 00000000..8e2c87f7 --- /dev/null +++ b/utils/urlhaus-filter.sh @@ -0,0 +1,20 @@ +#!/bin/sh + +## Merge malware-domains.txt malware-url-top-domains.txt, +## and append a header to instruct uBO to grab the filter daily. + + +CURRENT_TIME="$(date -R -u)" +FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist" +SECOND_LINE="! Updated: $CURRENT_TIME" +THIRD_LINE="! Expires: 1 day (update frequency)" +FOURTH_LINE="! 
Repo: https://gitlab.com/curben/urlhaus-filter" +FIFTH_LINE="! License: https://creativecommons.org/publicdomain/zero/1.0/" +SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/" +COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE" + +cat malware-domains.txt malware-url-top-domains.txt | \ +# Sort alphabetically +sort | \ +# Prepend header comment to the filter list +sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt diff --git a/utils/urlhaus-top-domains.sh b/utils/urlhaus-top-domains.sh new file mode 100644 index 00000000..e8d3484d --- /dev/null +++ b/utils/urlhaus-top-domains.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +## Parse popular domains from URLhaus + +cat URLhaus.csv | \ +# Convert DOS to Unix line ending +dos2unix | \ +# Parse online URLs only +grep '"online"' | \ +# Parse domains and IP address only +cut -f 6 -d '"' | \ +cut -f 3 -d '/' | \ +cut -f 1 -d ':' | \ +# Remove www +# Only matches domains that start with www +# Not examplewww.com +sed -e 's/^www\.//g' | \ +# Sort and remove duplicates +sort -u | \ +# Keep only domains listed in Umbrella Top 1M and well-known domains +# grep match whole line +grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt