Reviewer note: line breaks were restored from the hunk line counts; the
indentation of the .gitlab-ci.yml context lines is a reconstruction and
must be checked against the target file. Only the added script.sh differs
from the submitted patch (its "index" line was dropped because the blob
hash no longer applies).

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b0319a49..26d49617 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -32,7 +32,7 @@ deploy:
     - cd build/
 
     # Run scripts
-    - sh index.sh
+    - sh script.sh
 
     # Commit the changes
     - sh utils/commit.sh
diff --git a/index.sh b/index.sh
deleted file mode 100644
index e777ba1a..00000000
--- a/index.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/sh
-
-# -e: Fail the whole script if any command fails
-# -x: Display running command
-set -e -x
-
-# Create a temporary working folder
-# -p: No error if existing
-mkdir -p tmp/ && cd tmp/
-
-# Download URLhaus database and Umbrella Top 1M
-sh ../utils/prerequisites.sh
-
-# Process the Umbrella Top 1M
-sh ../utils/umbrella-top-1m.sh
-
-# Parse popular domains that also appear in URLhaus
-sh ../utils/urlhaus-top-domains.sh
-
-# Parse domains from URLhaus excluding popular domains
-sh ../utils/malware-domains.sh
-
-# Parse malware URLs from popular domains
-sh ../utils/malware-url-top-domains.sh
-
-# Merge malware domains and URLs
-sh ../utils/urlhaus-filter.sh
-
-# Clean up the working folder
-cd ../ && rm -r tmp/
diff --git a/script.sh b/script.sh
new file mode 100644
--- /dev/null
+++ b/script.sh
@@ -0,0 +1,112 @@
+#!/bin/sh
+
+# Build urlhaus-filter.txt from the URLhaus URL dump.
+#
+# Hosts that also appear in the Cisco Umbrella Top 1M or in src/exclude.txt
+# are treated as popular: only their individual malware URLs are listed.
+# Every other host is listed as a whole domain.
+#
+# Run from the directory that contains src/ (all paths are relative to it).
+
+# -e: Fail the whole script if any command fails
+# -u: Fail on unset variables
+# -x: Display running command
+set -eux
+
+# Byte-wise collation keeps sort and grep deterministic across locales
+LC_ALL=C
+export LC_ALL
+
+## Create a temporary working folder
+# mkdir and cd are separate commands: in "mkdir && cd" a failing mkdir
+# would not trigger -e and the script would carry on in the wrong folder.
+# -p: No error if existing
+mkdir -p tmp/
+cd tmp/
+WORKDIR="$(pwd)"
+# Remove the working folder on every exit path, not only on success
+trap 'cd / && rm -rf -- "${WORKDIR:?}"' EXIT
+
+
+## Prepare datasets
+wget 'https://urlhaus.abuse.ch/downloads/text/' -O ../src/URLhaus.txt
+wget 'https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip' -O top-1m.csv.zip
+
+cp ../src/exclude.txt .
+
+## Clean up URLhaus.txt
+# Convert DOS to Unix line ending
+dos2unix < ../src/URLhaus.txt |
+# Remove comment lines entirely (blanking them would leave an empty entry)
+sed '/^#/d' |
+# Remove http(s)://
+cut -f 3- -d '/' |
+# Remove www. and drop empty lines
+sed -e 's/^www\.//' -e '/^$/d' |
+sort -u > urlhaus.txt
+
+## Parse domain and IP address only
+cut -f 1 -d '/' < urlhaus.txt |
+cut -f 1 -d ':' |
+sort -u > urlhaus-domains.txt
+
+
+## Parse the Cisco Umbrella 1 Million
+unzip -p top-1m.csv.zip |
+dos2unix |
+# Parse domains only
+cut -f 2 -d ',' |
+# Domain must have at least a 'dot'
+grep -F '.' |
+# Remove www.
+sed 's/^www\.//' |
+sort -u > top-1m.txt
+
+# Merge Umbrella and self-maintained top domains
+sort -u top-1m.txt exclude.txt > top-1m-well-known.txt
+
+
+## Parse popular domains from URLhaus
+# -x: match whole line
+# grep exits with 1 when nothing matches; that is a valid empty result,
+# so only a real grep error aborts the script.
+grep -Fx -f top-1m-well-known.txt urlhaus-domains.txt \
+  > urlhaus-top-domains.txt || [ "$?" -eq 1 ]
+
+
+## Parse domains from URLhaus excluding popular domains
+# -x: drop exact matches only; a substring match would also drop
+# unrelated hosts whose name merely contains a popular domain.
+grep -Fx -v -f urlhaus-top-domains.txt urlhaus-domains.txt \
+  > malware-domains.txt || [ "$?" -eq 1 ]
+
+## Parse malware URLs from popular domains
+# The host (text before the first '/', port stripped) must equal a popular
+# domain exactly.
+awk -F '/' '
+  NR == FNR { top[$0] = 1; next }
+  {
+    host = $1
+    sub(/:.*/, "", host)
+    if (host in top) print
+  }
+' urlhaus-top-domains.txt urlhaus.txt > malware-url-top-domains.txt
+
+
+## Merge malware domains and URLs
+CURRENT_TIME="$(date -R -u)"
+
+# The header instructs uBO to grab the filter daily. Writing it with printf
+# keeps it in place even when the list is empty.
+{
+  printf '%s\n' \
+    '! Title: abuse.ch URLhaus Malicious URL Blocklist' \
+    "! Updated: $CURRENT_TIME" \
+    '! Expires: 1 day (update frequency)' \
+    '! Repo: https://gitlab.com/curben/urlhaus-filter' \
+    '! License: https://creativecommons.org/publicdomain/zero/1.0/' \
+    '! Source: https://urlhaus.abuse.ch/api/'
+  sort malware-domains.txt malware-url-top-domains.txt
+} > ../urlhaus-filter.txt
+
+## The working folder is removed by the EXIT trap set above
diff --git a/utils/malware-domains.sh b/utils/malware-domains.sh
deleted file mode 100644
index f27a6c32..00000000
--- a/utils/malware-domains.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-## Parse domains from URLhaus excluding popular domains
-
-cat urlhaus-domains.txt | \
-grep -F -vf urlhaus-top-domains.txt > malware-domains.txt
diff --git a/utils/malware-url-top-domains.sh b/utils/malware-url-top-domains.sh
deleted file mode 100644
index f0400ede..00000000
--- a/utils/malware-url-top-domains.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-## Parse malware URLs from popular URLhaus domains
-
-cat urlhaus.txt | \
-grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt
diff --git a/utils/prerequisites.sh b/utils/prerequisites.sh
deleted file mode 100644
index 81585957..00000000
--- a/utils/prerequisites.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-# Download URLhaus database
-wget https://urlhaus.abuse.ch/downloads/text/ -O ../src/URLhaus.txt
-
-# Download Cisco Umbrella 1 Million
-wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip
-
-cp ../src/exclude.txt .
-
-## Clean up URLhaus.txt
-cat ../src/URLhaus.txt | \
-# Convert DOS to Unix line ending
-dos2unix | \
-# Remove comment
-sed 's/^#.*//g' | \
-# Remove http(s)://
-cut -f 3- -d '/' | \
-# Remove www.
-sed 's/^www\.//g' | \
-sort -u > urlhaus.txt
-
-## Parse domain and IP address only
-cat urlhaus.txt | \
-cut -f 1 -d '/' | \
-cut -f 1 -d ':' | \
-sort -u > urlhaus-domains.txt
\ No newline at end of file
diff --git a/utils/umbrella-top-1m.sh b/utils/umbrella-top-1m.sh
deleted file mode 100644
index 69630f58..00000000
--- a/utils/umbrella-top-1m.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-## Parse the Cisco Umbrella 1 Million
-## More info:
-## https://s3-us-west-1.amazonaws.com/umbrella-static/index.html
-
-# Decompress the zip and write output to stdout
-unzip -p top-1m.csv.zip | \
-# Convert DOS to Unix line ending
-dos2unix | \
-# Parse domains only
-cut -f 2 -d ',' | \
-# Domain must have at least a 'dot'
-grep -F '.' | \
-# Remove www.
-sed 's/^www\.//g' | \
-sort -u > top-1m.txt
-
-# Merge Umbrella and self-maintained top domains
-cat top-1m.txt exclude.txt | \
-sort -u > top-1m-well-known.txt
diff --git a/utils/urlhaus-filter.sh b/utils/urlhaus-filter.sh
deleted file mode 100644
index d40fc3bd..00000000
--- a/utils/urlhaus-filter.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-## Merge malware-domains.txt malware-url-top-domains.txt,
-## and append a header to instruct uBO to grab the filter daily.
-
-CURRENT_TIME="$(date -R -u)"
-FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist"
-SECOND_LINE="! Updated: $CURRENT_TIME"
-THIRD_LINE="! Expires: 1 day (update frequency)"
-FOURTH_LINE="! Repo: https://gitlab.com/curben/urlhaus-filter"
-FIFTH_LINE="! License: https://creativecommons.org/publicdomain/zero/1.0/"
-SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/"
-COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE"
-
-cat malware-domains.txt malware-url-top-domains.txt | \
-sort | \
-# Append header comment to the filter list
-sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt
diff --git a/utils/urlhaus-top-domains.sh b/utils/urlhaus-top-domains.sh
deleted file mode 100644
index 254184cd..00000000
--- a/utils/urlhaus-top-domains.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-## Parse popular domains from URLhaus
-
-cat urlhaus-domains.txt | \
-# grep match whole line
-grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt