refactor: merge all scripts into script.sh

Each script is too small to justify its own file.
2019-05-29 14:45:08 +09:30 · 2019-05-29 14:45:08 +09:30 · c2a4f3f579
parent c720794f07
commit c2a4f3f579
9 changed files with 83 additions and 128 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -32,7 +32,7 @@ deploy:
    - cd build/

    # Run scripts
-    - sh index.sh
+    - sh script.sh

    # Commit the changes
    - sh utils/commit.sh
--- a/index.sh
+++ b/index.sh
@ -1,30 +0,0 @@
-#!/bin/sh
-
-# -e: Fail the whole script if any command fails
-# -x: Display running command
-set -e -x
-
-# Create a temporary working folder
-# -p: No error if existing
-mkdir -p tmp/ && cd tmp/
-
-# Download URLhaus database and Umbrella Top 1M
-sh ../utils/prerequisites.sh
-
-# Process the Umbrella Top 1M
-sh ../utils/umbrella-top-1m.sh
-
-# Parse popular domains that also appear in URLhaus
-sh ../utils/urlhaus-top-domains.sh
-
-# Parse domains from URLhaus excluding popular domains
-sh ../utils/malware-domains.sh
-
-# Parse malware URLs from popular domains
-sh ../utils/malware-url-top-domains.sh
-
-# Merge malware domains and URLs
-sh ../utils/urlhaus-filter.sh
-
-# Clean up the working folder
-cd ../ && rm -r tmp/
--- a/script.sh
+++ b/script.sh
@ -0,0 +1,82 @@
+#!/bin/sh
+
+set -e -x
+
+## Create a temporary working folder
+# -p: No error if existing
+mkdir -p tmp/ && cd tmp/
+
+
+## Prepare datasets
+wget https://urlhaus.abuse.ch/downloads/text/ -O ../src/URLhaus.txt
+wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip
+
+cp ../src/exclude.txt .
+
+## Clean up URLhaus.txt
+cat ../src/URLhaus.txt | \
+# Convert DOS to Unix line ending
+dos2unix | \
+# Blank out comment lines (lines starting with '#')
+sed 's/^#.*//g' | \
+# Remove http(s)://
+cut -f 3- -d '/' | \
+# Remove www.
+sed 's/^www\.//g' | \
+sort -u > urlhaus.txt
+
+## Parse domain and IP address only
+cat urlhaus.txt | \
+cut -f 1 -d '/' | \
+cut -f 1 -d ':' | \
+sort -u > urlhaus-domains.txt
+
+
+## Parse the Cisco Umbrella 1 Million
+unzip -p top-1m.csv.zip | \
+dos2unix | \
+# Parse domains only
+cut -f 2 -d ',' | \
+# Domain must have at least a 'dot'
+grep -F '.' | \
+# Remove www.
+sed 's/^www\.//g' | \
+sort -u > top-1m.txt
+
+# Merge Umbrella and self-maintained top domains
+cat top-1m.txt exclude.txt | \
+sort -u > top-1m-well-known.txt
+
+
+## Parse popular domains from URLhaus
+cat urlhaus-domains.txt | \
+# -x: match whole lines only
+grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt
+
+
+## Parse domains from URLhaus excluding popular domains
+cat urlhaus-domains.txt | \
+grep -F -vf urlhaus-top-domains.txt > malware-domains.txt
+
+## Parse malware URLs from popular domains
+cat urlhaus.txt | \
+grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt
+
+
+## Merge malware domains and URLs
+CURRENT_TIME="$(date -R -u)"
+FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist"
+SECOND_LINE="! Updated: $CURRENT_TIME"
+THIRD_LINE="! Expires: 1 day (update frequency)"
+FOURTH_LINE="! Repo: https://gitlab.com/curben/urlhaus-filter"
+FIFTH_LINE="! License: https://creativecommons.org/publicdomain/zero/1.0/"
+SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/"
+COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE"
+
+cat malware-domains.txt malware-url-top-domains.txt | \
+sort | \
+sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt
+
+
+## Clean up the working folder
+cd ../ && rm -r tmp/
--- a/utils/malware-domains.sh
+++ b/utils/malware-domains.sh
@ -1,8 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-## Parse domains from URLhaus excluding popular domains
-
-cat urlhaus-domains.txt | \
-grep -F -vf urlhaus-top-domains.txt > malware-domains.txt
--- a/utils/malware-url-top-domains.sh
+++ b/utils/malware-url-top-domains.sh
@ -1,8 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-## Parse malware URLs from popular URLhaus domains
-
-cat urlhaus.txt | \
-grep -F -f urlhaus-top-domains.txt > malware-url-top-domains.txt
--- a/utils/prerequisites.sh
+++ b/utils/prerequisites.sh
@ -1,29 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-# Download URLhaus database
-wget https://urlhaus.abuse.ch/downloads/text/ -O ../src/URLhaus.txt
-
-# Download Cisco Umbrella 1 Million
-wget https://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip -O top-1m.csv.zip
-
-cp ../src/exclude.txt .
-
-## Clean up URLhaus.txt
-cat ../src/URLhaus.txt | \
-# Convert DOS to Unix line ending
-dos2unix | \
-# Remove comment
-sed 's/^#.*//g' | \
-# Remove http(s)://
-cut -f 3- -d '/' | \
-# Remove www.
-sed 's/^www\.//g' | \
-sort -u > urlhaus.txt
-
-## Parse domain and IP address only
-cat urlhaus.txt | \
-cut -f 1 -d '/' | \
-cut -f 1 -d ':' | \
-sort -u > urlhaus-domains.txt
--- a/utils/umbrella-top-1m.sh
+++ b/utils/umbrella-top-1m.sh
@ -1,23 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-## Parse the Cisco Umbrella 1 Million
-## More info:
-## https://s3-us-west-1.amazonaws.com/umbrella-static/index.html
-
-# Decompress the zip and write output to stdout
-unzip -p top-1m.csv.zip | \
-# Convert DOS to Unix line ending
-dos2unix | \
-# Parse domains only
-cut -f 2 -d ',' | \
-# Domain must have at least a 'dot'
-grep -F '.' | \
-# Remove www.
-sed 's/^www\.//g' | \
-sort -u > top-1m.txt
-
-# Merge Umbrella and self-maintained top domains
-cat top-1m.txt exclude.txt | \
-sort -u > top-1m-well-known.txt
--- a/utils/urlhaus-filter.sh
+++ b/utils/urlhaus-filter.sh
@ -1,20 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-## Merge malware-domains.txt malware-url-top-domains.txt,
-## and append a header to instruct uBO to grab the filter daily.
-
-CURRENT_TIME="$(date -R -u)"
-FIRST_LINE="! Title: abuse.ch URLhaus Malicious URL Blocklist"
-SECOND_LINE="! Updated: $CURRENT_TIME"
-THIRD_LINE="! Expires: 1 day (update frequency)"
-FOURTH_LINE="! Repo: https://gitlab.com/curben/urlhaus-filter"
-FIFTH_LINE="! License: https://creativecommons.org/publicdomain/zero/1.0/"
-SIXTH_LINE="! Source: https://urlhaus.abuse.ch/api/"
-COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE"
-
-cat malware-domains.txt malware-url-top-domains.txt | \
-sort | \
-# Append header comment to the filter list
-sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt
--- a/utils/urlhaus-top-domains.sh
+++ b/utils/urlhaus-top-domains.sh
@ -1,9 +0,0 @@
-#!/bin/sh
-
-set -e -x
-
-## Parse popular domains from URLhaus
-
-cat urlhaus-domains.txt | \
-# grep match whole line
-grep -Fx -f top-1m-well-known.txt > urlhaus-top-domains.txt