feat: remove URLhaus.csv

- repo is getting too big
- move URLhaus.csv to job artifact
This commit is contained in:
curben 2020-05-14 10:44:25 +01:00
parent 27adfff015
commit 03ff3a9979
No known key found for this signature in database
GPG Key ID: 5D9DB57A25D34EE3
3 changed files with 11 additions and 351112 deletions

View File

@@ -52,6 +52,11 @@ deploy_job:
     - if: '$CI_COMMIT_REF_NAME == "master" && $CI_PIPELINE_SOURCE == "web"'
       when: always
+  # Upload working folder as a job artifact
+  artifacts:
+    paths:
+      - tmp/
 failed_job:
   stage: failed_stage

View File

@@ -3,7 +3,8 @@
 set -efux -o pipefail
 ## Create a temporary working folder
-mkdir -p "tmp/" && cd "tmp/"
+mkdir -p "tmp/"
+cd "tmp/"
 ## Prepare datasets
@@ -18,10 +19,10 @@ unzip -p "urlhaus.zip" | \
 # Convert DOS to Unix line ending
 dos2unix | \
 # Remove comment
-sed "/^#/d" > "../src/URLhaus.csv"
+sed "/^#/d" > "URLhaus.csv"
 ## Parse URLs
-cat "../src/URLhaus.csv" | \
+cat "URLhaus.csv" | \
 cut -f 6 -d '"' | \
 cut -f 3- -d "/" | \
 # Domain must have at least a 'dot'
@@ -37,7 +38,7 @@ cut -f 1 -d ":" | \
 sort -u > "urlhaus-domains.txt"
 ## Parse online URLs only
-cat "../src/URLhaus.csv" | \
+cat "URLhaus.csv" | \
 grep '"online"' | \
 cut -f 6 -d '"' | \
 cut -f 3- -d "/" | \
@@ -197,4 +198,4 @@ sed '1 i\'"$COMMENT_ONLINE"'' | \
 sed "1s/Blocklist/Unbound Blocklist/" > "../urlhaus-filter-unbound-online.conf"
-cd ../ && rm -rf "tmp/"
+cd ../

File diff suppressed because it is too large Load Diff