feat: remove URLhaus.csv
- repo is getting too big
- move URLhaus.csv to job artifact
This commit is contained in:
parent
27adfff015
commit
03ff3a9979
|
@ -52,6 +52,11 @@ deploy_job:
|
|||
- if: '$CI_COMMIT_REF_NAME == "master" && $CI_PIPELINE_SOURCE == "web"'
|
||||
when: always
|
||||
|
||||
# Upload working folder as a job artifact
|
||||
artifacts:
|
||||
paths:
|
||||
- tmp/
|
||||
|
||||
failed_job:
|
||||
stage: failed_stage
|
||||
|
||||
|
|
11
script.sh
11
script.sh
|
@ -3,7 +3,8 @@
|
|||
set -efux -o pipefail
|
||||
|
||||
## Create a temporary working folder
|
||||
mkdir -p "tmp/" && cd "tmp/"
|
||||
mkdir -p "tmp/"
|
||||
cd "tmp/"
|
||||
|
||||
|
||||
## Prepare datasets
|
||||
|
@ -18,10 +19,10 @@ unzip -p "urlhaus.zip" | \
|
|||
# Convert DOS to Unix line ending
|
||||
dos2unix | \
|
||||
# Remove comment
|
||||
sed "/^#/d" > "../src/URLhaus.csv"
|
||||
sed "/^#/d" > "URLhaus.csv"
|
||||
|
||||
## Parse URLs
|
||||
cat "../src/URLhaus.csv" | \
|
||||
cat "URLhaus.csv" | \
|
||||
cut -f 6 -d '"' | \
|
||||
cut -f 3- -d "/" | \
|
||||
# Domain must have at least a 'dot'
|
||||
|
@ -37,7 +38,7 @@ cut -f 1 -d ":" | \
|
|||
sort -u > "urlhaus-domains.txt"
|
||||
|
||||
## Parse online URLs only
|
||||
cat "../src/URLhaus.csv" | \
|
||||
cat "URLhaus.csv" | \
|
||||
grep '"online"' | \
|
||||
cut -f 6 -d '"' | \
|
||||
cut -f 3- -d "/" | \
|
||||
|
@ -197,4 +198,4 @@ sed '1 i\'"$COMMENT_ONLINE"'' | \
|
|||
sed "1s/Blocklist/Unbound Blocklist/" > "../urlhaus-filter-unbound-online.conf"
|
||||
|
||||
|
||||
cd ../ && rm -rf "tmp/"
|
||||
cd ../
|
||||
|
|
351107
src/URLhaus.csv
351107
src/URLhaus.csv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue