diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 688588fb..eea06448 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -34,6 +34,7 @@ deploy: - cd build # Give execute permission to scripts + - cd utils/ - chmod 700 umbrella-top-1m.sh script.sh commit.sh # Download Umbrella Popularity List @@ -46,6 +47,7 @@ deploy: - ./commit.sh # Push the commit + - cd ../ - git push only: diff --git a/README.md b/README.md index 9e889e49..a8223d74 100644 --- a/README.md +++ b/README.md @@ -15,9 +15,9 @@ https://gitlab.com/curben/urlhaus/raw/master/urlhaus-filter.txt Following URL categories are removed from the database dump: - Offline URL -- Well-known host ([top-1m.txt](top-1m.txt)) or false positives ([exclude.txt](exclude.txt)) +- Well-known host ([top-1m.txt](src/top-1m.txt)) or false positives ([exclude.txt](src/exclude.txt)) -Database dump is saved as [URLhaus.csv](URLhaus.csv), processed by [script.sh](script.sh) and output as [urlhaus-filter.txt](urlhaus-filter.txt). +Database dump is saved as [URLhaus.csv](src/URLhaus.csv), processed by [script.sh](utils/script.sh) and output as [urlhaus-filter.txt](urlhaus-filter.txt). ## Note diff --git a/exclude.txt b/exclude.txt deleted file mode 100644 index 3d62dcb9..00000000 --- a/exclude.txt +++ /dev/null @@ -1 +0,0 @@ -o.aolcdn.com diff --git a/URLhaus.csv b/src/URLhaus.csv similarity index 100% rename from URLhaus.csv rename to src/URLhaus.csv diff --git a/src/exclude.txt b/src/exclude.txt new file mode 100644 index 00000000..fdc232e6 --- /dev/null +++ b/src/exclude.txt @@ -0,0 +1 @@ +# Nothing yet... 
diff --git a/top-1m.txt b/src/top-1m.txt similarity index 100% rename from top-1m.txt rename to src/top-1m.txt diff --git a/commit.sh b/utils/commit.sh similarity index 100% rename from commit.sh rename to utils/commit.sh diff --git a/script.sh b/utils/script.sh similarity index 81% rename from script.sh rename to utils/script.sh index 2f3c670d..bda7c229 100644 --- a/script.sh +++ b/utils/script.sh @@ -11,7 +11,7 @@ FIFTH_LINE="! Source: https://urlhaus.abuse.ch/api/" COMMENT="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE" # Download the database dump -wget https://urlhaus.abuse.ch/downloads/csv/ -O URLhaus.csv +wget https://urlhaus.abuse.ch/downloads/csv/ -O ../src/URLhaus.csv # Parse domains and IP address only -cat URLhaus.csv | \ +cat ../src/URLhaus.csv | \ @@ -22,8 +22,8 @@ cut -f 1 -d ':' | \ # Sort and remove duplicates sort -u | \ # Exclude Umbrella Top 1M -grep -vf top-1m.txt | \ +grep -vf ../src/top-1m.txt | \ # Exclude false positive -grep -vf exclude.txt | \ +grep -vf ../src/exclude.txt | \ # Append header comment to the filter list -sed '1 i\'"$COMMENT"'' > urlhaus-filter.txt +sed '1 i\'"$COMMENT"'' > ../urlhaus-filter.txt diff --git a/umbrella-top-1m.sh b/utils/umbrella-top-1m.sh similarity index 88% rename from umbrella-top-1m.sh rename to utils/umbrella-top-1m.sh index 3fda700f..0eba9f02 100644 --- a/umbrella-top-1m.sh +++ b/utils/umbrella-top-1m.sh @@ -9,4 +9,4 @@ wget -O- http://s3-us-west-1.amazonaws.com/umbrella-static/top-1m.csv.zip | \ # Unzip funzip | \ # Parse domains only -cut -f 2 -d ',' > top-1m.txt +cut -f 2 -d ',' > ../src/top-1m.txt