fix: dedup entries & parse .js links only

This commit is contained in:
Ming Di Leom 2021-06-01 10:01:25 +00:00
parent d880d44655
commit b0477feea0
5 changed files with 108529 additions and 242610 deletions

117037
dist/tracking-data.txt vendored

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

117032
dist/tracking-filter.txt vendored

File diff suppressed because it is too large Load Diff

View File

@@ -1,17 +1,13 @@
'use strict'
const { join } = require('path')
const { appendFile, readdir, readFile, writeFile } = require('fs').promises
const { appendFile, readdir, readFile, rm } = require('fs').promises
const { parse } = JSON
const f = async () => {
// __dirname is src/
const outputFile = join(__dirname, '../dist/tracking-data.txt')
await writeFile(outputFile, `# Title: Tracking URL
# Updated: ${new Date().toUTCString()}
# Repo: https://gitlab.com/curben/tracking-filter
# License: https://gitlab.com/curben/tracking-filter#license
# Source: https://github.com/duckduckgo/tracker-radar\n`)
const outputFile = join(__dirname, '../tmp/tracking-data-raw.txt')
await rm(outputFile, { force: true })
const domains = join(__dirname, '../tmp/tracker-radar/domains')
const countries = await readdir(domains)
for (const country of countries) {
@@ -21,8 +17,8 @@ const f = async () => {
const { resources } = parse(data)
const tracking = resources.filter(({ fingerprinting }) => fingerprinting === 1)
for (const { rule } of tracking) {
const link = rule.replace(/\\/g, '') + '\n'
await appendFile(outputFile, link)
const link = rule.replace(/\\/g, '').replace(/^www\./g, '')
if (link.endsWith('.js')) await appendFile(outputFile, link + '\n')
}
}
}

View File

@@ -7,20 +7,15 @@ cd "tmp/"
# Prepare datasets
# rm -rf "tracker-radar/"
# git clone --depth 1 "https://github.com/duckduckgo/tracker-radar" "tracker-radar/"
rm -rf "tracker-radar/"
git clone --depth 1 "https://github.com/duckduckgo/tracker-radar" "tracker-radar/"
# Extract tracking links
node "../src/script.js"
# # Extract tracking links
# node "../src/script.js"
# # Cleanup
# cat "../dist/tracking-data.txt" | \
# # Remove comment
# sed "/^#/d" | \
# # Remove www.
# sed "s/^www\.//g" > "tracking-url.txt"
# Cleanup
cat "tracking-data-raw.txt" | \
sort -u > "tracking-url.txt"
CURRENT_TIME="$(date -R -u)"
@@ -32,6 +27,10 @@ FIFTH_LINE="! License: https://gitlab.com/curben/tracking-filter#license"
SIXTH_LINE="! Source: https://github.com/duckduckgo/tracker-radar"
COMMENT_UBO="$FIRST_LINE\n$SECOND_LINE\n$THIRD_LINE\n$FOURTH_LINE\n$FIFTH_LINE\n$SIXTH_LINE"
# Original data
cat "tracking-url.txt" | \
sed '1 i\'"$COMMENT_UBO"'' | \
sed "s/^!/#/g" > "../dist/tracking-data.txt"
# uBO & Adguard
cat "tracking-url.txt" | \
@@ -39,7 +38,6 @@ sed "s/^/||/g" | \
sed "s/$/\$all/g" | \
sed '1 i\'"$COMMENT_UBO"'' > "../dist/tracking-filter.txt"
# Vivaldi
cat "tracking-url.txt" | \
sed "s/^/||/g" | \