diff --git a/domains-blocklist.conf b/domains-blocklist.conf
new file mode 100644
index 0000000..c121972
--- /dev/null
+++ b/domains-blocklist.conf
@@ -0,0 +1,67 @@
+# Local additions
+file:domains-blocklist-local-additions.txt
+
+## ADS - TRACKERS ##
+
+# dnswarden
+# https://raw.githubusercontent.com/dnswarden/blocklist/master/test/block-3rd-party-cnames.txt
+
+# Frogeye
+https://hostfiles.frogeye.fr/firstparty-trackers.txt
+
+# NextDNS
+https://raw.githubusercontent.com/nextdns/cname-cloaking-blocklist/master/domains
+
+
+# Energized: Spark
+# https://block.energized.pro/spark/formats/domains.txt
+
+# Energized: BluGo
+# https://block.energized.pro/bluGo/formats/domains.txt
+
+# Energized: Blu
+# https://block.energized.pro/blu/formats/domains.txt
+
+# Energized: Basic
+# https://block.energized.pro/basic/formats/domains.txt
+
+# Energized: Ultimate
+https://block.energized.pro/ultimate/formats/domains.txt
+
+
+# Energized: Regional
+https://block.energized.pro/extensions/regional/formats/domains.txt
+
+# Energized: Xtreme
+https://block.energized.pro/extensions/xtreme/formats/domains.txt
+
+# Energized: Porn Lite
+# https://block.energized.pro/extensions/porn-lite/formats/domains.txt
+
+# Energized: Social
+# https://block.energized.pro/extensions/social/formats/domains.txt
+
+# Energized: IP
+# https://block.energized.pro/extensions/ips/formats/list.txt
+
+
+
+## dnscrypt-proxy PROJECT ##
+
+# NoTracking's list - blocking ads, trackers and other online garbage
+# https://raw.githubusercontent.com/notracking/hosts-blocklists/master/dnscrypt-proxy/dnscrypt-proxy.blacklist.txt
+
+# OISD.NL - Blocks ads, phishing, malware, tracking and more. Tries to minimize false positives.
+# https://dbl.oisd.nl/
+
+
+
+## SPOTIFY ##
+
+# x0uid
+# https://raw.githubusercontent.com/x0uid/SpotifyAdBlock/master/SpotifyBlocklist.txt
+
+# CHEF-KOCH
+# https://raw.githubusercontent.com/CHEF-KOCH/Spotify-Ad-free/master/filters/Spotify-HOSTS.txt
+
+
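Note (not part of the patch): the file: entry at the top of domains-blocklist.conf points at domains-blocklist-local-additions.txt, which the generator loads as a trusted local source. Its actual contents are site-specific; the sketch below only illustrates the format the trusted-list parser accepts (one name per line, optional * wildcards, # comments), and the names in it are made up:

    # hand-maintained additions, one entry per line
    ads.example.com
    *.tracking.example.net
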
diff --git a/generate-domains-blocklist.py b/generate-domains-blocklist.py
new file mode 100644
index 0000000..986ec83
--- /dev/null
+++ b/generate-domains-blocklist.py
@@ -0,0 +1,307 @@
+#! /usr/bin/env python3
+
+# run with: python generate-domains-blocklist.py > list.txt.tmp && mv -f list.txt.tmp list
+
+import argparse
+import re
+import sys
+
+try:
+    import urllib2 as urllib
+
+    URLLIB_NEW = False
+except (ImportError, ModuleNotFoundError):
+    import urllib.request as urllib
+    from urllib.request import Request
+
+    URLLIB_NEW = True
+
+
+def parse_time_restricted_list(content):
+    rx_comment = re.compile(r"^(#|$)")
+    rx_inline_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
+    rx_trusted = re.compile(r"^([*a-z0-9.-]+)\s*(@\S+)?$")
+
+    names = set()
+    time_restrictions = {}
+    rx_set = [rx_trusted]
+    for line in content.splitlines():
+        line = str.lower(str.strip(line))
+        if rx_comment.match(line):
+            continue
+        line = rx_inline_comment.sub("", line)
+        for rx in rx_set:
+            matches = rx.match(line)
+            if not matches:
+                continue
+            name = matches.group(1)
+            names.add(name)
+            time_restriction = matches.group(2)
+            if time_restriction:
+                time_restrictions[name] = time_restriction
+    return names, time_restrictions
+
+
+def parse_trusted_list(content):
+    names, _time_restrictions = parse_time_restricted_list(content)
+    time_restrictions = {}
+    return names, time_restrictions
+
+
+def parse_list(content, trusted=False):
+    rx_comment = re.compile(r"^(#|$)")
+    rx_inline_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
+    rx_u = re.compile(
+        r"^@*\|\|([a-z0-9][a-z0-9.-]*[.][a-z]{2,})\^?(\$(popup|third-party))?$")
+    rx_l = re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$")
+    rx_h = re.compile(
+        r"^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"
+    )
+    rx_mdl = re.compile(r'^"[^"]+","([a-z0-9][a-z0-9.-]*[.][a-z]{2,})",')
+    rx_b = re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,}),.+,[0-9: /-]+,")
+    rx_dq = re.compile(r"^address=/([a-z0-9][a-z0-9.-]*[.][a-z]{2,})/.")
+
+    if trusted:
+        return parse_trusted_list(content)
+
+    names = set()
+    time_restrictions = {}
+    rx_set = [rx_u, rx_l, rx_h, rx_mdl, rx_b, rx_dq]
+    for line in content.splitlines():
+        line = str.lower(str.strip(line))
+        if rx_comment.match(line):
+            continue
+        line = rx_inline_comment.sub("", line)
+        for rx in rx_set:
+            matches = rx.match(line)
+            if not matches:
+                continue
+            name = matches.group(1)
+            names.add(name)
+    return names, time_restrictions
+
+
+# basic check if the line contains any regex-specific char
+def is_regex(line):
+    regex_chars = "*[]?}{"
+    return any(char in line for char in regex_chars)
+
+
+def parse_regex(names):
+    regexes = set()
+    for line in names:
+        # skip lines without regex characters:
+        if not is_regex(line):
+            continue
+        # convert to a python regex:
+        line = line.replace(".", r"\.")
+        line = line.replace("*", ".*")
+        line = "^" + line + "$"
+        # check if the resulting regex is valid:
+        try:
+            if re.compile(line):
+                regexes.add(line)
+        except re.error as err:
+            sys.stderr.write("Invalid regex: {} [{}]\n".format(line, err))
+            continue
+    return regexes
+
+
+def print_restricted_name(name, time_restrictions):
+    if name in time_restrictions:
+        print("{}\t{}".format(name, time_restrictions[name]))
+    else:
+        print(
+            "# ignored: [{}] was in the time-restricted list, "
+            "but without a time restriction label".format(name)
+        )
+
+
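+# Fetch one source; "file:" URLs are treated as trusted local lists.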
+def load_from_url(url):
+    sys.stderr.write("Loading data from [{}]\n".format(url))
+    req = urllib.Request(url=url, headers={"User-Agent": "dnscrypt-proxy"})
+    trusted = False
+
+    if URLLIB_NEW:
+        req_type = req.type
+    else:
+        req_type = req.get_type()
+    if req_type == "file":
+        trusted = True
+
+    response = None
+    try:
+        response = urllib.urlopen(req, timeout=int(args.timeout))
+    except urllib.URLError as err:
+        raise Exception("[{}] could not be loaded: {}\n".format(url, err))
+    if trusted is False and response.getcode() != 200:
+        raise Exception("[{}] returned HTTP code {}\n".format(
+            url, response.getcode()))
+    content = response.read()
+    if URLLIB_NEW:
+        content = content.decode("utf-8", errors="replace")
+
+    return (content, trusted)
+
+
+def name_cmp(name):
+    parts = name.split(".")
+    parts.reverse()
+    return str.join(".", parts)
+
+
+def has_suffix(names, name):
+    parts = str.split(name, ".")
+    while parts:
+        parts = parts[1:]
+        if str.join(".", parts) in names:
+            return True
+
+    return False
+
+
+# check if a line matches any of the collected regexes:
+def covered_by_regex(line, regexes):
+
+    # only check lines that aren't regexes themselves:
+    if not is_regex(line):
+        for regex in regexes:
+            if re.match(regex, line):
+                return True
+
+    return False
+
+
+def allowlist_from_url(url):
+    if not url:
+        return set()
+    content, trusted = load_from_url(url)
+
+    names, _time_restrictions = parse_list(content, trusted)
+    return names
+
+
+def blocklists_from_config_file(
+    file, allowlist, time_restricted_url, ignore_retrieval_failure
+):
+    blocklists = {}
+    allowed_names = set()
+    all_regexes = set()
+    all_names = set()
+    unique_names = set()
+
+    # Load conf & blocklists
+    with open(file) as fd:
+        for line in fd:
+            line = str.strip(line)
+            if str.startswith(line, "#") or line == "":
+                continue
+            url = line
+            try:
+                content, trusted = load_from_url(url)
+                names, _time_restrictions = parse_list(content, trusted)
+                blocklists[url] = names
+                all_names |= names
+                all_regexes |= parse_regex(names)
+
+            except Exception as e:
+                sys.stderr.write(str(e))
+                if not ignore_retrieval_failure:
+                    exit(1)
+
+    # Time-based blocklist
+    if time_restricted_url and not re.match(r"^[a-z0-9]+:", time_restricted_url):
+        time_restricted_url = "file:" + time_restricted_url
+
+    if time_restricted_url:
+        time_restricted_content, _trusted = load_from_url(time_restricted_url)
+        time_restricted_names, time_restrictions = parse_time_restricted_list(
+            time_restricted_content
+        )
+
+        if time_restricted_names:
+            print("########## Time-based blocklist ##########\n")
+            for name in time_restricted_names:
+                print_restricted_name(name, time_restrictions)
+
+        # Time-restricted names should be allowed, or they could end up always blocked
+        allowed_names |= time_restricted_names
+
+    # Allowed list
+    if allowlist and not re.match(r"^[a-z0-9]+:", allowlist):
+        allowlist = "file:" + allowlist
+
+    allowed_names |= allowlist_from_url(allowlist)
+
+    # Process blocklists
+    for url, names in blocklists.items():
+        print("\n\n########## Blocklist from {} ##########\n".format(url))
+        ignored, allowed = 0, 0
+        list_names = list()
+        for name in names:
+            if has_suffix(all_names, name) or name in unique_names or covered_by_regex(name, all_regexes):
+                ignored = ignored + 1
+            elif has_suffix(allowed_names, name) or name in allowed_names:
+                allowed = allowed + 1
+            else:
+                list_names.append(name)
+                unique_names.add(name)
+
+        list_names.sort(key=name_cmp)
+        if ignored:
+            print("# Ignored duplicates: {}\n".format(ignored))
+        if allowed:
+            print("# Ignored entries due to the allowlist: {}\n".format(allowed))
+        for name in list_names:
+            print(name)
+
+
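+# Command-line interface: the merged blocklist goes to stdout, progress and errors go to stderr.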
+argp = argparse.ArgumentParser(
+    description="Create a unified blocklist from a set of local and remote files"
+)
+argp.add_argument(
+    "-c",
+    "--config",
+    default="domains-blocklist.conf",
+    help="file containing blocklist sources",
+)
+argp.add_argument(
+    "-w",
+    "--whitelist",
+    help="Deprecated. Please use -a or --allowlist",
+)
+argp.add_argument(
+    "-a",
+    "--allowlist",
+    default="domains-allowlist.txt",
+    help="file containing a set of names to exclude from the blocklist",
+)
+argp.add_argument(
+    "-r",
+    "--time-restricted",
+    default="domains-time-restricted.txt",
+    help="file containing a set of names to be time restricted",
+)
+argp.add_argument(
+    "-i",
+    "--ignore-retrieval-failure",
+    action="store_true",
+    help="generate the list even if some URLs couldn't be retrieved",
+)
+argp.add_argument("-t", "--timeout", default=30, help="URL open timeout, in seconds")
+
+args = argp.parse_args()
+
+whitelist = args.whitelist
+if whitelist:
+    print("Use of -w WHITELIST has been removed. Please use -a ALLOWLIST instead.")
+    exit(1)
+
+conf = args.config
+allowlist = args.allowlist
+time_restricted = args.time_restricted
+ignore_retrieval_failure = args.ignore_retrieval_failure
+
+blocklists_from_config_file(
+    conf, allowlist, time_restricted, ignore_retrieval_failure)
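
Note (not part of the patch): the script prints the merged list to stdout, dropping names that are already covered by a parent domain in any source and sorting the survivors by their reversed labels so related domains end up next to each other. A minimal, self-contained sketch of that behaviour, using simplified copies of has_suffix() and name_cmp() from the script and made-up names:

    # Illustrative only: mirrors has_suffix()/name_cmp() from generate-domains-blocklist.py.
    def name_cmp(name):
        parts = name.split(".")
        parts.reverse()
        return ".".join(parts)

    def has_suffix(names, name):
        parts = name.split(".")
        while parts:
            parts = parts[1:]
            if ".".join(parts) in names:
                return True
        return False

    all_names = {"example.com", "ads.example.com", "tracker.net"}
    kept = sorted((n for n in all_names if not has_suffix(all_names, n)), key=name_cmp)
    print(kept)  # ['example.com', 'tracker.net'] -- ads.example.com is covered by example.com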