[UPDATE] - init

2020-10-27 05:34:53 -04:00 · 2020-10-27 05:34:53 -04:00 · ad3a1afdb1
parent 63a0e94e58
commit ad3a1afdb1
2 changed files with 374 additions and 0 deletions
--- a/domains-blocklist.conf
+++ b/domains-blocklist.conf
@ -0,0 +1,67 @@
+# Local additions
+file:domains-blocklist-local-additions.txt
+
+## ADS - TRACKERS ##
+
+# dnswarden
+# https://raw.githubusercontent.com/dnswarden/blocklist/master/test/block-3rd-party-cnames.txt
+
+# Frogeye
+https://hostfiles.frogeye.fr/firstparty-trackers.txt
+
+# NextDNS
+https://raw.githubusercontent.com/nextdns/cname-cloaking-blocklist/master/domains
+
+
+# Energized: Spark
+# https://block.energized.pro/spark/formats/domains.txt
+
+# Energized: BluGo
+# https://block.energized.pro/bluGo/formats/domains.txt
+
+# Energized: Blu
+# https://block.energized.pro/blu/formats/domains.txt
+
+# Energized: Basic
+# https://block.energized.pro/basic/formats/domains.txt
+
+# Energized: Ultimate
+https://block.energized.pro/ultimate/formats/domains.txt
+
+
+# Energized: Regional
+https://block.energized.pro/extensions/regional/formats/domains.txt
+
+# Energized: Xtreme
+https://block.energized.pro/extensions/xtreme/formats/domains.txt
+
+# Energized: Porn Lite
+# https://block.energized.pro/extensions/porn-lite/formats/domains.txt
+
+# Energized: Social
+# https://block.energized.pro/extensions/social/formats/domains.txt
+
+# Energized: IP
+# https://block.energized.pro/extensions/ips/formats/list.txt
+
+
+
+## dnscrypt-proxy PROJECT ##
+
+# NoTracking's list - blocking ads, trackers and other online garbage
+# https://raw.githubusercontent.com/notracking/hosts-blocklists/master/dnscrypt-proxy/dnscrypt-proxy.blacklist.txt
+
+# OISD.NL - Blocks ads, phishing, malware, tracking and more. Tries to minimize false positives.
+# https://dbl.oisd.nl/
+
+
+
+## SPOTIFY ##
+
+# x0uid
+# https://raw.githubusercontent.com/x0uid/SpotifyAdBlock/master/SpotifyBlocklist.txt
+
+# CHEF-KOCH
+# https://raw.githubusercontent.com/CHEF-KOCH/Spotify-Ad-free/master/filters/Spotify-HOSTS.txt
+
+
--- a/generate-domains-blocklist.py
+++ b/generate-domains-blocklist.py
@ -0,0 +1,307 @@
+#! /usr/bin/env python3
+
+# run with python generate-domains-blocklist.py > list.txt.tmp && mv -f list.txt.tmp list
+
+import argparse
+import re
+import sys
+
+try:
+    import urllib2 as urllib
+
+    URLLIB_NEW = False
+except (ImportError, ModuleNotFoundError):
+    import urllib.request as urllib
+    from urllib.request import Request
+
+    URLLIB_NEW = True
+
+
+def parse_time_restricted_list(content):
+    """Extract names and optional time-restriction labels from list text.
+
+    Each line is stripped and lower-cased.  Blank lines and lines starting
+    with ``#`` are skipped, and a trailing ``# comment`` whose text starts
+    with a letter, digit or hyphen is removed.  What remains must be a single
+    name made of ``[*a-z0-9.-]`` characters, optionally followed by an
+    ``@label`` token; lines of any other shape are ignored.
+
+    Returns a ``(names, time_restrictions)`` tuple: the set of names found
+    and a dict mapping each labelled name to its ``@label``.
+    """
+    blank_or_comment = re.compile(r"^(#|$)")
+    trailing_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
+    entry = re.compile(r"^([*a-z0-9.-]+)\s*(@\S+)?$")
+
+    names = set()
+    time_restrictions = {}
+    for raw_line in content.splitlines():
+        text = raw_line.strip().lower()
+        if blank_or_comment.match(text):
+            continue
+        found = entry.match(trailing_comment.sub("", text))
+        if found is None:
+            continue
+        name, label = found.group(1), found.group(2)
+        names.add(name)
+        # Only names that carry an @label get a time restriction.
+        if label:
+            time_restrictions[name] = label
+    return names, time_restrictions
+
+
+def parse_trusted_list(content):
+    """Parse a trusted list, discarding any time-restriction labels.
+
+    The text is parsed by ``parse_time_restricted_list``; only its names are
+    kept.  Returns ``(names, {})`` so the result has the same shape as the
+    other parsers.
+    """
+    return parse_time_restricted_list(content)[0], {}
+
+
+def parse_list(content, trusted=False):
+    """Extract domain names from blocklist text in one of several layouts.
+
+    When ``trusted`` is true the text is handed to ``parse_trusted_list``
+    unchanged.  Otherwise each line is stripped and lower-cased, blank lines
+    and ``#`` comment lines are skipped, a trailing ``# comment`` is removed,
+    and the line is tried against these shapes:
+
+    * ``||name^`` with an optional ``$popup`` / ``$third-party`` suffix
+    * a bare ``name``
+    * ``<IPv4-like address> name``
+    * a quoted first field followed by ``"name",``
+    * ``name,<text>,<date-like field>,``
+    * ``address=/name/...``
+
+    Returns ``(names, time_restrictions)``; the dict is always empty for
+    untrusted input.
+    """
+    blank_or_comment = re.compile(r"^(#|$)")
+    trailing_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
+    patterns = (
+        re.compile(
+            r"^@*\|\|([a-z0-9][a-z0-9.-]*[.][a-z]{2,})\^?(\$(popup|third-party))?$"),
+        re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"),
+        re.compile(
+            r"^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"
+        ),
+        re.compile(r'^"[^"]+","([a-z0-9][a-z0-9.-]*[.][a-z]{2,})",'),
+        re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,}),.+,[0-9: /-]+,"),
+        re.compile(r"^address=/([a-z0-9][a-z0-9.-]*[.][a-z]{2,})/."),
+    )
+
+    if trusted:
+        return parse_trusted_list(content)
+
+    names = set()
+    for raw_line in content.splitlines():
+        text = raw_line.strip().lower()
+        if blank_or_comment.match(text):
+            continue
+        text = trailing_comment.sub("", text)
+        hits = (pattern.match(text) for pattern in patterns)
+        names.update(hit.group(1) for hit in hits if hit)
+    return names, {}
+
+
+def is_regex(line):
+    """Return True when ``line`` contains any of the characters ``*[]?}{``.
+
+    This is only a basic check for pattern-specific characters; it does not
+    validate the pattern.
+    """
+    for char in "*[]?}{":
+        if char in line:
+            return True
+    return False
+
+
+def parse_regex(names):
+    """Convert the wildcard entries in ``names`` to anchored Python regexes.
+
+    Entries without any pattern character (see ``is_regex``) are skipped.
+    For the rest, every ``.`` is escaped, every ``*`` becomes ``.*`` and the
+    result is wrapped in ``^...$``.  Patterns that fail to compile are
+    reported on stderr and left out.
+
+    Returns the set of valid regex strings.
+    """
+    regexes = set()
+    for line in names:
+        # skip lines without regex characters:
+        if not is_regex(line):
+            continue
+        # convert to python regex (raw string: "\." is not a valid str escape):
+        pattern = "^" + line.replace(".", r"\.").replace("*", ".*") + "$"
+        # check if resulting regex is valid:
+        try:
+            re.compile(pattern)
+        except re.error as err:
+            # Report the caught error itself, not the exception class.
+            sys.stderr.write("Invalid regex: {} [{}]\n".format(pattern, err))
+            continue
+        regexes.add(pattern)
+    return regexes
+
+
+def print_restricted_name(name, time_restrictions):
+    """Print one time-restricted entry, or a note when it has no label.
+
+    A name found in ``time_restrictions`` is printed as ``name<TAB>label``.
+    Any other name produces a ``# ignored`` comment line instead.
+    """
+    if name not in time_restrictions:
+        notice = (
+            "# ignored: [{}] was in the time-restricted list, "
+            "but without a time restriction label"
+        )
+        print(notice.format(name))
+        return
+    print("{}\t{}".format(name, time_restrictions[name]))
+
+
+def load_from_url(url):
+    """Fetch ``url`` and return ``(content, trusted)``.
+
+    ``trusted`` is True when the URL scheme is ``file``; for those sources
+    the HTTP status check is skipped.  The open timeout is read from the
+    module-level ``args.timeout``.  Under Python 3 the payload is decoded as
+    UTF-8 with undecodable bytes replaced.
+
+    Raises ``Exception`` when the URL cannot be opened, or when a
+    non-``file`` source answers with a status other than 200.
+    """
+    sys.stderr.write("Loading data from [{}]\n".format(url))
+    req = urllib.Request(url=url, headers={"User-Agent": "dnscrypt-proxy"})
+
+    # The scheme is an attribute on Python 3 requests and a getter in urllib2.
+    req_type = req.type if URLLIB_NEW else req.get_type()
+    trusted = req_type == "file"
+
+    try:
+        response = urllib.urlopen(req, timeout=int(args.timeout))
+    except urllib.URLError as err:
+        raise Exception("[{}] could not be loaded: {}\n".format(url, err))
+
+    # try/finally instead of ``with`` so the response is always closed and the
+    # code keeps working with urllib2 response objects.
+    try:
+        if trusted is False and response.getcode() != 200:
+            raise Exception("[{}] returned HTTP code {}\n".format(
+                url, response.getcode()))
+        content = response.read()
+    finally:
+        response.close()
+
+    if URLLIB_NEW:
+        content = content.decode("utf-8", errors="replace")
+
+    return (content, trusted)
+
+
+def name_cmp(name):
+    """Return ``name`` with its dot-separated labels in reverse order.
+
+    Used as a sort key so that names are ordered by their rightmost label
+    first (``www.example.com`` becomes ``com.example.www``).
+    """
+    return ".".join(reversed(name.split(".")))
+
+
+def has_suffix(names, name):
+    """Return True when a proper suffix of ``name`` is present in ``names``.
+
+    Leading labels are dropped one at a time, so ``a.b.c`` is checked as
+    ``b.c``, then ``c``, and finally the empty string.  The full name itself
+    is never tested.
+    """
+    labels = name.split(".")
+    return any(
+        ".".join(labels[start:]) in names
+        for start in range(1, len(labels) + 1)
+    )
+
+
+def covered_by_regex(line, regexes):
+    """Return True when the plain name ``line`` matches any collected regex.
+
+    Lines that are themselves patterns (see ``is_regex``) are never reported
+    as covered.
+    """
+    if is_regex(line):
+        return False
+    return any(re.match(regex, line) for regex in regexes)
+
+
+def allowlist_from_url(url):
+    """Load the allowlist at ``url`` and return its set of names.
+
+    An empty or missing ``url`` yields an empty set without loading anything.
+    """
+    if url:
+        content, trusted = load_from_url(url)
+        return parse_list(content, trusted)[0]
+    return set()
+
+
+def blocklists_from_config_file(
+    file, allowlist, time_restricted_url, ignore_retrieval_failure
+):
+    """Build the merged blocklist and print it to stdout.
+
+    ``file`` is a configuration file with one source URL per line; blank
+    lines and lines starting with ``#`` are skipped.  Each source is loaded
+    and parsed.  A source that fails is reported on stderr and then either
+    skipped (``ignore_retrieval_failure`` true) or ends the program with
+    exit status 1.
+
+    ``time_restricted_url`` and ``allowlist`` may be URLs or plain paths; a
+    value without a ``scheme:`` prefix is treated as a local file.  Names
+    from the time-restricted list are printed first and are also added to
+    the allowed names.
+
+    For every source, names are counted as duplicates when a parent domain
+    exists in any loaded list, when an earlier source already emitted them,
+    or when a wildcard entry matches them.  Names covered by the allowed
+    set are counted separately.  Whatever remains is printed, sorted by
+    reversed label order.
+    """
+    blocklists = {}
+    allowed_names = set()
+    all_regexes = set()
+    all_names = set()
+    unique_names = set()
+
+    # Load conf & blocklists
+    with open(file) as fd:
+        for line in fd:
+            line = line.strip()
+            if line.startswith("#") or line == "":
+                continue
+            url = line
+            try:
+                content, trusted = load_from_url(url)
+                names, _time_restrictions = parse_list(content, trusted)
+                blocklists[url] = names
+                all_names |= names
+                all_regexes |= parse_regex(names)
+
+            except Exception as e:
+                sys.stderr.write(str(e))
+                if not ignore_retrieval_failure:
+                    # sys.exit rather than the site-provided exit() helper,
+                    # which is not guaranteed to exist (e.g. python -S).
+                    sys.exit(1)
+
+    # Time-based blocklist
+    if time_restricted_url and not re.match(r"^[a-z0-9]+:", time_restricted_url):
+        time_restricted_url = "file:" + time_restricted_url
+
+    if time_restricted_url:
+        time_restricted_content, _trusted = load_from_url(time_restricted_url)
+        time_restricted_names, time_restrictions = parse_time_restricted_list(
+            time_restricted_content
+        )
+
+        if time_restricted_names:
+            print("########## Time-based blocklist ##########\n")
+            for name in time_restricted_names:
+                print_restricted_name(name, time_restrictions)
+
+        # Time restricted names should be allowed, or they could be always blocked
+        allowed_names |= time_restricted_names
+
+    # Allowed list
+    if allowlist and not re.match(r"^[a-z0-9]+:", allowlist):
+        allowlist = "file:" + allowlist
+
+    allowed_names |= allowlist_from_url(allowlist)
+
+    # Process blocklists
+    for url, names in blocklists.items():
+        print("\n\n########## Blocklist from {} ##########\n".format(url))
+        ignored, allowed = 0, 0
+        list_names = []
+        for name in names:
+            if (
+                has_suffix(all_names, name)
+                or name in unique_names
+                or covered_by_regex(name, all_regexes)
+            ):
+                ignored += 1
+            elif has_suffix(allowed_names, name) or name in allowed_names:
+                allowed += 1
+            else:
+                list_names.append(name)
+                unique_names.add(name)
+
+        list_names.sort(key=name_cmp)
+        if ignored:
+            print("# Ignored duplicates: {}\n".format(ignored))
+        if allowed:
+            print("# Ignored entries due to the allowlist: {}\n".format(allowed))
+        for name in list_names:
+            print(name)
+
+
+# Command-line entry point: parse the options, then write the merged
+# blocklist to stdout.
+argp = argparse.ArgumentParser(
+    description="Create a unified blocklist from a set of local and remote files"
+)
+argp.add_argument(
+    "-c",
+    "--config",
+    default="domains-blocklist.conf",
+    help="file containing blocklist sources",
+)
+argp.add_argument(
+    "-w",
+    "--whitelist",
+    help="Deprecated.  Please use -a or --allowlist",
+)
+argp.add_argument(
+    "-a",
+    "--allowlist",
+    default="domains-allowlist.txt",
+    help="file containing a set of names to exclude from the blocklist",
+)
+argp.add_argument(
+    "-r",
+    "--time-restricted",
+    default="domains-time-restricted.txt",
+    help="file containing a set of names to be time restricted",
+)
+argp.add_argument(
+    "-i",
+    "--ignore-retrieval-failure",
+    action="store_true",
+    help="generate list even if some urls couldn't be retrieved",
+)
+# type=int makes argparse reject a non-numeric timeout up front instead of
+# failing later, when the first URL is opened.
+argp.add_argument(
+    "-t", "--timeout", default=30, type=int, help="URL open timeout"
+)
+
+# load_from_url() reads the timeout from this module-level ``args``.
+args = argp.parse_args()
+
+whitelist = args.whitelist
+if whitelist:
+    # Report on stderr: stdout carries the generated list and is normally
+    # redirected to a file, where this notice would go unseen.
+    sys.stderr.write(
+        "Use of -w WHITELIST has been removed. Please use -a ALLOWLIST instead.\n"
+    )
+    sys.exit(1)
+
+conf = args.config
+allowlist = args.allowlist
+time_restricted = args.time_restricted
+ignore_retrieval_failure = args.ignore_retrieval_failure
+
+blocklists_from_config_file(
+    conf, allowlist, time_restricted, ignore_retrieval_failure)
+