Delete generate-domains-blocklist.py

2023-01-09 16:21:00 +11:00 · 2023-01-09 16:21:00 +11:00 · e5790e6e48
parent 96afabfd75
commit e5790e6e48
1 changed files with 0 additions and 341 deletions
--- a/config/generate-domains-blocklist.py
+++ b/config/generate-domains-blocklist.py
@ -1,341 +0,0 @@
-#! /usr/bin/env python3
-
-# Linux, run with: python3 generate-domains-blocklist.py > list.txt.tmp && mv -f list.txt.tmp list
-# Windows, run with: py generate-domains-blocklist.py > list.txt
-
-import argparse
-import re
-import sys
-
-#added list variable
-domain_list = []
-
-try:
-    import urllib2 as urllib
-
-    URLLIB_NEW = False
-except (ImportError, ModuleNotFoundError):
-    import urllib.request as urllib
-    from urllib.request import Request
-
-    URLLIB_NEW = True
-
-
-def parse_time_restricted_list(content):
-    rx_comment = re.compile(r"^(#|$)")
-    rx_inline_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
-    rx_trusted = re.compile(r"^([*a-z0-9.-]+)\s*(@\S+)?$")
-    rx_timed = re.compile(r".+\s*@\S+$")
-
-    names = set()
-    time_restrictions = {}
-    rx_set = [rx_trusted]
-    for line in content.splitlines():
-        line = str.lower(str.strip(line))
-        if rx_comment.match(line):
-            continue
-        line = rx_inline_comment.sub("", line)
-        for rx in rx_set:
-            matches = rx.match(line)
-            if not matches:
-                continue
-            name = matches.group(1)
-            names.add(name)
-            time_restriction = matches.group(2)
-            if time_restriction:
-                time_restrictions[name] = time_restriction
-    return names, time_restrictions
-
-
-def parse_trusted_list(content):
-    names, _time_restrictions = parse_time_restricted_list(content)
-    time_restrictions = {}
-    return names, time_restrictions
-
-
-def parse_list(content, trusted=False):
-    rx_comment = re.compile(r"^(#|$)")
-    rx_inline_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
-    rx_u = re.compile(
-        r"^@*\|\|([a-z0-9][a-z0-9.-]*[.][a-z]{2,})\^?(\$(popup|third-party))?$")
-    rx_l = re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$")
-    rx_lw = re.compile(r"^[*][.]([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$")
-    rx_h = re.compile(
-        r"^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"
-    )
-    rx_mdl = re.compile(r'^"[^"]+","([a-z0-9][a-z0-9.-]*[.][a-z]{2,})",')
-    rx_b = re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,}),.+,[0-9: /-]+,")
-    rx_dq = re.compile(r"^address=/([a-z0-9][a-z0-9.-]*[.][a-z]{2,})/.")
-
-    if trusted:
-        return parse_trusted_list(content)
-
-    names = set()
-    time_restrictions = {}
-    rx_set = [rx_u, rx_l, rx_lw, rx_h, rx_mdl, rx_b, rx_dq]
-    for line in content.splitlines():
-        line = str.lower(str.strip(line))
-        if rx_comment.match(line):
-            continue
-        line = rx_inline_comment.sub("", line)
-        for rx in rx_set:
-            matches = rx.match(line)
-            if not matches:
-                continue
-            name = matches.group(1)
-            names.add(name)
-    return names, time_restrictions
-
-
-# basic check if the line contains any regex specific char
-def is_regex(line):
-    regex_chars = "*[]?}{"
-    return any(char in line for char in regex_chars)
-
-
-def parse_regex(names):
-    regexes = set()
-    for line in names:
-        # skip lines without regex characters:
-        if not is_regex(line):
-            continue
-        # convert to python regex:
-        line=line.replace(".", "\.")
-        line=line.replace("*", ".*")
-        line = "^"+line+"$"
-        # check if resulting regex is valid:
-        try:
-            if re.compile(line):
-                regexes.add(line)
-        except re.error:
-            sys.stderr.write("Invalid regex: {} [{}]\n".format(line, re.error))
-            continue
-    return regexes
-
-
-def print_restricted_name(name, time_restrictions):
-    if name in time_restrictions:
-        print("{}\t{}".format(name, time_restrictions[name]))
-    else:
-        print(
-            "# ignored: [{}] was in the time-restricted list, "
-            "but without a time restriction label".format(name)
-        )
-
-
-def load_from_url(url):
-    sys.stderr.write("Loading data from [{}]\n".format(url))
-    req = urllib.Request(url=url, headers={"User-Agent": "dnscrypt-proxy"})
-    trusted = False
-
-    if URLLIB_NEW:
-        req_type = req.type
-    else:
-        req_type = req.get_type()
-    if req_type == "file":
-        trusted = True
-
-    response = None
-    try:
-        response = urllib.urlopen(req, timeout=int(args.timeout))
-    except urllib.URLError as err:
-        raise Exception("[{}] could not be loaded: {}\n".format(url, err))
-    if trusted is False and response.getcode() != 200:
-        raise Exception("[{}] returned HTTP code {}\n".format(
-            url, response.getcode()))
-    content = response.read()
-    if URLLIB_NEW:
-        content = content.decode("utf-8", errors="replace")
-
-    return (content, trusted)
-
-
-def name_cmp(name):
-    parts = name.split(".")
-    parts.reverse()
-    return str.join(".", parts)
-
-
-def has_suffix(names, name):
-    parts = str.split(name, ".")
-    while parts:
-        parts = parts[1:]
-        if str.join(".", parts) in names:
-            return True
-
-    return False
-
-
-# check if a line matches with any of the collected regexes:
-def covered_by_regex(line, regexes):
-
-    # only check lines that aren't regexes themselves:
-    if not is_regex(line):
-        for regex in regexes:
-            if re.match(regex, line):
-                return True
-
-    return False
-
-
-def allowlist_from_url(url):
-    if not url:
-        return set()
-    content, trusted = load_from_url(url)
-
-    names, _time_restrictions = parse_list(content, trusted)
-    return names
-
-
-def blocklists_from_config_file(
-    file, allowlist, time_restricted_url, ignore_retrieval_failure, sort
-):
-    blocklists = {}
-    allowed_names = set()
-    all_regexes = set()
-    all_names = set()
-    unique_names = set()
-
-    # Load conf & blocklists
-    with open(file) as fd:
-        for line in fd:
-            line = str.strip(line)
-            if str.startswith(line, "#") or line == "":
-                continue
-            url = line
-            try:
-                content, trusted = load_from_url(url)
-                names, _time_restrictions = parse_list(content, trusted)
-                blocklists[url] = names
-                all_names |= names
-                all_regexes |= parse_regex(names)
-
-            except Exception as e:
-                sys.stderr.write(str(e))
-                if not ignore_retrieval_failure:
-                    exit(1)
-
-    # Time-based blocklist
-    if time_restricted_url and not re.match(r"^[a-z0-9]+:", time_restricted_url):
-        time_restricted_url = "file:" + time_restricted_url
-
-    if time_restricted_url:
-        time_restricted_content, _trusted = load_from_url(time_restricted_url)
-        time_restricted_names, time_restrictions = parse_time_restricted_list(
-            time_restricted_content
-        )
-
-        if time_restricted_names:
-            print("\n# Time-based blocklist")
-            for name in time_restricted_names:
-                print_restricted_name(name, time_restrictions)
-
-        # Time restricted names should be allowed, or they could be always blocked
-        allowed_names |= time_restricted_names
-
-    # Allowed list
-    if allowlist and not re.match(r"^[a-z0-9]+:", allowlist):
-        allowlist = "file:" + allowlist
-
-    allowed_names |= allowlist_from_url(allowlist)
-
-    # Process blocklists
-    for url, names in blocklists.items():
-        print("\n# Blocklist from [{}]".format(url))
-        ignored, allowed = 0, 0
-        list_names = list()
-        
-        for name in names:
-            if has_suffix(all_names, name) or name in unique_names or covered_by_regex(name, all_regexes):
-                ignored = ignored + 1
-            elif has_suffix(allowed_names, name) or name in allowed_names:
-                allowed = allowed + 1
-            else:
-                list_names.append(name)
-                unique_names.add(name)
-
-        list_names.sort(key=name_cmp)
-        if ignored:
-            print("# Ignored duplicates: {}".format(ignored))
-        if allowed:
-            print("# Ignored entries due to the allowlist: {}".format(allowed))
-        # my mod to sort domains - 73sydney
-        list_names = sorted(set(list_names))    
-        for name in list_names:
-            #commented out below
-            #print(name)
-            #added below
-            domain_list.append(name)
-    #added below  
-    #for y in sorted([x.strip().split('.')[::-1] for x in domain_list]): print('.'.join(y[::-1]))
-    data = []
-    for x in domain_list:
-        d = x.strip().split('.')
-        d.reverse()
-        data.append(d)
-    if args.sort == 'tld':
-        data.sort()
-    else:
-        data.sort(key=lambda x: x[1:])
-    for y in data:
-        y.reverse()
-        print('.'.join(y))
-
-
-argp = argparse.ArgumentParser(
-    description="Create a unified blocklist from a set of local and remote files"
-)
-argp.add_argument(
-    "-c",
-    "--config",
-    default="domains-blocklist.conf",
-    help="file containing blocklist sources",
-)
-argp.add_argument(
-    "-w",
-    "--whitelist",
-    help="Deprecated.  Please use -a or --allowlist",
-)
-argp.add_argument(
-    "-a",
-    "--allowlist",
-    default="domains-allowlist.txt",
-    help="file containing a set of names to exclude from the blocklist",
-)
-argp.add_argument(
-    "-r",
-    "--time-restricted",
-    default="domains-time-restricted.txt",
-    help="file containing a set of names to be time restricted",
-)
-argp.add_argument(
-    "-i",
-    "--ignore-retrieval-failure",
-    action="store_true",
-    help="generate list even if some urls couldn't be retrieved",
-)
-argp.add_argument("-t", "--timeout", default=30, help="URL open timeout")
-
-argp.add_argument(
-    "-s",
-    "--sort",
-    default="domain",
-    help="sort method, either: none = domain (default) or tld",
-)
-
-
-args = argp.parse_args()
-
-whitelist = args.whitelist
-if whitelist:
-    print('Use of -w WHITELIST has been removed. Please use -a ALLOWLIST instead.')
-    exit(1)
-
-conf = args.config
-allowlist = args.allowlist
-time_restricted = args.time_restricted
-ignore_retrieval_failure = args.ignore_retrieval_failure
-sort = args.sort
-
-blocklists_from_config_file(
-    conf, allowlist, time_restricted, ignore_retrieval_failure, sort)