From 94a9a95370e900b85027ba6e00a11c4a4b64c4f4 Mon Sep 17 00:00:00 2001
From: Adrian Miller
Date: Mon, 9 Jan 2023 16:22:31 +1100
Subject: [PATCH] Add generate-domains-blocklist.py

- sorts by domain by default
- the -s / --sort command-line argument allows sorting by TLD instead

---
 config/generate-domains-blocklist.py | 347 +++++++++++++++++++++++++++
 1 file changed, 347 insertions(+)
 create mode 100644 config/generate-domains-blocklist.py

diff --git a/config/generate-domains-blocklist.py b/config/generate-domains-blocklist.py
new file mode 100644
index 0000000..fc2d1da
--- /dev/null
+++ b/config/generate-domains-blocklist.py
@@ -0,0 +1,347 @@
+#! /usr/bin/env python3
+
+# Linux, run with: python3 generate-domains-blocklist.py > list.txt.tmp && mv -f list.txt.tmp list
+# Windows, run with: py generate-domains-blocklist.py > list.txt
+
+import argparse
+import re
+import sys
+
+# Names collected from every blocklist; sorted once and printed at the end.
+domain_list = []
+
+try:
+    import urllib2 as urllib
+
+    URLLIB_NEW = False
+except (ImportError, ModuleNotFoundError):
+    import urllib.request as urllib
+    from urllib.request import Request
+
+    URLLIB_NEW = True
+
+
+def parse_time_restricted_list(content):
+    rx_comment = re.compile(r"^(#|$)")
+    rx_inline_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
+    rx_trusted = re.compile(r"^([*a-z0-9.-]+)\s*(@\S+)?$")
+    rx_timed = re.compile(r".+\s*@\S+$")
+
+    names = set()
+    time_restrictions = {}
+    rx_set = [rx_trusted]
+    for line in content.splitlines():
+        line = str.lower(str.strip(line))
+        if rx_comment.match(line):
+            continue
+        line = rx_inline_comment.sub("", line)
+        for rx in rx_set:
+            matches = rx.match(line)
+            if not matches:
+                continue
+            name = matches.group(1)
+            names.add(name)
+            time_restriction = matches.group(2)
+            if time_restriction:
+                time_restrictions[name] = time_restriction
+    return names, time_restrictions
+
+
+def parse_trusted_list(content):
+    names, _time_restrictions = parse_time_restricted_list(content)
+    time_restrictions = {}
+    return names, time_restrictions
+
+
+def parse_list(content, trusted=False):
+    rx_comment = re.compile(r"^(#|$)")
+    rx_inline_comment = re.compile(r"\s*#\s*[a-z0-9-].*$")
+    rx_u = re.compile(
+        r"^@*\|\|([a-z0-9][a-z0-9.-]*[.][a-z]{2,})\^?(\$(popup|third-party))?$")
+    rx_l = re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$")
+    rx_lw = re.compile(r"^[*][.]([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$")
+    rx_h = re.compile(
+        r"^[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}\s+([a-z0-9][a-z0-9.-]*[.][a-z]{2,})$"
+    )
+    rx_mdl = re.compile(r'^"[^"]+","([a-z0-9][a-z0-9.-]*[.][a-z]{2,})",')
+    rx_b = re.compile(r"^([a-z0-9][a-z0-9.-]*[.][a-z]{2,}),.+,[0-9: /-]+,")
+    rx_dq = re.compile(r"^address=/([a-z0-9][a-z0-9.-]*[.][a-z]{2,})/.")
+
+    if trusted:
+        return parse_trusted_list(content)
+
+    names = set()
+    time_restrictions = {}
+    rx_set = [rx_u, rx_l, rx_lw, rx_h, rx_mdl, rx_b, rx_dq]
+    for line in content.splitlines():
+        line = str.lower(str.strip(line))
+        if rx_comment.match(line):
+            continue
+        line = rx_inline_comment.sub("", line)
+        for rx in rx_set:
+            matches = rx.match(line)
+            if not matches:
+                continue
+            name = matches.group(1)
+            names.add(name)
+    return names, time_restrictions
+
+
+# basic check whether the line contains any regex-specific character
+def is_regex(line):
+    regex_chars = "*[]?}{"
+    return any(char in line for char in regex_chars)
+
+
+def parse_regex(names):
+    regexes = set()
+    for line in names:
+        # skip lines without regex characters:
+        if not is_regex(line):
+            continue
+        # convert to a Python regex:
+        line = line.replace(".", r"\.")
+        line = line.replace("*", ".*")
+        line = "^" + line + "$"
+        # check that the resulting regex is valid:
+        try:
+            if re.compile(line):
+                regexes.add(line)
+        except re.error as err:
+            sys.stderr.write("Invalid regex: {} [{}]\n".format(line, err))
+            continue
+    return regexes
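+
+# Worked example (the entry is made up for illustration, and this stays in a
+# comment so it has no effect on the script): a wildcard entry such as
+# "*.example.com" passes through parse_regex() as
+#
+#     "*.example.com" -> "*\.example\.com" -> ".*\.example\.com"
+#
+# and is finally anchored to "^.*\.example\.com$", a valid Python regex
+# matching any name that ends in ".example.com".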
"^"+line+"$" + # check if resulting regex is valid: + try: + if re.compile(line): + regexes.add(line) + except re.error: + sys.stderr.write("Invalid regex: {} [{}]\n".format(line, re.error)) + continue + return regexes + + +def print_restricted_name(name, time_restrictions): + if name in time_restrictions: + print("{}\t{}".format(name, time_restrictions[name])) + else: + print( + "# ignored: [{}] was in the time-restricted list, " + "but without a time restriction label".format(name) + ) + + +def load_from_url(url): + sys.stderr.write("Loading data from [{}]\n".format(url)) + req = urllib.Request(url=url, headers={"User-Agent": "dnscrypt-proxy"}) + trusted = False + + if URLLIB_NEW: + req_type = req.type + else: + req_type = req.get_type() + if req_type == "file": + trusted = True + + response = None + try: + response = urllib.urlopen(req, timeout=int(args.timeout)) + except urllib.URLError as err: + raise Exception("[{}] could not be loaded: {}\n".format(url, err)) + if trusted is False and response.getcode() != 200: + raise Exception("[{}] returned HTTP code {}\n".format( + url, response.getcode())) + content = response.read() + if URLLIB_NEW: + content = content.decode("utf-8", errors="replace") + + return (content, trusted) + + +def name_cmp(name): + parts = name.split(".") + parts.reverse() + return str.join(".", parts) + + +def has_suffix(names, name): + parts = str.split(name, ".") + while parts: + parts = parts[1:] + if str.join(".", parts) in names: + return True + + return False + + +# check if a line matches with any of the collected regexes: +def covered_by_regex(line, regexes): + + # only check lines that aren't regexes themselves: + if not is_regex(line): + for regex in regexes: + if re.match(regex, line): + return True + + return False + + +def allowlist_from_url(url): + if not url: + return set() + content, trusted = load_from_url(url) + + names, _time_restrictions = parse_list(content, trusted) + return names + +#added sort arg +def blocklists_from_config_file( + file, allowlist, time_restricted_url, ignore_retrieval_failure, sort +): + blocklists = {} + allowed_names = set() + all_regexes = set() + all_names = set() + unique_names = set() + + # Load conf & blocklists + with open(file) as fd: + for line in fd: + line = str.strip(line) + if str.startswith(line, "#") or line == "": + continue + url = line + try: + content, trusted = load_from_url(url) + names, _time_restrictions = parse_list(content, trusted) + blocklists[url] = names + all_names |= names + all_regexes |= parse_regex(names) + + except Exception as e: + sys.stderr.write(str(e)) + if not ignore_retrieval_failure: + exit(1) + + # Time-based blocklist + if time_restricted_url and not re.match(r"^[a-z0-9]+:", time_restricted_url): + time_restricted_url = "file:" + time_restricted_url + + if time_restricted_url: + time_restricted_content, _trusted = load_from_url(time_restricted_url) + time_restricted_names, time_restrictions = parse_time_restricted_list( + time_restricted_content + ) + + if time_restricted_names: + print("\n# Time-based blocklist") + for name in time_restricted_names: + print_restricted_name(name, time_restrictions) + + # Time restricted names should be allowed, or they could be always blocked + allowed_names |= time_restricted_names + + # Allowed list + if allowlist and not re.match(r"^[a-z0-9]+:", allowlist): + allowlist = "file:" + allowlist + + allowed_names |= allowlist_from_url(allowlist) + + # Process blocklists + for url, names in blocklists.items(): + print("\n# Blocklist from 
[{}]".format(url)) + ignored, allowed = 0, 0 + list_names = list() + + for name in names: + if has_suffix(all_names, name) or name in unique_names or covered_by_regex(name, all_regexes): + ignored = ignored + 1 + elif has_suffix(allowed_names, name) or name in allowed_names: + allowed = allowed + 1 + else: + list_names.append(name) + unique_names.add(name) + + list_names.sort(key=name_cmp) + if ignored: + print("# Ignored duplicates: {}".format(ignored)) + if allowed: + print("# Ignored entries due to the allowlist: {}".format(allowed)) + # added list_names = sorted(set(list_names)) to sort domains + list_names = sorted(set(list_names)) + for name in list_names: + #commented out print(name) below + #print(name) + #added domain_list.append(name) below + domain_list.append(name) + + #added domain/tld sorting function below + data = [] + for x in domain_list: + d = x.strip().split('.') + d.reverse() + data.append(d) + #if -s tld / --sort tld commandline arg used then sort via tld + if args.sort == 'tld': + data.sort() + #otherwise sort via domain + else: + data.sort(key=lambda x: x[1:]) + for y in data: + y.reverse() + print('.'.join(y)) + + +argp = argparse.ArgumentParser( + description="Create a unified blocklist from a set of local and remote files" +) +argp.add_argument( + "-c", + "--config", + default="domains-blocklist.conf", + help="file containing blocklist sources", +) +argp.add_argument( + "-w", + "--whitelist", + help="Deprecated. Please use -a or --allowlist", +) +argp.add_argument( + "-a", + "--allowlist", + default="domains-allowlist.txt", + help="file containing a set of names to exclude from the blocklist", +) +argp.add_argument( + "-r", + "--time-restricted", + default="domains-time-restricted.txt", + help="file containing a set of names to be time restricted", +) +argp.add_argument( + "-i", + "--ignore-retrieval-failure", + action="store_true", + help="generate list even if some urls couldn't be retrieved", +) +argp.add_argument("-t", "--timeout", default=30, help="URL open timeout") + +#added argp.add_argument for sort commandline option +argp.add_argument( + "-s", + "--sort", + default="domain", + help="sort method, either: none = domain (default) or tld", +) + + +args = argp.parse_args() + +whitelist = args.whitelist +if whitelist: + print('Use of -w WHITELIST has been removed. Please use -a ALLOWLIST instead.') + exit(1) + +conf = args.config +allowlist = args.allowlist +time_restricted = args.time_restricted +ignore_retrieval_failure = args.ignore_retrieval_failure + +#added sort to arg list +sort = args.sort + +#added sort to arg list +blocklists_from_config_file( + conf, allowlist, time_restricted, ignore_retrieval_failure, sort)