"""Tally the Markdown links found in a tree of ``.md`` files.

Reconstructed from the "new file" diff of ``notes/link_compilation.py``.

Usage: python link_compilation.py ROOT
"""

import os
import re
import sys
from collections import Counter
from urllib.parse import urlparse

# https://davidwells.io/snippets/regex-match-markdown-links
# (slightly fixed to include `-` and others as valid, thx Laurin Neff in comments)
# Group 1 is the link text; group 2 is the link target, which must begin with
# either `/` or `http://` / `https://`.
url_regex = re.compile(r"\[([\w\s\d]+)\]\(((?:\/|https?:\/\/)[\w\d./?=#%+&-]+)\)")


def tree(root):
    """Count Markdown links under *root* and print the tallies.

    Walks *root* with ``os.walk``, reads every file whose name ends in
    ``.md`` (skipping any directory whose path contains the substring
    ``drafts``), and counts each match of ``url_regex`` by link text, by
    link target, and by the target's network location.

    Prints, in order: the link-text counter, the link-target counter, the
    domain counter, and the number of distinct domains.  Returns ``None``.

    Args:
        root: Directory to search.
    """
    text_counter = Counter()
    link_counter = Counter()
    domain_counter = Counter()
    for path, _dirs, filenames in os.walk(root):
        # The test depends only on the directory, so do it once per directory
        # rather than once per file.  It is a substring match on the whole
        # path (including *root* itself), not a path-component match.
        if "drafts" in path:
            continue
        for filename in filenames:
            if not filename.endswith(".md"):
                continue
            filepath = os.path.join(path, filename)
            # Decode explicitly as UTF-8 so the result does not depend on the
            # locale's default encoding.
            with open(filepath, encoding="utf-8") as f:
                content = f.read()
            for match in url_regex.finditer(content):
                text = match.group(1)
                link = match.group(2)
                text_counter[text] += 1
                link_counter[link] += 1
                # Site-relative links (starting with `/`) have an empty
                # netloc, so they are all tallied under the key "".
                domain_counter[urlparse(link).netloc] += 1
    print(text_counter)
    print(link_counter)
    print(domain_counter)
    print(len(domain_counter))


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: link_compilation.py ROOT")
    tree(sys.argv[1])