# From: M. Taylor Saotome-Westlake
# Date: Wed, 19 May 2021 03:37:54 +0000 (-0700)
# Subject: fun link-compiler script
# X-Git-Url: http://unremediatedgender.space/source?p=Ultimately_Untrue_Thought.git;a=commitdiff_plain;h=67046e4a33f562f614cf1b7ef952a39f63397f3b
#
# fun link-compiler script
#
# This might not be quite right—it looks like it only found two of the links
# to "Changing Emotions", when I had specifically counted 5 in "Sexual
# Dimorphism"? But, I can debug later if I really care; this was just
# something to peek at quickly.
#
# (File added by that commit as notes/link_compilation.py.)

import os
import re
import sys
from collections import Counter
from urllib.parse import urlparse

# Matches inline Markdown links: group 1 is the link text, group 2 the target.
# Based on https://davidwells.io/snippets/regex-match-markdown-links
# (slightly fixed to include `-` and others as valid, thx Laurin Neff in comments)
#
# Loosened relative to that snippet:
#   * link text may be anything except square brackets; the earlier
#     `[\w\s\d]+` rejected quotes, asterisks, apostrophes, hyphens, etc., so
#     links like ["Changing Emotions"](...) were skipped (presumably the
#     undercount mentioned in the note above -- not verified against the posts);
#   * the target may contain any non-space characters, including one level of
#     balanced parentheses (e.g. Wikipedia's `Foo_(bar)`), instead of a fixed
#     character whitelist that lacked `:`, `~`, `,`, `@`, ...;
#   * an optional double-quoted title after the target is tolerated.
# As before, only targets starting with `/`, `http://` or `https://` count.
url_regex = re.compile(
    r"\[([^\[\]]+)\]"
    r"\("
    r"((?:/|https?://)[^\s()]+(?:\([^\s()]*\)[^\s()]*)*)"
    r"(?:\s+\"[^\"]*\")?"
    r"\)"
)


def _count_links(root):
    """Tally Markdown links found in ``*.md`` files below *root*.

    Directories whose name contains "drafts" are not descended into.

    Returns a ``(text_counter, link_counter, domain_counter)`` tuple of
    ``Counter`` objects keyed by link text, link target and the target's
    network location respectively. Targets starting with ``/`` have no
    network location and are counted under the empty string.
    """
    text_counter = Counter()
    link_counter = Counter()
    domain_counter = Counter()
    for path, dirs, filenames in os.walk(root):
        # Prune in place so os.walk skips draft directories entirely. Testing
        # directory names (rather than the full path) keeps a root that merely
        # lives under a "...drafts..." path from excluding every file.
        # Sorting makes the traversal, and thus tie order in the printed
        # Counters, independent of filesystem listing order.
        dirs[:] = sorted(d for d in dirs if "drafts" not in d)
        for filename in sorted(filenames):
            if not filename.endswith(".md"):
                continue
            filepath = os.path.join(path, filename)
            # Explicit encoding: do not depend on the locale's default.
            with open(filepath, encoding="utf-8") as f:
                content = f.read()
            for match in url_regex.finditer(content):
                text, link = match.group(1), match.group(2)
                text_counter[text] += 1
                link_counter[link] += 1
                domain_counter[urlparse(link).netloc] += 1
    return text_counter, link_counter, domain_counter


def tree(root: str) -> None:
    """Print link statistics for the Markdown files under *root*.

    Writes four lines to standard output: the Counter of link texts, the
    Counter of link targets, the Counter of target domains, and the number
    of distinct domains (the empty domain of site-relative links included).

    Raises:
        UnicodeDecodeError: if a ``.md`` file is not valid UTF-8.
    """
    text_counter, link_counter, domain_counter = _count_links(root)
    print(text_counter)
    print(link_counter)
    print(domain_counter)
    print(len(domain_counter))


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: link_compilation.py ROOT_DIRECTORY")
    tree(sys.argv[1])