"""Tally the inline Markdown links found under a directory tree.

Usage: python link_compilation.py ROOT

Walks ROOT, reads every ``.md`` file that is not under a path containing
"drafts", and counts each inline link ``[text](target)`` by link text, by
target, and by the target's domain.

Provenance: added as notes/link_compilation.py by the patch "fun
link-compiler script" (commit 67046e4, M. Taylor Saotome-Westlake,
2021-05-18).  The commit message noted that the script seemed to undercount
(only two of five expected links to "Changing Emotions" were found).  The
original pattern accepted only word characters and whitespace in the link
text, so any link whose text contained punctuation was skipped; that is a
plausible cause of the undercount -- TODO confirm against the actual posts.
"""

import os
import re
import sys
from collections import Counter
from urllib.parse import urlparse

# Inline Markdown link: [text](target) or [text](target "title").
# Based on https://davidwells.io/snippets/regex-match-markdown-links, widened:
#   * group 1 (text) is anything except square brackets, so punctuation such
#     as quotes, apostrophes and hyphens no longer prevents a match;
#   * group 2 (target) must still start with "/" or "http(s)://", but may now
#     contain any non-space character, including one level of balanced
#     parentheses (e.g. ".../wiki/Foo_(bar)");
#   * an optional double-quoted title after the target is skipped.
url_regex = re.compile(
    r"\[([^\[\]]+)\]"
    r"\("
    r"((?:/|https?://)(?:[^\s()]|\([^\s()]*\))+)"
    r"(?:\s+\"[^\"]*\")?"
    r"\)"
)


def tree(root):
    """Count the inline Markdown links in every ``.md`` file under *root*.

    Files whose directory path (as produced by ``os.walk``, so including
    *root* itself) contains the substring "drafts" are skipped, as are files
    without a ``.md`` extension.

    Prints the link-text counter, the link-target counter, the domain counter
    and the number of distinct domains, in that order.  Site-relative targets
    such as ``/about/`` have no domain and are counted under the empty string.

    Returns:
        A tuple ``(text_counter, link_counter, domain_counter)`` of
        ``collections.Counter`` objects, so the tallies can be reused by
        callers instead of only being printed.
    """
    text_counter = Counter()
    link_counter = Counter()
    domain_counter = Counter()
    for path, dirs, filenames in os.walk(root):
        # Sorting in place fixes the traversal order, so ties in the printed
        # counters come out in the same order on every run.
        dirs.sort()
        if "drafts" in path:
            continue
        for filename in sorted(filenames):
            if not filename.endswith(".md"):
                continue
            filepath = os.path.join(path, filename)
            # Pin the encoding: the platform default is locale-dependent.
            with open(filepath, encoding="utf-8") as f:
                content = f.read()
            for match in url_regex.finditer(content):
                text, link = match.group(1), match.group(2)
                text_counter[text] += 1
                link_counter[link] += 1
                domain_counter[urlparse(link).netloc] += 1
    print(text_counter)
    print(link_counter)
    print(domain_counter)
    print(len(domain_counter))
    return text_counter, link_counter, domain_counter


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} ROOT")
    tree(sys.argv[1])