From: M. Taylor Saotome-Westlake <ultimatelyuntruethought@gmail.com>
Date: Wed, 19 May 2021 03:37:54 +0000 (-0700)
Subject: fun link-compiler script
X-Git-Url: http://unremediatedgender.space/source?a=commitdiff_plain;h=67046e4a33f562f614cf1b7ef952a39f63397f3b;p=Ultimately_Untrue_Thought.git

fun link-compiler script

This might not be quite right—it looks like it only found two of the
links to "Changing Emotions", when I had specifically counted 5 in
"Sexual Dimorphism"? But, I can debug later if I really care; this was
just something to peek at quickly.
---

diff --git a/notes/link_compilation.py b/notes/link_compilation.py
new file mode 100644
index 0000000..96a8669
--- /dev/null
+++ b/notes/link_compilation.py
@@ -0,0 +1,38 @@
+import os
+import re
+import sys
+from collections import Counter
+from urllib.parse import urlparse
+
+# Inline Markdown links `[text](target)`, adapted from https://davidwells.io/snippets/regex-match-markdown-links
+# (target chars widened to include `-` etc., thx Laurin Neff in comments); text may be anything except brackets.
+url_regex = re.compile(r"\[([^\[\]]+)\]\(((?:\/|https?:\/\/)[\w\d./?=#%+&-]+)\)")
+
+def tree(root):
+    """Print link statistics for the non-draft Markdown files under `root`.
+
+    Prints, in order: a Counter of link texts, a Counter of link targets, a
+    Counter of target domains (netloc; "" for site-relative links), and the
+    number of distinct domains.
+    """
+    text_counter, link_counter, domain_counter = Counter(), Counter(), Counter()
+    for path, _dirs, filenames in os.walk(root):
+        if "drafts" in path:  # substring test on the whole directory path
+            continue
+        for filename in filenames:
+            if not filename.endswith(".md"):
+                continue
+            # Explicit UTF-8: the locale default can mis-decode or raise on non-ASCII posts.
+            with open(os.path.join(path, filename), encoding="utf-8") as f:
+                content = f.read()
+            for text, link in url_regex.findall(content):
+                text_counter[text] += 1
+                link_counter[link] += 1
+                domain_counter[urlparse(link).netloc] += 1
+    print(text_counter)
+    print(link_counter)
+    print(domain_counter)
+    print(len(domain_counter))
+
+if __name__ == "__main__":
+    tree(sys.argv[1])  # first CLI argument is the root directory to scan; IndexError if it is omitted