fun link-compiler script
author M. Taylor Saotome-Westlake <ultimatelyuntruethought@gmail.com>
Wed, 19 May 2021 03:37:54 +0000 (20:37 -0700)
committer M. Taylor Saotome-Westlake <ultimatelyuntruethought@gmail.com>
Wed, 19 May 2021 03:43:39 +0000 (20:43 -0700)
This might not be quite right—it looks like it only found two of the
links to "Changing Emotions", when I had specifically counted 5 in
"Sexual Dimorphism"? But, I can debug later if I really care; this was
just something to peek at quickly.

notes/link_compilation.py [new file with mode: 0644]

diff --git a/notes/link_compilation.py b/notes/link_compilation.py
new file mode 100644 (file)
index 0000000..96a8669
--- /dev/null
@@ -0,0 +1,38 @@
+import os
+import re
+import sys
+from collections import Counter
+from urllib.parse import urlparse
+
+# Matches inline Markdown links of the form `[text](url)` or `[text](url "title")`.
+# Adapted from https://davidwells.io/snippets/regex-match-markdown-links
+# (thx Laurin Neff in the comments there for the URL-character fix).
+#
+# Group 1 is the link text: anything except square brackets, so that text
+# containing quotes, apostrophes, hyphens, or other punctuation still counts
+# (a `[\w\s\d]+` text class silently skips links like `["Some Title"](...)`).
+# Group 2 is the URL: site-relative (`/...`) or `http(s)://...`, containing no
+# whitespace and at most one level of balanced parentheses. An optional
+# double-quoted title after the URL is allowed but not captured.
+url_regex = re.compile(
+    r'\[([^\[\]]+)\]'
+    r'\(((?:/|https?://)(?:[^\s()]|\([^\s()]*\))+)(?:\s+"[^"]*")?\)'
+)
+
+def tree(root):
+    """Tally the Markdown links found under `root` and print the tallies.
+
+    Walks `root` recursively, reads every `.md` file whose directory path
+    does not contain the substring "drafts", and counts each `url_regex`
+    match three ways: by link text, by full URL, and by the URL's network
+    location. Prints the three counters, then the number of distinct
+    network locations.
+
+    Site-relative links (`/...`) have an empty network location, so they are
+    all tallied under the "" key, which is also included in the final count.
+    """
+    text_counter = Counter()
+    link_counter = Counter()
+    domain_counter = Counter()
+    for path, _dirs, filenames in os.walk(root):
+        # The check depends only on the directory, so do it once per
+        # directory rather than once per file.
+        if "drafts" in path:
+            continue
+        for filename in filenames:
+            if not filename.endswith(".md"):
+                continue
+            filepath = os.path.join(path, filename)
+            # Decode explicitly as UTF-8 so the tallies do not depend on the
+            # platform's locale default encoding.
+            with open(filepath, encoding="utf-8") as f:
+                content = f.read()
+            # The file is closed by now; scanning only needs the text.
+            for match in url_regex.finditer(content):
+                text, link = match.group(1, 2)
+                text_counter[text] += 1
+                link_counter[link] += 1
+                domain_counter[urlparse(link).netloc] += 1
+    print(text_counter)
+    print(link_counter)
+    print(domain_counter)
+    print(len(domain_counter))
+
+if __name__ == "__main__":
+    # The directory to scan is the first command-line argument; any further
+    # arguments are ignored.
+    if len(sys.argv) < 2:
+        # Exit with a usage message instead of an IndexError traceback.
+        sys.exit(f"usage: {sys.argv[0]} ROOT_DIRECTORY")
+    tree(sys.argv[1])