fun link-compiler script
[Ultimately_Untrue_Thought.git] / notes / link_compilation.py
diff --git a/notes/link_compilation.py b/notes/link_compilation.py
new file mode 100644 (file)
index 0000000..96a8669
--- /dev/null
@@ -0,0 +1,38 @@
+import os
+import re
+import sys
+from collections import Counter
+from urllib.parse import urlparse
+
+# Matches inline Markdown links of the form `[text](target)`.
+# Group 1 is the link text (word characters and whitespace only); group 2 is
+# the target, which must start with `/`, `http://` or `https://`.
+# Adapted from https://davidwells.io/snippets/regex-match-markdown-links
+# (slightly fixed to include `-` and others as valid, thx Laurin Neff in comments)
+url_regex = re.compile(r"\[([\w\s\d]+)\]\(((?:\/|https?:\/\/)[\w\d./?=#%+&-]+)\)")
+
+def tree(root):
+    text_counter = Counter()
+    link_counter = Counter()
+    domain_counter = Counter()
+    for path, _dirs, filenames in os.walk(root):
+        for filename in filenames:
+            if not filename.endswith(".md"):
+                continue
+            if "drafts" in path:
+                continue
+            filepath = os.path.join(path, filename)
+            with open(filepath) as f:
+                content = f.read()
+                matches = url_regex.finditer(content)
+                for match in matches:
+                    text = match.group(1)
+                    link = match.group(2)
+                    parsed = urlparse(link)
+                    text_counter[text] += 1
+                    link_counter[link] += 1
+                    domain_counter[parsed.netloc] += 1
+    print(text_counter)
+    print(link_counter)
+    print(domain_counter)
+    print(len(domain_counter))
+
+if __name__ == "__main__":
+    tree(sys.argv[1])