X-Git-Url: http://unremediatedgender.space/source?p=Ultimately_Untrue_Thought.git;a=blobdiff_plain;f=notes%2Flink_compilation.py;fp=notes%2Flink_compilation.py;h=96a866925b6794e01ff5d9b282d0206d4fd78861;hp=0000000000000000000000000000000000000000;hb=67046e4a33f562f614cf1b7ef952a39f63397f3b;hpb=edafbcbd76c20f1615e73082f020321fec41256a

diff --git a/notes/link_compilation.py b/notes/link_compilation.py
new file mode 100644
index 0000000..96a8669
--- /dev/null
+++ b/notes/link_compilation.py
@@ -0,0 +1,55 @@
+import os
+import re
+import sys
+from collections import Counter
+from urllib.parse import urlparse
+
+# https://davidwells.io/snippets/regex-match-markdown-links
+# (slightly fixed to include `-` and others as valid, thx Laurin Neff in comments)
+# Group 1 is the link text (word characters and whitespace only, so links
+# whose text contains punctuation are not matched); group 2 is the target,
+# which must start with `/` or `http(s)://`.
+url_regex = re.compile(r"\[([\w\s\d]+)\]\(((?:\/|https?:\/\/)[\w\d./?=#%+&-]+)\)")
+
+def tree(root):
+    """Tally Markdown links under *root* and print the tallies.
+
+    Walks *root* recursively and scans every ``.md`` file whose directory
+    path does not contain the substring ``drafts``. Prints, in order: a
+    Counter of link texts, a Counter of link targets, a Counter of target
+    domains, and the number of distinct domains.
+
+    Site-relative targets such as ``/about`` have no network location, so
+    they are tallied under the empty-string domain.
+    """
+    text_counter = Counter()
+    link_counter = Counter()
+    domain_counter = Counter()
+    for path, _dirs, filenames in os.walk(root):
+        # NOTE(review): substring test on the whole path, so it also skips
+        # e.g. `old_drafts/`, or everything when *root* itself contains
+        # "drafts" -- confirm that is intended.
+        if "drafts" in path:
+            continue
+        for filename in filenames:
+            if not filename.endswith(".md"):
+                continue
+            filepath = os.path.join(path, filename)
+            # Decode explicitly as UTF-8: the default is locale-dependent and
+            # can fail or mis-decode non-ASCII posts on some platforms.
+            with open(filepath, encoding="utf-8") as f:
+                content = f.read()
+            for match in url_regex.finditer(content):
+                text, link = match.groups()
+                text_counter[text] += 1
+                link_counter[link] += 1
+                domain_counter[urlparse(link).netloc] += 1
+    print(text_counter)
+    print(link_counter)
+    print(domain_counter)
+    print(len(domain_counter))
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit("usage: link_compilation.py ROOT_DIR")
+    tree(sys.argv[1])