check in: back to it
[Ultimately_Untrue_Thought.git] / notes / link_compilation.py
1 import os
2 import re
3 import sys
4 from collections import Counter
5 from urllib.parse import urlparse
6
# Matches Markdown inline links of the form [text](target).
#   group 1: link text -- one or more word characters and/or whitespace only,
#            so link text containing punctuation is not matched.
#   group 2: link target -- begins with "/" or "http://" / "https://",
#            followed by word characters and any of . / ? = # % + & -
# Adapted from https://davidwells.io/snippets/regex-match-markdown-links
# (slightly fixed to include `-` and others as valid, thx Laurin Neff in comments)
url_regex = re.compile(r"\[([\w\s\d]+)\]\(((?:\/|https?:\/\/)[\w\d./?=#%+&-]+)\)")
10
def tree(root):
    """Tally Markdown inline links found under *root* and print the counts.

    Walks ``root`` recursively and reads every ``.md`` file whose directory
    path does not contain the substring ``"drafts"``. Each match of the
    module-level ``url_regex`` is counted three ways: by link text, by link
    target, and by the target's network location as reported by
    ``urlparse`` (the empty string for site-relative targets such as
    ``/foo``).

    Prints, in order: the link-text Counter, the link-target Counter, the
    domain Counter, and the number of distinct domains.

    Args:
        root: Directory to search.

    Returns:
        None; the results are written to standard output.

    Raises:
        UnicodeDecodeError: If a Markdown file is not valid UTF-8.
    """
    text_counter = Counter()
    link_counter = Counter()
    domain_counter = Counter()
    for path, _dirs, filenames in os.walk(root):
        # Substring test against the whole directory path, so everything
        # beneath a "drafts" directory (or any path merely containing that
        # text) is skipped. The path is fixed per directory, so check it
        # once here rather than once per file.
        if "drafts" in path:
            continue
        for filename in filenames:
            if not filename.endswith(".md"):
                continue
            filepath = os.path.join(path, filename)
            # Decode explicitly as UTF-8 instead of relying on the locale's
            # default encoding, which varies between machines.
            with open(filepath, encoding="utf-8") as f:
                content = f.read()
            # The file is already closed here; matching only needs the text.
            for match in url_regex.finditer(content):
                text, link = match.group(1, 2)
                text_counter[text] += 1
                link_counter[link] += 1
                domain_counter[urlparse(link).netloc] += 1
    print(text_counter)
    print(link_counter)
    print(domain_counter)
    print(len(domain_counter))
36
if __name__ == "__main__":
    # The root directory to scan is the first command-line argument; any
    # further arguments are ignored. Exit with a usage message instead of an
    # IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: {} ROOT_DIR".format(sys.argv[0]))
    tree(sys.argv[1])