notes/link_compilation.py

   1 import os
   2 import re
   3 import sys
   4 from collections import Counter
   5 from urllib.parse import urlparse
   6
# Pattern for inline Markdown links of the form `[text](target)`, adapted from
# https://davidwells.io/snippets/regex-match-markdown-links
# (slightly fixed to include `-` and others as valid, thanks to Laurin Neff in
# the comments).
#   group 1: the link text -- word characters and whitespace only, so a link
#            whose text contains punctuation is not matched.
#   group 2: the link target -- must start with `/` or `http://` / `https://`,
#            followed by one or more word characters or any of
#            `.` `/` `?` `=` `#` `%` `+` `&` `-`.
url_regex = re.compile(r"\[([\w\s\d]+)\]\(((?:\/|https?:\/\/)[\w\d./?=#%+&-]+)\)")
  10
  11 def tree(root):
  12     text_counter = Counter()
  13     link_counter = Counter()
  14     domain_counter = Counter()
  15     for path, _dirs, filenames in os.walk(root):
  16         for filename in filenames:
  17             if not filename.endswith(".md"):
  18                 continue
  19             if "drafts" in path:
  20                 continue
  21             filepath = os.path.join(path, filename)
  22             with open(filepath) as f:
  23                 content = f.read()
  24                 matches = url_regex.finditer(content)
  25                 for match in matches:
  26                     text = match.group(1)
  27                     link = match.group(2)
  28                     parsed = urlparse(link)
  29                     text_counter[text] += 1
  30                     link_counter[link] += 1
  31                     domain_counter[parsed.netloc] += 1
  32     print(text_counter)
  33     print(link_counter)
  34     print(domain_counter)
  35     print(len(domain_counter))
  36
  37 if __name__ == "__main__":
  38     tree(sys.argv[1])