4 from collections import Counter
5 from urllib.parse import urlparse
7 # Source: https://davidwells.io/snippets/regex-match-markdown-links
8 # (slightly modified to also accept `-` and other characters as valid; thanks to Laurin Neff in the comments there)
# Matches inline markdown links of the form `[text](target)`.
#   group 1: the link text -- one or more word characters or whitespace, so
#            link text containing punctuation (e.g. `-` or `'`) is not matched.
#   group 2: the target -- must start with `/`, `http://` or `https://`,
#            followed by one or more word characters or any of `. / ? = # % + & -`.
# NOTE(review): `\d` is redundant in both character classes because `\w`
# already includes digits.
9 url_regex = re.compile(r"\[([\w\s\d]+)\]\(((?:\/|https?:\/\/)[\w\d./?=#%+&-]+)\)")
# Walks the directory tree under `root`, looks for markdown links in `.md`
# files, and tallies link texts, link targets and link domains.
# NOTE(review): this excerpt looks incomplete -- the leading numbers jump
# (e.g. 17 -> 21, 24 -> 28) and indentation is missing, so the enclosing
# definition, the body of the `.md` check, and the lines that bind `content`,
# `text` and `link` are presumably on lines not shown here. Confirm against
# the full file.
# One counter per dimension being tallied.
12 text_counter = Counter()
13 link_counter = Counter()
14 domain_counter = Counter()
# `root` is not defined in the visible lines -- presumably a parameter or a
# module-level path; TODO confirm.
15 for path, _dirs, filenames in os.walk(root):
16 for filename in filenames:
# Only files whose name ends in ".md" are of interest.
17 if not filename.endswith(".md"):
21 filepath = os.path.join(path, filename)
# No `encoding=` is passed, so the file is decoded with the platform's
# default text encoding.
22 with open(filepath) as f:
# `finditer` lazily yields one match object per link found in `content`.
24 matches = url_regex.finditer(content)
# Split the link target into URL components to get at its network location.
# presumably `text` / `link` are the two capture groups of a match -- verify.
28 parsed = urlparse(link)
29 text_counter[text] += 1
30 link_counter[link] += 1
# `netloc` is the empty string for targets without a `//host` part, so the
# `/`-rooted targets the regex also accepts would all be counted under "".
31 domain_counter[parsed.netloc] += 1
# Prints the number of distinct domains (keys), not the total number of links.
35 print(len(domain_counter))
37 if __name__ == "__main__":