""" Link-Checker
Author: Finn LeSueur
Last Modified: 11/04/2020
"""
import configparser
import re
import sys
from datetime import datetime
from email.mime.text import MIMEText
from subprocess import Popen, PIPE

import requests


def main():
    """ Control the script. """
    start_time = datetime.now()
    config = configparser.ConfigParser()
    config.read("config")
    # Use a sitemap URL passed on the command line, falling back to the config.
    if len(sys.argv) == 2:
        sitemap = sys.argv[1]
    else:
        sitemap = config["default"]["sitemap_url"]
    raw = requests.get(sitemap)
    # Accumulates every URL's status plus a per-page good/bad breakdown.
    responses = {"all_urls": {}, "pages": {}}
    pages = parse_sitemap(raw.text)
    for page in pages:
        urls = get_text_of(page)
        responses = check_page_anchors(page, urls, responses)
    output(responses, start_time, config)

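# A minimal sketch of the INI-style "config" file that main() reads. The
# section and key names come from the lookups in this script; the values
# are placeholders, not the author's actual configuration:
#
#   [default]
#   sitemap_url = https://example.com/sitemap.xml
#   email_log = yes
#   from_email = checker@example.com
#   to_email = you@example.com
#   email_subject = Link-Checker report
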
def parse_sitemap(sitemap):
    """ Parses a sitemap as text
    for all page URLs and returns them.
    """
    # Non-greedy match so a minified single-line sitemap still yields
    # one URL per <loc> element.
    return re.findall(r'<loc>(.*?)</loc>', sitemap)

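# A quick illustration of parse_sitemap() on a hand-written fragment
# (hypothetical URLs, shown only to make the expected output concrete):
#
#   >>> parse_sitemap("<urlset><loc>https://example.com/</loc>"
#   ...               "<loc>https://example.com/about/</loc></urlset>")
#   ['https://example.com/', 'https://example.com/about/']
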
def get_text_of(page):
    """ Takes a URL, requests that URL,
    and returns the anchor URLs found in its HTML.
    """
    raw = requests.get(page)
    return get_html_anchors_of(raw.text)

def get_html_anchors_of(text):
    """ Finds and returns all URLs from
    anchor elements in HTML text.
    """
    # Non-greedy match so multiple anchors on one line are each captured.
    return re.findall(r'<a.*?href="(\S*)".*?>.*?</a>', text)

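# Regex-based HTML scraping is brittle (single-quoted attributes, multi-line
# tags and unusual attribute order all break it). Below is a self-contained
# alternative built on the standard library's html.parser; it is not part of
# the original script, just a sketch of a drop-in option:
from html.parser import HTMLParser


class _AnchorCollector(HTMLParser):
    """ Collects href attribute values from <a> tags. """
    def __init__(self):
        super().__init__()
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs; value may be None.
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.hrefs.append(value)


def get_html_anchors_of_robust(text):
    """ Parser-based equivalent of get_html_anchors_of(). """
    collector = _AnchorCollector()
    collector.feed(text)
    return collector.hrefs
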
def check_page_anchors(page, links, responses):
    """ Checks the status of each URL and
    diligently fills out the dictionary
    containing all the responses.
    """
    p = dict()
    p["good"] = 0
    p["bad"] = 0
    p["errors"] = dict()
    for link in links:
        # Only request URLs that haven't already been checked.
        if link not in responses["all_urls"]:
            try:
                raw = requests.get(link)
                responses["all_urls"][link] = raw.status_code
            except requests.exceptions.RequestException:
                responses["all_urls"][link] = "unknown"
        if responses["all_urls"][link] == requests.codes.ok:
            p["good"] += 1
        else:
            p["bad"] += 1
            p["errors"][link] = responses["all_urls"][link]
    responses["pages"][page] = p
    return responses

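# Shape of the accumulator that check_page_anchors() builds up
# (illustrative values only):
#
#   responses = {
#       "all_urls": {"https://example.com/": 200,
#                    "https://example.com/gone": 404},
#       "pages": {"https://example.com/": {
#           "good": 1,
#           "bad": 1,
#           "errors": {"https://example.com/gone": 404},
#       }},
#   }
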
def output(responses, start_time, config):
    """ Reads through the dictionary of
    responses and makes a nice output
    to send to a file.
    """
    passed = 0
    failed = 0
    errors = "Pages with Errors (status code & URL):\n"
    okays = "\nPages without Errors:\n"
    for page in responses["pages"]:
        if responses["pages"][page]["bad"] == 0:
            passed += 1
            okays += " {0}\n".format(page)
        else:
            failed += 1
            errors += "\n{0}\n".format(page)
            for url in responses["pages"][page]["errors"]:
                errors += " ({0}): {1}\n".format(responses["pages"][page]["errors"][url], url)
    now = datetime.now()
    # Timestamp for the report filename, e.g. 2020-04-11-153045.
    dt_string = now.strftime("%Y-%m-%d-%H%M%S")
    with open("reports/{0}.txt".format(dt_string), 'w') as file:
        file.write("Summary\n Check took {2:.2f}s\n Pages with Errors: {0}\n Pages without Errors: {1}\n\n".format(failed, passed, (datetime.now() - start_time).total_seconds()))
        file.write(errors)
        file.write(okays)
if config["default"]["email_log"]:
with open("reports/{0}.txt".format(dt_string), 'r') as content_file:
content = content_file.read()
msg = MIMEText(content)
msg["From"] = config["default"]["from_email"]
msg["To"] = config["default"]["to_email"]
msg["Subject"] = config["default"]["email_subject"]
p = Popen(["/usr/sbin/sendmail", "-t", "-oi"], stdin=PIPE)
p.communicate(msg.as_string().encode())
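# A sketch of the report that output() writes to reports/<timestamp>.txt
# (hypothetical pages and timing, to show the layout the strings above
# produce):
#
#   Summary
#    Check took 12.41s
#    Pages with Errors: 1
#    Pages without Errors: 1
#
#   Pages with Errors (status code & URL):
#
#   https://example.com/blog/
#    (404): https://example.com/old-post/
#
#   Pages without Errors:
#    https://example.com/
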
if __name__ == "__main__":
    main()
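
# Usage sketch (assumes Python 3, the requests package, a sendmail binary
# for the optional email step, and an existing reports/ directory):
#
#   python3 link-checker.py                          # sitemap_url from config
#   python3 link-checker.py https://example.com/sitemap.xml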