# securitytxt.py
import requests
import os
import csv
import logging
import favicon
import socket
import urllib3
from urllib.parse import urlparse
from urllib.request import urlopen
from urllib.error import URLError
from bs4 import BeautifulSoup
from datetime import datetime, timezone

# keep silent! see https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings for details
urllib3.disable_warnings()

logging.basicConfig(
    filename='error.log',
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S')
# Extract information from the response after calling the URL,
# primarily to get the site title for URLs coming from txt-file input
def extract_information_from_url(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.title.string if soup.title else None
    except requests.RequestException as e:
        logging.error(f"Error extracting information from {url}: {e}")
        return None
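# Illustrative usage (sketch, not part of the original script; example.org is a placeholder):
#   extract_information_from_url("https://example.org/")
#   -> "Example Domain"  (the contents of the page's <title> tag)
#   -> None              (if the page has no <title> or the request fails)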
# Extract information (title + url) from a csv file
def csv_to_results(input_file):
    with open(input_file, 'r', newline='', encoding='utf-8') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=';')
        # Skip the header
        header = next(csv_reader)
        results = {}
        for row in csv_reader:
            if len(row) >= 2:
                # Remove leading/trailing whitespace
                url = row[1].strip()
                # Check if the url in the file is well-formed, the domain is registered and the scheme is https or http
                if is_valid_url(url) and is_existing_domain(url) and https_or_http(url):
                    title = row[0].strip()
                    results[url] = {'Title': title, 'BASEURL': url}
                else:
                    logging.error(f"Something is wrong with this URL (csv source): {url}")
        #print ("results after csv read\n", results)
        return results
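# Expected input layout (sketch; the header row is skipped, columns are separated by ';'):
#   Title;URL
#   Example Site;https://example.org/
# which, assuming the URL passes all three checks, would yield:
#   {'https://example.org/': {'Title': 'Example Site', 'BASEURL': 'https://example.org/'}}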
# Extract information (url) from a txt file
def txt_to_results(file_path):
    results = {}
    with open(file_path, 'r') as file:
        for line in file:
            url = line.strip()
            # Check if the url in the file is well-formed, the domain is registered and the scheme is https or http
            if is_valid_url(url) and is_existing_domain(url) and https_or_http(url):
                title = extract_information_from_url(url)
                results[url] = {'Title': title, 'BASEURL': url}
            else:
                logging.error(f"Invalid URL {url} in {file_path}")
    return results
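# Illustrative input (sketch): a plain text file with one URL per line, e.g.
#   https://example.org/
#   https://example.net/
# The title is then fetched from each page via extract_information_from_url().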
# Check whether the url is valid or not
def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError as e:
        logging.error(f"Malformed URL?: {url} - {e}")
        return False
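# Note: this only checks that a scheme and a network location are present, e.g.
#   is_valid_url("https://example.org/")  -> True
#   is_valid_url("example.org")           -> False (no scheme, so netloc stays empty)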
# Check if the domain is registered or not
def is_existing_domain(url):
    hostname = urlparse(url).netloc
    try:
        h = socket.gethostbyname(hostname)
        return True
    except socket.gaierror:
        # DNS lookup failed, hostname cannot be resolved
        logging.error(f"Domain not existing: {hostname} - please correct the input")
        return False
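# Note: this only verifies that the hostname resolves in DNS; it does not check
# whether a web server is actually answering (https_or_http below does that).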
# Check if the given URL is reachable; if not, try plain http and only log an error if both schemes fail
def https_or_http(url):
    hostname = urlparse(url).netloc
    http_scheme = "http://"
    http_url = http_scheme + hostname
    try:
        # verify=False avoids failing on broken TLS setups; urllib3.disable_warnings() at the top of the file silences the resulting warnings
        response = requests.get(url, timeout=6, allow_redirects=True, verify=False)
        parsed_response = urlparse(response.url)
        return all([parsed_response.scheme, parsed_response.netloc])
    except requests.RequestException:
        try:
            response_http = requests.get(http_url, timeout=12, allow_redirects=False)
            parsed_response_http = urlparse(response_http.url)
            logging.info(f"No https: {http_url}")
            return http_url
        except requests.RequestException as e:
            logging.error(f"Something went wrong - no http, no https: {hostname} - {e}")
            return False
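# Note on the return values (the callers only use the result in a boolean context):
#   True            - the original URL answered (possibly after redirects)
#   the http:// URL - only the plain-http variant answered (truthy)
#   False           - neither scheme answered
# e.g. https_or_http("https://example.org/") -> True, assuming the host is reachable.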
# Check whether the page is successfully loaded or not
def check_page_not_found(url):
    try:
        response = requests.get(url)
        # Raise an exception for HTTP errors (4xx and 5xx status codes)
        response.raise_for_status()
        # Check if the text "page not found" or "seite nicht gefunden" is present in the response content.
        # This is needed because some CMS don't deliver HTTP error status codes but redirect to error pages.
        # TODO NTH: maybe a more elegant solution is possible?
        page_not_found_texts = ["page not found", "seite nicht gefunden"]
        for text in page_not_found_texts:
            if text.lower() in response.text.lower():
                # Text "Page not found" is present
                return False
        if "contact" not in response.text.lower():
            return False
        # Page is accessible
        return True
    except requests.RequestException as e:
        return False
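# Note (heuristic, as implemented above): a page only counts as "found" if it returns a
# non-error status, contains neither "page not found" nor "seite nicht gefunden",
# and does contain the word "contact" somewhere in its body.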
# Extract the text of the website's security.txt if it exists
def check_security_txt(url):
    # Define the paths to check for security.txt
    paths_to_check = [".well-known/security.txt", "security.txt"]
    for path in paths_to_check:
        try:
            response = requests.get(url + path, allow_redirects=False)
            # print(url + path, response.status_code)
            # The extra check_page_not_found() call catches CMS that answer 200 with an error page instead of a real security.txt
            if response.status_code == 200 and check_page_not_found(url + path):
                return response.text
        except requests.RequestException as e:
            logging.error(f"Error checking {path} for {url}: {e}")
    return None
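# Illustrative usage (sketch; example.org is a placeholder). The path is appended by
# plain string concatenation, so the base URL is expected to end with '/':
#   check_security_txt("https://example.org/")
#   -> the raw text of https://example.org/.well-known/security.txt (or of /security.txt),
#      or None if neither path yields a usable response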
# Generate the html file for the security.txt results
def generate_html_report(results, filename, output_file, template_file="templates/page_template.html"):
    with open(template_file, 'r') as template:
        template_content = template.read()
    with open(output_file, 'w') as file:
        file.write(template_content.replace('{report_title}', generate_report_title(filename))
                                   .replace('{report_content}', generate_card_content(results)))

# Generate the title for each report
def generate_report_title(filename):
    return f"<h1 style='text-align: center; font-family: Arial, sans-serif;'>Security.txt Dashboard for {filename}</h1>"
# Generate the result for each url as a card
def generate_card_content(results):
    card_content = '<div class="grid" data-masonry=\'{ "itemSelector": ".grid-item" }\'>'
    # TODO: check that this results dict really carries a title, not just the URL and the security.txt status
    # print ("results before card generation", results)
    for url, result in results.items():
        favicon_url = get_favicon_url(url)
        alert_class = "alert-success" if result.get('SecurityTxt') is not None else "alert-warning"
        if url.endswith('.txt'):
            # For .txt files, use the URL as the card title
            card_title = f"<h5 class='card-title'><a href='{url}' target='_blank' rel='noopener'>{url}</a></h5>"
            print("set card in txt file")
        elif url.endswith('.csv'):
            # For .csv files, use the title from the CSV as the card title
            card_title = f"<h5 class='card-title'>{result.get('Title', 'Unknown')}</h5>"
            print("set card in csv file")
        else:
            # By default, use the title (linked to the URL) as the card title
            #card_title = f"<h5 class='card-title'><a href='{url}' target='_blank' rel='noopener'>{url}</a></h5>"
            card_title = f"<h5 class='card-title'><a href='{url}' target='_blank' rel='noopener'>{result.get('Title', 'Unknown')}</a></h5>"
            #print ("set card in some file")
        card_content += f"""
        <div class="grid-item alert {alert_class}" role="alert">
            <div class="card text-center">
                <div class="d-flex align-items-center justify-content-center">
                    <img class="card-img-top" src="{favicon_url}" alt="Card image cap" width="32" height="32" style="max-width: 32px; max-height: 32px; margin-top: 5px; margin-bottom: 2px;">
                </div>
                <div class="card-body">
                    {card_title}
                    <ul class="list-group list-group-flush">
        """
        if 'SecurityTxt' in result and result['SecurityTxt'] is not None:
            card_content += f"<li class='list-group-item'><pre>{result['SecurityTxt']}</pre></li>"
        else:
            card_content += "<li class='list-group-item'>No security.txt found</li>"
        card_content += """
                    </ul>
                </div>
            </div>
        </div>
        """
    card_content += '</div>'
    return card_content
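# Expected shape of 'results' (sketch, matching what main() builds below; values are placeholders):
#   {'https://example.org/': {'URL': 'https://example.org/',
#                             'Title': 'Example Site',
#                             'SecurityTxt': 'Contact: mailto:security@example.org' or None}}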
# Get the favicon for the given url (displayed at 32*32)
def get_favicon_url(url):
    try:
        icons = favicon.get(url)
        if len(icons) == 0:
            print("no favicon")
            logging.info(f"No favicon for {url}")
            return "globe2.jpg"
        else:
            return icons[0].url
    except requests.RequestException as e:
        logging.error(f"Error checking {url} for favicon: {e}")
        return "globe2.jpg"
    #domain = urlparse(url).hostname
    #return f"https://www.google.com/s2/favicons?domain={domain}&s=32"
# Generate an index from the existing html results
def generate_html_index():
    # Read the HTML template
    with open("templates/index_template.html", "r") as template_file:
        index_template = template_file.read()
    # Get a list of HTML files in the public directory
    html_files = [file for file in os.listdir("public/") if file.endswith(".html") and file != "index.html"]
    # Generate the list items for each HTML file
    list_items = ""
    for html_file in html_files:
        list_items += f"<li class=\"list-group-item\"><a href=\"{html_file}\">{html_file}</a></li>\n"
    # Insert the list items into the template
    final_index = index_template.format(list_items=list_items)
    # Write the final index to a new HTML file
    with open("public/index.html", "w") as index_file:
        index_file.write(final_index)
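# Assumption (implied by the str.format() call above): templates/index_template.html
# contains a literal '{list_items}' placeholder that is filled with the generated <li> entries.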
# Here is where the magic starts
def main(input_folder, output_folder):
    # Loop through the input folder and generate a like-named output file for each input file
    for input_file in os.listdir(input_folder):
        input_file_path = os.path.join(input_folder, input_file)
        filename = os.path.splitext(input_file)[0]
        output_file_path = os.path.join(output_folder, f"{filename}_report.html")
        logging.info(f"{input_file} - processing")
        if input_file.lower().endswith('.txt'):
            output = txt_to_results(input_file_path)
        elif input_file.lower().endswith('.csv'):
            output = csv_to_results(input_file_path)
        else:
            logging.error(f"Unsupported file format: {input_file}")
            continue
        #print ("\033[1m 'output' after reading files\033[0m", output)
        # Define a new dictionary for this input file's results
        results = {}
        # outer_key is the url from the input file, BASEURL is the validated url in the inner dictionary
        for outer_key, inner_dict in output.items():
            baseurl = inner_dict['BASEURL']
            title = inner_dict['Title']
            # Check the availability of the security.txt
            security_txt_result = check_security_txt(baseurl)
            # Pack all elements into the results dictionary
            results[outer_key] = {
                'URL': outer_key,
                'Title': title,
                'SecurityTxt': security_txt_result
            }
        #print (results, "\n before generating html\n\033[1mBREAK\033[0m\n")
        generate_html_report(results, filename, output_file_path)
        generate_html_index()
        #print (input_file, " done\n")

if __name__ == "__main__":
    input_folder = "input"
    output_folder = "public"
    main(input_folder, output_folder)