-
Notifications
You must be signed in to change notification settings - Fork 4
/
xml_sitemap_writer.py
284 lines (232 loc) · 8.55 KB
/
xml_sitemap_writer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
"""
Provides XMLSitemap class used to generate large XML sitemap from iterators
"""
import gzip # https://docs.python.org/3/library/gzip.html
import logging
import re
from datetime import datetime
from typing import List, Iterator, IO, Optional
from xml.sax.saxutils import escape as escape_xml
# Project URL embedded in the "Generated on ... by ..." comment of the sitemap index
POWERED_BY_URL = "https://github.com/pigs-will-fly/py-xml-sitemap-writer"

# W3C date-only form: YYYY-MM-DD
# \Z (rather than $) is used so that a value with a trailing newline is rejected
W3C_DATE_REGEX = re.compile(r"^\d{4}-\d{2}-\d{2}\Z")

# W3C date-time form: YYYY-MM-DDThh:mm:ss followed by an optional time zone designator,
# which is either "Z" or a +hh:mm / -hh:mm offset (both signs are valid offsets)
# https://www.w3.org/TR/NOTE-datetime
W3C_DATETIME_REGEX = re.compile(
    r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:[+-]\d{2}:\d{2}|Z)?\Z"
)

# Values accepted for the <changefreq> tag
# https://www.sitemaps.org/protocol.html#changefreqdef
CHANGEFREQ_VALUES = {
    "always",
    "hourly",
    "daily",
    "weekly",
    "monthly",
    "yearly",
    "never",
}
def is_valid_date(date_str: str) -> bool:
    """
    Tell whether ``date_str`` is matched by W3C_DATE_REGEX or by W3C_DATETIME_REGEX,
    i.e. whether it looks like a W3C date or date-time value

    https://www.w3.org/TR/NOTE-datetime
    """
    # the date-only pattern is tried first, the date-time pattern only if it fails
    for pattern in (W3C_DATE_REGEX, W3C_DATETIME_REGEX):
        if pattern.match(date_str) is not None:
            return True

    return False
def is_valid_changefreq(changefreq: str) -> bool:
    """
    Tell whether ``changefreq`` is a member of CHANGEFREQ_VALUES,
    the set of values accepted for the <changefreq> tag

    https://www.sitemaps.org/protocol.html#changefreqdef
    """
    is_known_value = changefreq in CHANGEFREQ_VALUES
    return is_known_value
def is_valid_priority(priority: str) -> bool:
    """
    Checks if the provided string is a valid numeric value for the <priority> tag,
    i.e. it can be parsed as a number between 0.0 and 1.0 (inclusive)

    Values that cannot be converted to a float at all (non-numeric strings, None,
    lists, ...) are reported as invalid instead of raising.

    https://www.sitemaps.org/protocol.html#prioritydef
    """
    try:
        value = float(priority)
    except (TypeError, ValueError):
        # ValueError: not a numeric string; TypeError: not a string or a number at all
        return False

    # NaN fails both comparisons, so it is rejected here as well
    return 0.0 <= value <= 1.0
# pylint:disable=too-many-instance-attributes
class XMLSitemap:
    """
    Generate large XML sitemaps with a sitemap index and sub-sitemap XML files

    URLs are written to gzip-compressed sub-sitemaps stored in ``path``. A new
    sub-sitemap is started for every section and whenever the current one already
    holds URLS_PER_FILE entries. When used as a context manager, leaving the context
    closes the current sub-sitemap and writes the ``sitemap.xml`` index file.
    """

    # Sitemap file that you provide must have no more than 50,000 URLs
    # and must be no larger than 10MB (10,485,760 bytes).
    # @see http://www.sitemaps.org/protocol.html#index
    URLS_PER_FILE = 15000

    GZIP_COMPRESSION_LEVEL = 9

    def __init__(self, path: str, root_url: str):
        """
        Set up XMLSitemap to write to a given path and using a specified root_url.

        root_url is prepended to every URL added and is used when generating
        the sitemaps index file. Trailing slashes are removed from both arguments.
        """
        self.path = path.rstrip("/")
        self.root_url = root_url.rstrip("/")

        self.logger = logging.getLogger(self.__class__.__name__)

        # file names of the sub-sitemaps created so far
        self._sitemaps: List[str] = []
        self.sitemaps_counter = 0
        self.current_section_name = ""

        # URLs added in total / to the sub-sitemap that is currently open
        self.total_urls_counter = 0
        self.sitemap_urls_counter = 0

        # file handler for a current sitemap
        self._sitemap_file: Optional[IO] = None

        self.add_section("pages")

    def add_url(
        self,
        url: str,
        lastmod: Optional[str] = None,
        priority: Optional[str] = None,
        changefreq: Optional[str] = None,
    ):
        """
        Adds the provided URL to the sitemap,
        with optional lastmod, priority and changefreq properties

        ``url`` is treated as a path and appended to the root URL. An optional
        property that fails validation is logged as a warning and left out;
        the URL itself is still added.

        https://www.sitemaps.org/protocol.html#xmlTagDefinitions
        """
        # Roll over BEFORE counting the new URL, so that the counter written to the
        # trailer of the sitemap being closed matches the number of URLs it holds.
        if self.sitemap_urls_counter >= self.URLS_PER_FILE:
            self.logger.info(
                f"URLs per sitemap counter reached the limit of {self.URLS_PER_FILE}"
            )
            self._add_sitemap()
            self.sitemap_urls_counter = 0
        elif self.sitemap_urls_counter == 0:
            # first URL of a section: lazily create its sub-sitemap
            self._add_sitemap()

        self.total_urls_counter += 1
        self.sitemap_urls_counter += 1

        url = f'{self.root_url}/{url.lstrip("/")}'

        if lastmod and not is_valid_date(lastmod):
            self.logger.warning(f"Invalid <lastmod> format for URL <{url}>: {lastmod}")
            lastmod = None

        if changefreq and not is_valid_changefreq(changefreq):
            self.logger.warning(
                f"Invalid <changefreq> value for URL <{url}>: {changefreq}"
            )
            changefreq = None

        if priority and not is_valid_priority(priority):
            self.logger.warning(f"Invalid <priority> value for URL <{url}>: {priority}")
            priority = None

        self.logger.debug(f"Adding URL <{url}>")

        parts = [f"<url><loc>{escape_xml(url)}</loc>"]
        if lastmod:
            parts.append(f"<lastmod>{escape_xml(lastmod)}</lastmod>")
        if priority:
            parts.append(f"<priority>{escape_xml(priority)}</priority>")
        if changefreq:
            parts.append(f"<changefreq>{escape_xml(changefreq)}</changefreq>")
        parts.append("</url>")

        self.write_to_sitemap("".join(parts))

    def add_urls(self, urls: Iterator[str]):
        """
        Add URLs for a provided iterable
        """
        for url in urls:
            self.add_url(url)

    def add_section(self, section_name: str):
        """
        Starting a new section will lazily create a new sub-sitemap with
        a filename set to "sitemap-<number>-<section_name>.xml.gz"
        """
        self._close_sitemap()

        self.current_section_name = section_name
        self.sitemap_urls_counter = 0

        # the sub-sitemap will be created after calling add_url() for the first time

    @property
    def sitemaps(self) -> List[str]:
        """
        Returns list of sitemaps
        """
        return self._sitemaps

    @property
    def sitemap_file(self) -> IO:
        """
        Returns file handler for a current file

        Raises AssertionError when no sub-sitemap file is currently open.
        """
        # Raised explicitly (instead of using an assert statement) so that the check
        # is not stripped when Python runs with -O.
        if self._sitemap_file is None:
            raise AssertionError("add_section() needs to called before")

        return self._sitemap_file

    def write_to_sitemap(self, buf: str, indent: bool = True):
        """
        Writes given string to a sitemap file, followed by a newline.
        When indent is set, the line is prefixed with a tab.
        """
        if indent:
            buf = "\t" + buf

        self.sitemap_file.write(buf + "\n")

    def __repr__(self):
        """
        A string representation
        """
        return f"<{self.__class__.__name__} at {self.path} ({len(self)} URLs)>"

    def __len__(self):
        """
        How many URLs are there
        """
        return self.total_urls_counter

    def __enter__(self):
        """
        Called when sitemap context starts
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        Called when sitemap context completes:
        closes the current sub-sitemap and writes the index file
        """
        self._close_sitemap()
        self._write_index()

    def _add_sitemap(self):
        """
        Called internally to add a new sitemap:
        * when the add_url() after add_section() is called for the first time
        * when per-sitemap URLs counter reaches the limit
        """
        # close a previous sitemap, if any
        self._close_sitemap()

        self.sitemaps_counter += 1
        sitemap_name = (
            f"sitemap-{self.sitemaps_counter:03}-{self.current_section_name}.xml.gz"
        )

        self._sitemaps.append(sitemap_name)
        self.logger.info(f"New sitemap added: {sitemap_name}")

        # start a sitemap XML writer
        # The encoding is set explicitly to match the XML declaration written below;
        # otherwise the text layer would fall back to the locale's default encoding.
        self._sitemap_file = gzip.open(
            f"{self.path}/{sitemap_name}",
            mode="wt",
            compresslevel=self.GZIP_COMPRESSION_LEVEL,
            encoding="utf-8",
        )

        self.logger.info(f"Will write sitemap XML to {self.sitemap_file.name}")

        self.write_to_sitemap('<?xml version="1.0" encoding="UTF-8"?>', indent=False)
        self.write_to_sitemap(
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">', indent=False
        )

    def _close_sitemap(self):
        """
        Close a sitemap XML: write the closing tag and a URLs counter comment,
        then close the file. Does nothing when no sitemap file is open.
        """
        if not self._sitemap_file:
            return

        self.logger.info(f"Closing {self.sitemap_file.name}")

        try:
            self.write_to_sitemap("</urlset>", indent=False)
            self.write_to_sitemap(
                f"<!-- {self.sitemap_urls_counter} urls in the sitemap -->",
                indent=False,
            )
        finally:
            # release the handle even if writing the trailer failed
            handle, self._sitemap_file = self._sitemap_file, None
            handle.close()

    def _write_index(self):
        """
        Write a sitemap index XML file (sitemap.xml) that lists all sub-sitemaps
        """
        with open(f"{self.path}/sitemap.xml", mode="wt", encoding="utf-8") as index:
            self.logger.info(f"Will write sitemaps index XML to {index.name}")

            generated_on = datetime.now().strftime("%Y-%m-%d")  # e.g. 2024-11-22

            index.writelines(
                [
                    '<?xml version="1.0" encoding="UTF-8"?>\n',
                    '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n',
                    f"\t<!-- Generated on {generated_on} by {POWERED_BY_URL} -->\n",
                    f"\t<!-- {len(self)} urls in {len(self.sitemaps)} sub-sitemaps -->\n",
                ]
            )

            for sitemap in self.sitemaps:
                # escape the complete URL: the root URL may contain XML special
                # characters (e.g. "&") too, not only the sitemap file name
                location = escape_xml(f"{self.root_url}/{sitemap}")
                index.write(f"\t<sitemap><loc>{location}</loc></sitemap>\n")

            index.write("</sitemapindex>")