-
Notifications
You must be signed in to change notification settings - Fork 1k
/
SitemapBridge.php
177 lines (164 loc) · 7.92 KB
/
SitemapBridge.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
<?php
/**
 * Builds a feed from a site's sitemap: article URLs are read from the sitemap
 * (<loc>/<lastmod> pairs), filtered with a user-supplied regular expression,
 * ordered by last modification date and then expanded into feed items using
 * the CSS-selector helpers of the parent class.
 */
class SitemapBridge extends CssSelectorBridge
{
    const MAINTAINER = 'ORelio';
    const NAME = 'Sitemap Bridge';
    const URI = 'https://github.com/RSS-Bridge/rss-bridge/';
    const DESCRIPTION = 'Convert any site to RSS feed using SEO Sitemap and CSS selectors (Advanced Users)';
    const PARAMETERS = [
        [
            'home_page' => [
                'name' => 'Site URL: Home page with latest articles',
                'title' => <<<EOT
                    The bridge will analyze the site like a search engine does.
                    The URL specified here determines the feed title and URL.
                    EOT,
                'exampleValue' => 'https://example.com/blog/',
                'required' => true
            ],
            'url_pattern' => [
                'name' => 'Pattern for site URLs to take in feed',
                'title' => 'Select items by applying a regular expression on their URL',
                'exampleValue' => 'https://example.com/article/.*',
                'required' => true
            ],
            'content_selector' => [
                'name' => 'Selector for each article content',
                'title' => <<<EOT
                    This bridge works using CSS selectors, e.g. "div.article" will match <div class="article">.
                    Everything inside that element becomes feed item content.
                    EOT,
                'exampleValue' => 'article.content',
                'required' => true
            ],
            'content_cleanup' => [
                'name' => '[Optional] Content cleanup: List of items to remove',
                'title' => 'Selector for unnecessary elements to remove inside article contents.',
                'exampleValue' => 'div.ads, div.comments',
            ],
            'title_cleanup' => [
                'name' => '[Optional] Text to remove from article title',
                'title' => 'Specify here some text from page title that need to be removed, e.g. " | BlogName".',
                'exampleValue' => ' | BlogName',
            ],
            'site_map' => [
                'name' => '[Optional] sitemap.xml URL',
                'title' => <<<EOT
                    By default, the bridge will analyze robots.txt to find out URL for sitemap.xml.
                    Alternatively, you can specify here the direct URL for sitemap XML.
                    The sitemap.xml file must have <loc> and <lastmod> fields for the bridge to work:
                    Eg. <url><loc>https://article/url</loc><lastmod>2000-12-31T23:59Z</lastmod></url>
                    <loc> is feed item URL, <lastmod> for selecting the most recent entries.
                    EOT,
                'exampleValue' => 'https://example.com/sitemap.xml',
            ],
            'discard_thumbnail' => [
                'name' => '[Optional] Discard thumbnail set by site author',
                'title' => 'Some sites set their logo as thumbnail for every article. Use this option to discard it.',
                'type' => 'checkbox',
            ],
            'thumbnail_as_header' => [
                'name' => '[Optional] Insert thumbnail as article header',
                'title' => 'Insert article main image on top of article contents.',
                'type' => 'checkbox',
            ],
            // LIMIT is not defined in this file; presumably inherited from a parent class.
            'limit' => self::LIMIT
        ]
    ];

    /**
     * Collect feed items: resolve the sitemap, select the most recent URLs
     * matching the configured pattern and expand each URL into an item.
     */
    public function collectData()
    {
        $this->homepageUrl = $this->getInput('home_page');
        $url_pattern = $this->getInput('url_pattern');
        $content_selector = $this->getInput('content_selector');
        $content_cleanup = $this->getInput('content_cleanup');
        $title_cleanup = $this->getInput('title_cleanup');
        $site_map = $this->getInput('site_map');
        $discard_thumbnail = $this->getInput('discard_thumbnail');
        $thumbnail_as_header = $this->getInput('thumbnail_as_header');
        $limit = $this->getInput('limit');

        $this->feedName = $this->titleCleanup($this->getPageTitle($this->homepageUrl), $title_cleanup);

        // Without an explicit sitemap URL, discovery starts from the home page.
        // getSitemapXml() takes the URL by reference and may replace it with
        // the sitemap location it actually used.
        $sitemap_url = empty($site_map) ? $this->homepageUrl : $site_map;
        $sitemap_xml = $this->getSitemapXml($sitemap_url, !empty($site_map));
        $links = $this->sitemapXmlToList($sitemap_xml, $url_pattern, empty($limit) ? 10 : $limit);

        // Only report an error when the sitemap yields no dated URL at all,
        // i.e. not when the user's pattern simply matched nothing.
        if (empty($links) && empty($this->sitemapXmlToList($sitemap_xml))) {
            returnClientError('Could not retrieve URLs with Timestamps from Sitemap: ' . $sitemap_url);
        }

        foreach ($links as $link) {
            $item = $this->expandEntryWithSelector($link, $content_selector, $content_cleanup, $title_cleanup);

            if ($discard_thumbnail && isset($item['enclosures'])) {
                unset($item['enclosures']);
            }

            // Check the first enclosure itself, not just the array, so that an
            // empty enclosure list does not trigger an undefined offset.
            if ($thumbnail_as_header && isset($item['enclosures'][0])) {
                // The URL originates from the remote site: escape it before
                // placing it inside an HTML attribute. Double encoding is
                // disabled so already-encoded entities (e.g. "&amp;") are kept.
                $thumbnail = htmlspecialchars(
                    (string) $item['enclosures'][0],
                    ENT_QUOTES | ENT_SUBSTITUTE,
                    'UTF-8',
                    false
                );
                $item['content'] = '<p><img src="' . $thumbnail . '" /></p>' . $item['content'];
            }

            $this->items[] = $item;
        }
    }

    /**
     * Retrieve site map from specified URL
     *
     * When $is_site_map is false, the sitemap location is looked up in
     * /robots.txt ("Sitemap:" directive), falling back to /sitemap.xml.
     *
     * @param string $url URL pointing to any page of the site, e.g. "https://example.com/blog"
     *                    OR directly to the site map e.g. "https://example.com/sitemap.xml".
     *                    Passed by reference: updated to the sitemap URL that was actually loaded.
     * @param bool $is_site_map TRUE if the specified URL points directly to the sitemap XML
     * @return object Sitemap DOM (from parsed XML)
     */
    protected function getSitemapXml(&$url, $is_site_map = false)
    {
        if ($is_site_map) {
            return getSimpleHTMLDOM($url);
        }

        $robots_txt = getSimpleHTMLDOM(urljoin($url, '/robots.txt'))->outertext;

        // Directive names in robots.txt are case-insensitive and the whitespace
        // after the colon is optional; the URL ends at the first whitespace.
        if (preg_match('/Sitemap:\s*(\S+)/i', $robots_txt, $matches) === 1) {
            $url = $matches[1];
            return getSimpleHTMLDOM($url);
        }

        // No directive in robots.txt: try the conventional location.
        $default_sitemap_url = urljoin($url, '/sitemap.xml');
        $sitemap = getSimpleHTMLDOM($default_sitemap_url);
        if (empty($sitemap->find('urlset, sitemap'))) {
            // NOTE(review): returnClientError() is expected to abort execution
            // (the original code relied on this as well) - confirm.
            returnClientError('Failed to locate Sitemap from /robots.txt or /sitemap.xml. Try setting it manually.');
        }

        $url = $default_sitemap_url;
        return $sitemap;
    }

    /**
     * Retrieve N most recent URLs from Site Map
     *
     * Nested sitemaps (<sitemap><loc>…</loc></sitemap> entries whose URL ends
     * with ".xml") are fetched and processed recursively.
     *
     * @param object $sitemap Site map XML DOM
     * @param string $url_pattern Optional pattern (regular expression without delimiters) to look for in URLs
     * @param int $limit Optional maximum amount of URLs to return (0 or null: no limit)
     * @param bool $keep_date TRUE to keep dates (url => date array instead of url array)
     * @return array Array of URLs, most recently modified first
     */
    protected function sitemapXmlToList($sitemap, $url_pattern = '', $limit = 0, $keep_date = false)
    {
        $links = [];

        foreach ($sitemap->find('sitemap') as $nested_sitemap) {
            $loc = $nested_sitemap->find('loc');
            if (empty($loc)) {
                continue;
            }

            $nested_url = trim($loc[0]->plaintext);
            if (!str_ends_with(strtolower($nested_url), '.xml')) {
                continue;
            }

            // No limit and dates kept for nested maps: ordering and truncation
            // must happen once, on the merged result.
            $nested_sitemap_xml = $this->getSitemapXml($nested_url, true);
            $nested_sitemap_links = $this->sitemapXmlToList($nested_sitemap_xml, $url_pattern, null, true);
            $links = array_merge($links, $nested_sitemap_links);
        }

        // Build the delimited regex once. Only slashes that are not already
        // escaped get a backslash: the first alternative consumes any
        // "backslash + character" pair and discards it, so a user-written "\/"
        // is left untouched instead of becoming "\\/" (which would terminate
        // the pattern early).
        $url_regex = null;
        if (!empty($url_pattern)) {
            $url_regex = '/' . preg_replace('~\\\\.(*SKIP)(*FAIL)|/~s', '\\\\/', $url_pattern) . '/';
        }

        foreach ($sitemap->find('url') as $item) {
            $loc = $item->find('loc');
            $lastmod = $item->find('lastmod');
            if (empty($loc) || empty($lastmod)) {
                // Entries without both fields cannot be ordered by date.
                continue;
            }

            $url = trim($loc[0]->plaintext);
            if ($url_regex === null || preg_match($url_regex, $url) === 1) {
                $links[$url] = strtotime(trim($lastmod[0]->plaintext));
            }
        }

        // Most recent first; keys (URLs) are preserved.
        arsort($links);

        if ($limit > 0 && count($links) > $limit) {
            $links = array_slice($links, 0, $limit);
        }

        return $keep_date ? $links : array_keys($links);
    }
}