diff --git a/CHANGELOG.md b/CHANGELOG.md index bbbb628..eec1c27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [1.5.2] - 2024-02-07 +### Fixed +* Issue in `GetUrlsFromSitemap` (`Sitemap::getUrlsFromSitemap()`) step when XML content has no line breaks. + ## [1.5.1] - 2024-02-06 ### Fixed * For being more flexible to build a separate headless browser loader (in an extension package) extract the most basic HTTP loader functionality to a new `HttpBaseLoader` and important functionality for the headless browser loader to a new `HeadlessBrowserLoaderHelper`. Further, also share functionality from the `Http` steps via a new abstract `HttpBase` step. It's considered a fix, because there's no new functionality, just refactoring existing code for better extendability. diff --git a/src/Steps/Sitemap/GetUrlsFromSitemap.php b/src/Steps/Sitemap/GetUrlsFromSitemap.php index 817739b..95f0fd3 100644 --- a/src/Steps/Sitemap/GetUrlsFromSitemap.php +++ b/src/Steps/Sitemap/GetUrlsFromSitemap.php @@ -19,7 +19,7 @@ class GetUrlsFromSitemap extends Step public static function fixUrlSetTag(Crawler $dom): Crawler { if ($dom->filter('urlset url')->count() === 0) { - return new Crawler(preg_replace('//', '', $dom->outerHtml())); + return new Crawler(preg_replace('//', '', $dom->outerHtml())); } return $dom; diff --git a/tests/Steps/Sitemap/GetUrlsFromSitemapTest.php b/tests/Steps/Sitemap/GetUrlsFromSitemapTest.php index 2cbc0b5..36a8936 100644 --- a/tests/Steps/Sitemap/GetUrlsFromSitemapTest.php +++ b/tests/Steps/Sitemap/GetUrlsFromSitemapTest.php @@ -96,3 +96,17 @@ function () { expect($outputs)->toHaveCount(3); } ); + +it( + 'doesn\'t fail when the urlset tag contains attributes, that would cause the symfony DomCrawler to not find the ' . + 'elements, when the XML content has no line breaks', + function () { + $xml = <<https://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-5https://www.crwlr.software/blog/dealing-with-http-url-query-strings-in-phphttps://www.crwlr.software/blog/whats-new-in-crwlr-crawler-v0-4 + XML; + + $outputs = helper_invokeStepWithInput(Sitemap::getUrlsFromSitemap(), $xml); + + expect($outputs)->toHaveCount(3); + } +);