From 829d570f8e4f6b38f04b2fac4a3da387e64bf403 Mon Sep 17 00:00:00 2001 From: "Quentin B." Date: Thu, 8 Aug 2024 00:57:40 +0200 Subject: [PATCH] [CentreFranceBridge] Add bridge (#4189) * [CentreFranceBridge] Add bridge * [CentreFranceBridge] Fix bridge * [CentreFranceBridge] Fix bridge * [CentreFranceBridge] Improved icon choice * [CentreFranceBridge] Fetch additional data from articles * [CentreFranceBridge] New parameter to allow client to control how many articles to fetch * [CentreFranceBridge] Improve bridge name based on existing parameters * [CentreFranceBridge] Fixed some edge cases * refactor: reorder * fix --------- Co-authored-by: Dag --- bridges/CentreFranceBridge.php | 279 ++++++++++++++++++++++++++ lib/contents.php | 4 +- lib/simplehtmldom/simple_html_dom.php | 5 - 3 files changed, 281 insertions(+), 7 deletions(-) create mode 100644 bridges/CentreFranceBridge.php diff --git a/bridges/CentreFranceBridge.php b/bridges/CentreFranceBridge.php new file mode 100644 index 00000000000..a6dea227651 --- /dev/null +++ b/bridges/CentreFranceBridge.php @@ -0,0 +1,279 @@ + [ + 'newspaper' => [ + 'name' => 'Newspaper', + 'type' => 'list', + 'values' => [ + 'La Montagne' => 'lamontagne.fr', + 'Le Populaire du Centre' => 'lepopulaire.fr', + 'La République du Centre' => 'larep.fr', + 'Le Berry Républicain' => 'leberry.fr', + 'L\'Yonne Républicaine' => 'lyonne.fr', + 'L\'Écho Républicain' => 'lechorepublicain.fr', + 'Le Journal du Centre' => 'lejdc.fr', + 'L\'Éveil de la Haute-Loire' => 'leveil.fr', + 'Le Pays' => 'le-pays.fr' + ] + ], + 'remove-reserved-for-subscribers-articles' => [ + 'name' => 'Remove reserved for subscribers articles', + 'type' => 'checkbox', + 'title' => 'Filter out articles that are only available to subscribers' + ], + 'limit' => [ + 'name' => 'Limit', + 'type' => 'number', + 'title' => 'How many articles to fetch. 0 to disable.', + 'required' => true, + 'defaultValue' => 15 + ] + ], + 'Local news' => [ + 'locality-slug' => [ + 'name' => 'Locality slug', + 'type' => 'text', + 'required' => false, + 'title' => 'Fetch articles for a specific locality. If not set, headlines from the front page will be used instead.', + 'exampleValue' => 'moulins-03000' + ], + ] + ]; + + public function collectData() + { + $value = $this->getInput('limit'); + if (is_numeric($value) && (int)$value >= 0) { + $limit = $value; + } else { + $limit = static::PARAMETERS['global']['limit']['defaultValue']; + } + + if (empty($this->getInput('newspaper'))) { + return; + } + + $localitySlug = $this->getInput('locality-slug') ?? ''; + $alreadyFoundArticlesURIs = []; + + $newspaperUrl = 'https://www.' . $this->getInput('newspaper') . '/' . $localitySlug . '/'; + $html = getSimpleHTMLDOM($newspaperUrl); + + // Articles are detected through their titles + foreach ($html->find('.c-titre') as $articleTitleDOMElement) { + $articleLinkDOMElement = $articleTitleDOMElement->find('a', 0); + + // Ignore articles in the « Les + partagés » block + if (strpos($articleLinkDOMElement->id, 'les_plus_partages') !== false) { + continue; + } + + $articleURI = $articleLinkDOMElement->href; + + // If the URI has already been processed, ignore it + if (in_array($articleURI, $alreadyFoundArticlesURIs, true)) { + continue; + } + + // If news are filtered for a specific locality, filter out article for other localities + if ($localitySlug !== '' && !str_contains($articleURI, $localitySlug)) { + continue; + } + + $articleTitle = ''; + + // If article is reserved for subscribers + if ($articleLinkDOMElement->find('span.premium-picto', 0)) { + if ($this->getInput('remove-reserved-for-subscribers-articles') === true) { + continue; + } + + $articleTitle .= '🔒 '; + } + + $articleTitleDOMElement = $articleLinkDOMElement->find('span[data-tb-title]', 0); + if ($articleTitleDOMElement === null) { + continue; + } + + if ($limit > 0 && count($this->items) === $limit) { + break; + } + + $articleTitle .= $articleLinkDOMElement->find('span[data-tb-title]', 0)->innertext; + $articleFullURI = urljoin('https://www.' . $this->getInput('newspaper') . '/', $articleURI); + + $item = [ + 'title' => $articleTitle, + 'uri' => $articleFullURI, + ...$this->collectArticleData($articleFullURI) + ]; + $this->items[] = $item; + + $alreadyFoundArticlesURIs[] = $articleURI; + } + } + + private function collectArticleData($uri): array + { + $html = getSimpleHTMLDOMCached($uri, 86400 * 90); // 90d + + $item = [ + 'enclosures' => [], + ]; + + $articleInformations = $html->find('.c-article-informations p'); + if (is_array($articleInformations) && $articleInformations !== []) { + $authorPosition = 1; + + // Article publication date + if (preg_match('/(\d{2})\/(\d{2})\/(\d{4})( à (\d{2})h(\d{2}))?/', $articleInformations[0]->innertext, $articleDateParts) > 0) { + $articleDate = new \DateTime('midnight'); + $articleDate->setDate($articleDateParts[3], $articleDateParts[2], $articleDateParts[1]); + + if (count($articleDateParts) === 7) { + $articleDate->setTime($articleDateParts[5], $articleDateParts[6]); + } + + $item['timestamp'] = $articleDate->getTimestamp(); + } + + // Article update date + if (count($articleInformations) >= 2 && preg_match('/(\d{2})\/(\d{2})\/(\d{4})( à (\d{2})h(\d{2}))?/', $articleInformations[1]->innertext, $articleDateParts) > 0) { + $authorPosition = 2; + + $articleDate = new \DateTime('midnight'); + $articleDate->setDate($articleDateParts[3], $articleDateParts[2], $articleDateParts[1]); + + if (count($articleDateParts) === 7) { + $articleDate->setTime($articleDateParts[5], $articleDateParts[6]); + } + + $item['timestamp'] = $articleDate->getTimestamp(); + } + + if (count($articleInformations) === ($authorPosition + 1)) { + $item['author'] = $articleInformations[$authorPosition]->innertext; + } + } + + $articleContent = $html->find('.b-article .contenu > *'); + if (is_array($articleContent)) { + $item['content'] = ''; + + foreach ($articleContent as $contentPart) { + if (in_array($contentPart->getAttribute('id'), ['cf-audio-player', 'poool-widget'], true)) { + continue; + } + + $articleHiddenParts = $contentPart->find('.bloc, .p402_hide'); + if (is_array($articleHiddenParts)) { + foreach ($articleHiddenParts as $articleHiddenPart) { + $contentPart->removeChild($articleHiddenPart); + } + } + + $item['content'] .= $contentPart->innertext; + } + } + + $articleIllustration = $html->find('.photo-wrapper .photo-box img'); + if (is_array($articleIllustration) && count($articleIllustration) === 1) { + $item['enclosures'][] = $articleIllustration[0]->getAttribute('src'); + } + + $articleAudio = $html->find('#cf-audio-player-container audio'); + if (is_array($articleAudio) && count($articleAudio) === 1) { + $item['enclosures'][] = $articleAudio[0]->getAttribute('src'); + } + + $articleTags = $html->find('.b-article > ul.c-tags > li > a.t-simple'); + if (is_array($articleTags)) { + $item['categories'] = array_map(static fn ($articleTag) => $articleTag->innertext, $articleTags); + } + + $explode = explode('_', $uri); + $array_reverse = array_reverse($explode); + $string = $array_reverse[0]; + $uid = rtrim($string, '/'); + if (is_numeric($uid)) { + $item['uid'] = $uid; + } + + // If the article is a "grand format", we use another parsing strategy + if ($item['content'] === '' && $html->find('article') !== []) { + $articleContent = $html->find('article > section'); + foreach ($articleContent as $contentPart) { + if ($contentPart->find('#journo') !== []) { + $item['author'] = $contentPart->find('#journo')->innertext; + continue; + } + + $item['content'] .= $contentPart->innertext; + } + } + + $item['content'] = str_replace('premium', '🔒', $item['content']); + $item['content'] = trim($item['content']); + + return $item; + } + + public function getName() + { + if (empty($this->getInput('newspaper'))) { + return static::NAME; + } + + $newspaperNameByDomain = array_flip(self::PARAMETERS['global']['newspaper']['values']); + if (!isset($newspaperNameByDomain[$this->getInput('newspaper')])) { + return static::NAME; + } + + $completeTitle = $newspaperNameByDomain[$this->getInput('newspaper')]; + + if (!empty($this->getInput('locality-slug'))) { + $localityName = explode('-', $this->getInput('locality-slug')); + array_pop($localityName); + $completeTitle .= ' ' . ucfirst(implode('-', $localityName)); + } + + return $completeTitle; + } + + public function getIcon() + { + if (empty($this->getInput('newspaper'))) { + return static::URI . '/favicon.ico'; + } + + return 'https://www.' . $this->getInput('newspaper') . '/favicon.ico'; + } + + public function detectParameters($url) + { + $regex = '/^(https?:\/\/)?(www\.)?([a-z-]+\.fr)(\/)?([a-z-]+-[0-9]{5})?(\/)?$/'; + $url = strtolower($url); + + if (preg_match($regex, $url, $urlMatches) === 0) { + return null; + } + + if (!in_array($urlMatches[3], self::PARAMETERS['global']['newspaper']['values'], true)) { + return null; + } + + return [ + 'newspaper' => $urlMatches[3], + 'locality-slug' => empty($urlMatches[5]) ? null : $urlMatches[5] + ]; + } +} diff --git a/lib/contents.php b/lib/contents.php index 893a35121fe..cc9542a956e 100644 --- a/lib/contents.php +++ b/lib/contents.php @@ -142,7 +142,6 @@ function getContents( * when returning plaintext. * @param string $defaultSpanText Specifies the replacement text for `` * tags when returning plaintext. - * @return false|simple_html_dom Contents as simplehtmldom object. */ function getSimpleHTMLDOM( $url, @@ -154,11 +153,12 @@ function getSimpleHTMLDOM( $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT -) { +): \simple_html_dom { $html = getContents($url, $header ?? [], $opts ?? []); if ($html === '') { throw new \Exception('Unable to parse dom because the http response was the empty string'); } + return str_get_html( $html, $lowercase, diff --git a/lib/simplehtmldom/simple_html_dom.php b/lib/simplehtmldom/simple_html_dom.php index 3fc95760133..170f6fb0960 100644 --- a/lib/simplehtmldom/simple_html_dom.php +++ b/lib/simplehtmldom/simple_html_dom.php @@ -118,11 +118,6 @@ function str_get_html( throw new \Exception('Refusing to parse too big input'); } - if (empty($str) || strlen($str) > MAX_FILE_SIZE) { - $dom->clear(); - return false; - } - return $dom->load($str, $lowercase, $stripRN); }