Skip to content

Commit

Permalink
[CentreFranceBridge] Add bridge (#4189)
Browse files Browse the repository at this point in the history
* [CentreFranceBridge] Add bridge

* [CentreFranceBridge] Fix bridge

* [CentreFranceBridge] Fix bridge

* [CentreFranceBridge] Improved icon choice

* [CentreFranceBridge] Fetch additional data from articles

* [CentreFranceBridge] New parameter to allow client to control how many articles to fetch

* [CentreFranceBridge] Improve bridge name based on existing parameters

* [CentreFranceBridge] Fixed some edge cases

* refactor: reorder

* fix

---------

Co-authored-by: Dag <[email protected]>
  • Loading branch information
quent1-fr and dvikan authored Aug 7, 2024
1 parent b25a779 commit 829d570
Show file tree
Hide file tree
Showing 3 changed files with 281 additions and 7 deletions.
279 changes: 279 additions & 0 deletions bridges/CentreFranceBridge.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
<?php

class CentreFranceBridge extends BridgeAbstract
{
const NAME = 'Centre France Newspapers';
const URI = 'https://www.centrefrance.com/';
const DESCRIPTION = 'Common bridge for all Centre France group newspapers.';
const CACHE_TIMEOUT = 7200; // 2h
const MAINTAINER = 'quent1';
const PARAMETERS = [
'global' => [
'newspaper' => [
'name' => 'Newspaper',
'type' => 'list',
'values' => [
'La Montagne' => 'lamontagne.fr',
'Le Populaire du Centre' => 'lepopulaire.fr',
'La République du Centre' => 'larep.fr',
'Le Berry Républicain' => 'leberry.fr',
'L\'Yonne Républicaine' => 'lyonne.fr',
'L\'Écho Républicain' => 'lechorepublicain.fr',
'Le Journal du Centre' => 'lejdc.fr',
'L\'Éveil de la Haute-Loire' => 'leveil.fr',
'Le Pays' => 'le-pays.fr'
]
],
'remove-reserved-for-subscribers-articles' => [
'name' => 'Remove reserved for subscribers articles',
'type' => 'checkbox',
'title' => 'Filter out articles that are only available to subscribers'
],
'limit' => [
'name' => 'Limit',
'type' => 'number',
'title' => 'How many articles to fetch. 0 to disable.',
'required' => true,
'defaultValue' => 15
]
],
'Local news' => [
'locality-slug' => [
'name' => 'Locality slug',
'type' => 'text',
'required' => false,
'title' => 'Fetch articles for a specific locality. If not set, headlines from the front page will be used instead.',
'exampleValue' => 'moulins-03000'
],
]
];

public function collectData()
{
$value = $this->getInput('limit');
if (is_numeric($value) && (int)$value >= 0) {
$limit = $value;
} else {
$limit = static::PARAMETERS['global']['limit']['defaultValue'];
}

if (empty($this->getInput('newspaper'))) {
return;
}

$localitySlug = $this->getInput('locality-slug') ?? '';
$alreadyFoundArticlesURIs = [];

$newspaperUrl = 'https://www.' . $this->getInput('newspaper') . '/' . $localitySlug . '/';
$html = getSimpleHTMLDOM($newspaperUrl);

// Articles are detected through their titles
foreach ($html->find('.c-titre') as $articleTitleDOMElement) {
$articleLinkDOMElement = $articleTitleDOMElement->find('a', 0);

// Ignore articles in the « Les + partagés » block
if (strpos($articleLinkDOMElement->id, 'les_plus_partages') !== false) {
continue;
}

$articleURI = $articleLinkDOMElement->href;

// If the URI has already been processed, ignore it
if (in_array($articleURI, $alreadyFoundArticlesURIs, true)) {
continue;
}

// If news are filtered for a specific locality, filter out article for other localities
if ($localitySlug !== '' && !str_contains($articleURI, $localitySlug)) {
continue;
}

$articleTitle = '';

// If article is reserved for subscribers
if ($articleLinkDOMElement->find('span.premium-picto', 0)) {
if ($this->getInput('remove-reserved-for-subscribers-articles') === true) {
continue;
}

$articleTitle .= '🔒 ';
}

$articleTitleDOMElement = $articleLinkDOMElement->find('span[data-tb-title]', 0);
if ($articleTitleDOMElement === null) {
continue;
}

if ($limit > 0 && count($this->items) === $limit) {
break;
}

$articleTitle .= $articleLinkDOMElement->find('span[data-tb-title]', 0)->innertext;
$articleFullURI = urljoin('https://www.' . $this->getInput('newspaper') . '/', $articleURI);

$item = [
'title' => $articleTitle,
'uri' => $articleFullURI,
...$this->collectArticleData($articleFullURI)
];
$this->items[] = $item;

$alreadyFoundArticlesURIs[] = $articleURI;
}
}

private function collectArticleData($uri): array
{
$html = getSimpleHTMLDOMCached($uri, 86400 * 90); // 90d

$item = [
'enclosures' => [],
];

$articleInformations = $html->find('.c-article-informations p');
if (is_array($articleInformations) && $articleInformations !== []) {
$authorPosition = 1;

// Article publication date
if (preg_match('/(\d{2})\/(\d{2})\/(\d{4})( à (\d{2})h(\d{2}))?/', $articleInformations[0]->innertext, $articleDateParts) > 0) {
$articleDate = new \DateTime('midnight');
$articleDate->setDate($articleDateParts[3], $articleDateParts[2], $articleDateParts[1]);

if (count($articleDateParts) === 7) {
$articleDate->setTime($articleDateParts[5], $articleDateParts[6]);
}

$item['timestamp'] = $articleDate->getTimestamp();
}

// Article update date
if (count($articleInformations) >= 2 && preg_match('/(\d{2})\/(\d{2})\/(\d{4})( à (\d{2})h(\d{2}))?/', $articleInformations[1]->innertext, $articleDateParts) > 0) {
$authorPosition = 2;

$articleDate = new \DateTime('midnight');
$articleDate->setDate($articleDateParts[3], $articleDateParts[2], $articleDateParts[1]);

if (count($articleDateParts) === 7) {
$articleDate->setTime($articleDateParts[5], $articleDateParts[6]);
}

$item['timestamp'] = $articleDate->getTimestamp();
}

if (count($articleInformations) === ($authorPosition + 1)) {
$item['author'] = $articleInformations[$authorPosition]->innertext;
}
}

$articleContent = $html->find('.b-article .contenu > *');
if (is_array($articleContent)) {
$item['content'] = '';

foreach ($articleContent as $contentPart) {
if (in_array($contentPart->getAttribute('id'), ['cf-audio-player', 'poool-widget'], true)) {
continue;
}

$articleHiddenParts = $contentPart->find('.bloc, .p402_hide');
if (is_array($articleHiddenParts)) {
foreach ($articleHiddenParts as $articleHiddenPart) {
$contentPart->removeChild($articleHiddenPart);
}
}

$item['content'] .= $contentPart->innertext;
}
}

$articleIllustration = $html->find('.photo-wrapper .photo-box img');
if (is_array($articleIllustration) && count($articleIllustration) === 1) {
$item['enclosures'][] = $articleIllustration[0]->getAttribute('src');
}

$articleAudio = $html->find('#cf-audio-player-container audio');
if (is_array($articleAudio) && count($articleAudio) === 1) {
$item['enclosures'][] = $articleAudio[0]->getAttribute('src');
}

$articleTags = $html->find('.b-article > ul.c-tags > li > a.t-simple');
if (is_array($articleTags)) {
$item['categories'] = array_map(static fn ($articleTag) => $articleTag->innertext, $articleTags);
}

$explode = explode('_', $uri);
$array_reverse = array_reverse($explode);
$string = $array_reverse[0];
$uid = rtrim($string, '/');
if (is_numeric($uid)) {
$item['uid'] = $uid;
}

// If the article is a "grand format", we use another parsing strategy
if ($item['content'] === '' && $html->find('article') !== []) {
$articleContent = $html->find('article > section');
foreach ($articleContent as $contentPart) {
if ($contentPart->find('#journo') !== []) {
$item['author'] = $contentPart->find('#journo')->innertext;
continue;
}

$item['content'] .= $contentPart->innertext;
}
}

$item['content'] = str_replace('<span class="p-premium">premium</span>', '🔒', $item['content']);
$item['content'] = trim($item['content']);

return $item;
}

public function getName()
{
if (empty($this->getInput('newspaper'))) {
return static::NAME;
}

$newspaperNameByDomain = array_flip(self::PARAMETERS['global']['newspaper']['values']);
if (!isset($newspaperNameByDomain[$this->getInput('newspaper')])) {
return static::NAME;
}

$completeTitle = $newspaperNameByDomain[$this->getInput('newspaper')];

if (!empty($this->getInput('locality-slug'))) {
$localityName = explode('-', $this->getInput('locality-slug'));
array_pop($localityName);
$completeTitle .= ' ' . ucfirst(implode('-', $localityName));
}

return $completeTitle;
}

public function getIcon()
{
if (empty($this->getInput('newspaper'))) {
return static::URI . '/favicon.ico';
}

return 'https://www.' . $this->getInput('newspaper') . '/favicon.ico';
}

public function detectParameters($url)
{
$regex = '/^(https?:\/\/)?(www\.)?([a-z-]+\.fr)(\/)?([a-z-]+-[0-9]{5})?(\/)?$/';
$url = strtolower($url);

if (preg_match($regex, $url, $urlMatches) === 0) {
return null;
}

if (!in_array($urlMatches[3], self::PARAMETERS['global']['newspaper']['values'], true)) {
return null;
}

return [
'newspaper' => $urlMatches[3],
'locality-slug' => empty($urlMatches[5]) ? null : $urlMatches[5]
];
}
}
4 changes: 2 additions & 2 deletions lib/contents.php
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,6 @@ function getContents(
* when returning plaintext.
* @param string $defaultSpanText Specifies the replacement text for `<span />`
* tags when returning plaintext.
* @return false|simple_html_dom Contents as simplehtmldom object.
*/
function getSimpleHTMLDOM(
$url,
Expand All @@ -154,11 +153,12 @@ function getSimpleHTMLDOM(
$stripRN = true,
$defaultBRText = DEFAULT_BR_TEXT,
$defaultSpanText = DEFAULT_SPAN_TEXT
) {
): \simple_html_dom {
$html = getContents($url, $header ?? [], $opts ?? []);
if ($html === '') {
throw new \Exception('Unable to parse dom because the http response was the empty string');
}

return str_get_html(
$html,
$lowercase,
Expand Down
5 changes: 0 additions & 5 deletions lib/simplehtmldom/simple_html_dom.php
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,6 @@ function str_get_html(
throw new \Exception('Refusing to parse too big input');
}

if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
$dom->clear();
return false;
}

return $dom->load($str, $lowercase, $stripRN);
}

Expand Down

0 comments on commit 829d570

Please sign in to comment.