Skip to content

Commit

Permalink
bridges: use BridgeAbstract::getSimpleHTMLDOM
Browse files Browse the repository at this point in the history
instead of BridgeAbstract::file_get_html

Signed-off-by: Pierre Mazière <[email protected]>
  • Loading branch information
Pierre Mazière committed Aug 19, 2016
1 parent f43bbda commit 3c0d13c
Show file tree
Hide file tree
Showing 121 changed files with 1,212 additions and 396 deletions.
4 changes: 2 additions & 2 deletions bridges/ABCTabsBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ public function loadMetadatas() {

public function collectData(array $param){
$html = '';
$html = $this->file_get_html('http://www.abc-tabs.com/tablatures/nouveautes.html') or $this->returnClientError('No results for this query.');
$html = $this->getSimpleHTMLDOM('http://www.abc-tabs.com/tablatures/nouveautes.html') or $this->returnClientError('No results for this query.');
$table = $html->find('table#myTable', 0)->children(1);

foreach ($table->find('tr') as $tab)
{
$item = new \Item();
Expand Down
2 changes: 1 addition & 1 deletion bridges/AcrimedBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ protected function parseRSSItem($newsItem) {
$item->title = trim($newsItem->title);
$item->timestamp = strtotime($dc->date);

$articlePage = $this->file_get_html($newsItem->link);
$articlePage = $this->getSimpleHTMLDOM($newsItem->link);
$article = $hs->sanitize($articlePage->find('article.article1', 0)->innertext);
$article = HTMLSanitizer::defaultImageSrcTo($article, "http://www.acrimed.org/");

Expand Down
8 changes: 4 additions & 4 deletions bridges/AllocineFRBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public function loadMetadatas() {
$this->description = "Bridge for allocine.fr";
$this->update = '2016-08-17';

$this->parameters[] =
$this->parameters[] =
'[
{
"name" : "category",
Expand All @@ -18,7 +18,7 @@ public function loadMetadatas() {
"required" : true,
"exampleValue" : "Faux Raccord",
"title" : "Select your category",
"values" :
"values" :
[
{
"name" : "Faux Raccord",
Expand Down Expand Up @@ -64,12 +64,12 @@ public function collectData(array $params){
// Update bridge name to match selection
$this->name .= ' : ' . $category;

$html = $this->file_get_html($this->uri) or $this->returnServerError("Could not request {$this->uri}!");
// Load the page at $this->uri; report a server error when the request yields a falsy result.
$html = $this->getSimpleHTMLDOM($this->uri) or $this->returnServerError("Could not request {$this->uri}!");

foreach($html->find('figure.media-meta-fig') as $element)
{
$item = new Item();

$title = $element->find('div.titlebar h3.title a', 0);
$content = trim($element->innertext);
$figCaption = strpos($content, $category);
Expand Down
4 changes: 2 additions & 2 deletions bridges/AnimeUltimeBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ public function collectData(array $param) {
//Retrieve page contents
$website = 'http://www.anime-ultime.net/';
$url = $website.'history-0-1/'.$requestFilter;
$html = $this->file_get_html($url) or $this->returnServerError('Could not request Anime-Ultime: '.$url);
$html = $this->getSimpleHTMLDOM($url) or $this->returnServerError('Could not request Anime-Ultime: '.$url);

//Releases are sorted by day : process each day individually
foreach ($html->find('div.history', 0)->find('h3') as $daySection) {
Expand Down Expand Up @@ -110,7 +110,7 @@ public function collectData(array $param) {
$item->content = $item_description;
$this->items[] = $item;
$processedOK++;

//Stop processing once limit is reached
if ($processedOK >= 10)
return;
Expand Down
2 changes: 1 addition & 1 deletion bridges/BandcampBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public function collectData(array $param){
$html = '';
if (isset($param['tag'])) {
$this->request = $param['tag'];
$html = $this->file_get_html('http://bandcamp.com/tag/'.urlencode($this->request).'?sort_field=date') or $this->returnServerError('No results for this query.');
$html = $this->getSimpleHTMLDOM('http://bandcamp.com/tag/'.urlencode($this->request).'?sort_field=date') or $this->returnServerError('No results for this query.');
}
else {
$this->returnClientError('You must specify tag (/tag/...)');
Expand Down
8 changes: 4 additions & 4 deletions bridges/BastaBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@ public function loadMetadatas() {
$this->description = "Returns the newest articles.";
$this->update = '2016-08-17';
}

public function collectData(array $param){
// Replaces all relative image URLs by absolute URLs. Relative URLs always start with 'local/'!
function ReplaceImageUrl($content){
return preg_replace('/src=["\']{1}([^"\']+)/ims', 'src=\'http://www.bastamag.net/$1\'', $content);
}
$html = $this->file_get_html('http://www.bastamag.net/spip.php?page=backend') or $this->returnServerError('Could not request Bastamag.');

$html = $this->getSimpleHTMLDOM('http://www.bastamag.net/spip.php?page=backend') or $this->returnServerError('Could not request Bastamag.');
$limit = 0;

foreach($html->find('item') as $element) {
Expand All @@ -34,4 +34,4 @@ public function getCacheDuration(){
return 3600*2; // 2 hours
}
}
?>
?>
4 changes: 2 additions & 2 deletions bridges/BlaguesDeMerdeBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ public function loadMetadatas() {
}

public function collectData(array $param){
$html = $this->file_get_html('http://www.blaguesdemerde.fr/') or $this->returnServerError('Could not request BDM.');
$html = $this->getSimpleHTMLDOM('http://www.blaguesdemerde.fr/') or $this->returnServerError('Could not request BDM.');

foreach($html->find('article.joke_contener') as $element) {
$item = new Item();
$temp = $element->find('a');
Expand Down
14 changes: 7 additions & 7 deletions bridges/BooruprojectBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,29 +32,29 @@ public function loadMetadatas() {

public function collectData(array $param){
$page = 0; $tags = '';
if (!empty($param['p'])) {
$page = (int)preg_replace("/[^0-9]/",'', $param['p']);
if (!empty($param['p'])) {
$page = (int)preg_replace("/[^0-9]/",'', $param['p']);
$page = $page - 1;
$page = $page * 20;
}
if (!empty($param['t'])) {
$tags = '&tags='.urlencode($param['t']);
if (!empty($param['t'])) {
$tags = '&tags='.urlencode($param['t']);
}
if (empty($param['i'])) {
$this->returnServerError('Please enter a ***.booru.org instance.');
}
$html = $this->file_get_html("http://".$param['i'].".booru.org/index.php?page=post&s=list&pid=".$page.$tags) or $this->returnServerError('Could not request Booruproject.');
$html = $this->getSimpleHTMLDOM("http://".$param['i'].".booru.org/index.php?page=post&s=list&pid=".$page.$tags) or $this->returnServerError('Could not request Booruproject.');


foreach($html->find('div[class=content] span') as $element) {
$item = new \Item();
$item->uri = 'http://'.$param['i'].'.booru.org/'.$element->find('a', 0)->href;
$item->postid = (int)preg_replace("/[^0-9]/",'', $element->find('a', 0)->getAttribute('id'));
$item->postid = (int)preg_replace("/[^0-9]/",'', $element->find('a', 0)->getAttribute('id'));
$item->timestamp = time();
$item->tags = $element->find('img', 0)->getAttribute('title');
$item->title = 'Booruproject '.$param['i'].' | '.$item->postid;
$item->content = '<a href="' . $item->uri . '"><img src="' . $element->find('img', 0)->src . '" /></a><br>Tags: '.$item->tags;
$this->items[] = $item;
$this->items[] = $item;
}
}

Expand Down
7 changes: 3 additions & 4 deletions bridges/CADBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ public function loadMetadatas() {
}

private function CADExtractContent($url) {
$html3 = $this->file_get_html($url);
$html3 = $this->getSimpleHTMLDOM($url);

// The request might fail due to missing https support or wrong URL
if($html3 == false)
Expand All @@ -27,7 +27,6 @@ private function CADExtractContent($url) {
default:
return 'Daily comic not released yet';
}

$img = implode ($url2[0]);
$html3->clear();
unset ($html3);
Expand All @@ -45,7 +44,7 @@ function CADUrl($string) {
return $string;
}

$html = $this->file_get_html('http://cdn2.cad-comic.com/rss.xml') or $this->returnServerError('Could not request CAD.');
$html = $this->getSimpleHTMLDOM('http://cdn2.cad-comic.com/rss.xml') or $this->returnServerError('Could not request CAD.');
$limit = 0;

foreach($html->find('item') as $element) {
Expand All @@ -67,4 +66,4 @@ public function getCacheDuration(){
return 3600*2; // 2 hours
}
}
?>
?>
4 changes: 2 additions & 2 deletions bridges/CNETBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ function CleanArticle($article_html) {
$this->topicName = $param['topic'];

$pageUrl = 'http://www.cnet.com/'.(empty($this->topicName) ? '' : 'topics/'.$this->topicName.'/');
$html = $this->file_get_html($pageUrl) or $this->returnServerError('Could not request CNET: '.$pageUrl);
$html = $this->getSimpleHTMLDOM($pageUrl) or $this->returnServerError('Could not request CNET: '.$pageUrl);
$limit = 0;

foreach($html->find('div.assetBody') as $element) {
Expand All @@ -64,7 +64,7 @@ function CleanArticle($article_html) {

if (!empty($article_title) && !empty($article_uri) && strpos($article_uri, '/news/') !== false) {

$article_html = $this->file_get_html($article_uri) or $this->returnServerError('Could not request CNET: '.$article_uri);
$article_html = $this->getSimpleHTMLDOM($article_uri) or $this->returnServerError('Could not request CNET: '.$article_uri);

$article_content = trim(CleanArticle(ExtractFromDelimiters($article_html, '<div class="articleContent', '<footer>')));

Expand Down
56 changes: 56 additions & 0 deletions bridges/CoinDeskBridge.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
<?php
/**
 * Bridge exposing the newest CoinDesk posts.
 *
 * Reads the Atom feed at http://www.coindesk.com/feed/atom/, keeps at most
 * MAX_ITEMS-style limit of 5 entries and, for each of them, downloads the
 * linked article to use its text as the item content.
 */
class CoinDeskBridge extends BridgeAbstract{

    /**
     * Fills in the descriptive metadata of this bridge.
     */
    public function loadMetadatas() {

        $this->maintainer = "mitsukarenai";
        $this->name = "CoinDesk";
        $this->uri = "http://www.coindesk.com/";
        $this->description = "Returns the 5 newest posts from CoinDesk (full text)";
        $this->update = "2014-05-30";

    }

    /**
     * Removes the CDATA opening and closing markers from a string.
     *
     * Declared as a private method rather than as a named function nested in
     * collectData(): a nested named function is registered globally the first
     * time the enclosing method runs, so a second call to collectData() would
     * fail with "Cannot redeclare".
     *
     * @param string $string Raw text, possibly wrapped in <![CDATA[ ... ]]>.
     * @return string The text with both markers removed.
     */
    private function CoinDeskStripCDATA($string) {
        return str_replace(array('<![CDATA[', ']]>'), '', $string);
    }

    /**
     * Downloads an article page and returns its body text.
     *
     * Must be a method (not a nested function) because it needs $this to
     * reach getSimpleHTMLDOM().
     *
     * @param string $url Address of the article page.
     * @return string Inner HTML of the first div.single-content element,
     *                stripped of every tag except <p>, <a> and <img>;
     *                an empty string when the page could not be loaded or
     *                the element is absent.
     */
    private function CoinDeskExtractContent($url) {
        $html2 = $this->getSimpleHTMLDOM($url);

        // The request might fail: do not dereference a falsy result.
        if(!$html2) {
            return '';
        }

        $node = $html2->find('div.single-content', 0);
        if(!$node) {
            return '';
        }

        return strip_tags($node->innertext, '<p><a><img>');
    }

    /**
     * Collects up to five items from the CoinDesk Atom feed into $this->items.
     *
     * @param array $param Request parameters (not read by this bridge).
     */
    public function collectData(array $param){

        // Same error helper as the other bridges use for failed requests.
        $html = $this->getSimpleHTMLDOM('http://www.coindesk.com/feed/atom/') or $this->returnServerError('Could not request CoinDesk.');
        $limit = 0;

        foreach($html->find('entry') as $element) {
            // Stop as soon as the limit is reached instead of walking the rest of the feed.
            if($limit >= 5) {
                break;
            }

            $item = new \Item();
            $item->title = $this->CoinDeskStripCDATA($element->find('title', 0)->innertext);
            $item->author = $element->find('author', 0)->plaintext;
            $item->uri = $element->find('link', 0)->href;
            $item->timestamp = strtotime($element->find('published', 0)->plaintext);
            $item->content = $this->CoinDeskExtractContent($item->uri);
            $this->items[] = $item;
            $limit++;
        }

    }

    /**
     * @return string Display name of the bridge.
     */
    public function getName(){
        return 'CoinDesk';
    }

    /**
     * @return string Base address of the source website.
     */
    public function getURI(){
        return 'http://www.coindesk.com/';
    }

    /**
     * @return int Cache lifetime in seconds.
     */
    public function getCacheDuration(){
        return 1800; // 30min
    }
}
2 changes: 1 addition & 1 deletion bridges/CollegeDeFranceBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public function collectData(array $param) {
* </a>
* </li>
*/
$html = $this->file_get_html('http://www.college-de-france.fr/components/search-audiovideo.jsp?fulltext=&siteid=1156951719600&lang=FR&type=all') or $this->returnServerError('Could not request CollegeDeFrance.');
$html = $this->getSimpleHTMLDOM('http://www.college-de-france.fr/components/search-audiovideo.jsp?fulltext=&siteid=1156951719600&lang=FR&type=all') or $this->returnServerError('Could not request CollegeDeFrance.');
foreach($html->find('a[data-target]') as $element) {
$item = new \Item();
$item->title = $element->find('.title', 0)->plaintext;
Expand Down
4 changes: 2 additions & 2 deletions bridges/CommonDreamsBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ public function loadMetadatas() {
}

private function CommonDreamsExtractContent($url) {
$html3 = $this->file_get_html($url);
$html3 = $this->getSimpleHTMLDOM($url);
$text = $html3->find('div[class=field--type-text-with-summary]', 0)->innertext;
$html3->clear();
unset ($html3);
Expand All @@ -25,7 +25,7 @@ function CommonDreamsUrl($string) {
return $string;
}

$html = $this->file_get_html('http://www.commondreams.org/rss.xml') or $this->returnServerError('Could not request CommonDreams.');
$html = $this->getSimpleHTMLDOM('http://www.commondreams.org/rss.xml') or $this->returnServerError('Could not request CommonDreams.');
$limit = 0;
foreach($html->find('item') as $element) {
if($limit < 4) {
Expand Down
8 changes: 4 additions & 4 deletions bridges/CopieDoubleBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@ public function loadMetadatas() {


public function collectData(array $param){
$html = $this->file_get_html('http://www.copie-double.com/') or $this->returnServerError('Could not request CopieDouble.');
$html = $this->getSimpleHTMLDOM('http://www.copie-double.com/') or $this->returnServerError('Could not request CopieDouble.');
$table = $html->find('table table', 2);

foreach($table->find('tr') as $element)
{
$td = $element->find('td', 0);
$cpt++;
if($td->class == "couleur_1")
{
$item = new Item();

$title = $td->innertext;
$pos = strpos($title, "<a");
$title = substr($title, 0, $pos);
Expand All @@ -33,7 +33,7 @@ public function collectData(array $param){
{
$a=$element->find("a", 0);
$item->uri = "http://www.copie-double.com" . $a->href;

$content = str_replace('src="/', 'src="http://www.copie-double.com/',$element->find("td", 0)->innertext);
$content = str_replace('href="/', 'href="http://www.copie-double.com/',$content);
$item->content = $content;
Expand Down
34 changes: 17 additions & 17 deletions bridges/CourrierInternationalBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,49 +12,49 @@ public function loadMetadatas() {
}

public function collectData(array $param){

$html = '';

$html = $this->file_get_html('http://www.courrierinternational.com/') or $this->returnServerError('Error.');

$html = $this->getSimpleHTMLDOM('http://www.courrierinternational.com/') or $this->returnServerError('Error.');




$element = $html->find("article");

$article_count = 1;
$article_count = 1;

foreach($element as $article) {

$item = new \Item();

$item->uri = $article->parent->getAttribute("href");

if(strpos($item->uri, "http") === FALSE) {
$item->uri = "http://courrierinternational.fr/".$item->uri;
}
$page = $this->file_get_html($item->uri);

$page = $this->getSimpleHTMLDOM($item->uri);

$cleaner = new HTMLSanitizer();

$item->content = $cleaner->sanitize($page->find("div.article-text")[0]);
$item->title = strip_tags($article->find(".title")[0]);

$dateTime = date_parse($page->find("time")[0]);

$item->timestamp = mktime(
$dateTime['hour'],
$dateTime['minute'],
$dateTime['second'],
$dateTime['month'],
$dateTime['day'],
$dateTime['hour'],
$dateTime['minute'],
$dateTime['second'],
$dateTime['month'],
$dateTime['day'],
$dateTime['year']
);

$this->items[] = $item;
$article_count ++;
if($article_count > 5) break;

}


Expand Down
Loading

0 comments on commit 3c0d13c

Please sign in to comment.