Skip to content

Commit

Permalink
Don't fail on libxml errors if the RSD URL can still be found
Browse files Browse the repository at this point in the history
Various things, such as duplicate element IDs or repeated
attributes, break DOMDocument::loadHTML() and so this turns off
the direct reporting (E_ERROR) of these and instead adds them
to the RsdException if the RSD URL really can't be determined
from the HTML. In most cases, the URL can be found correctly
and the errors can be disregarded.

A test is added for this as well, although it does *not* test
for the case when the RSD URL can't be found and there are
libxml errors (because we need to serve up a broken HTML file,
and the mediawiki-api-base test system can only interact with
MediaWiki via the API, which makes it hard to produce broken
HTML that doesn't also have the correct LINK element for the
RSD URL).

A new TestEnvironment::savePage method is added, for easier
creation of test wiki pages.

Bug: https://phabricator.wikimedia.org/T163527
  • Loading branch information
samwilson authored and addshore committed May 3, 2017
1 parent 246f0f5 commit 0e35e20
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 4 deletions.
23 changes: 19 additions & 4 deletions src/MediawikiApi.php
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,30 @@ public static function newFromApiEndpoint( $apiEndpoint ) {
* @throws RsdException If the RSD URL could not be found in the page's HTML.
*/
public static function newFromPage( $url ) {
// Set up HTTP client and HTML document.
$tempClient = new Client( [ 'headers' => [ 'User-Agent' => 'addwiki-mediawiki-client' ] ] );

// Get the page HTML and extract the RSD link.
$pageHtml = $tempClient->get( $url )->getBody();
$pageDoc = new DOMDocument();

// Try to load the HTML (turn off errors temporarily; most don't matter, and if they do get
// in the way of finding the API URL, will be reported in the RsdException below).
$internalErrors = libxml_use_internal_errors( true );
$pageDoc->loadHTML( $pageHtml );
$link = ( new DOMXpath( $pageDoc ) )->query( 'head/link[@type="application/rsd+xml"][@href]' );
$libXmlErrors = libxml_get_errors();
libxml_use_internal_errors( $internalErrors );

// Extract the RSD link.
$xpath = 'head/link[@type="application/rsd+xml"][@href]';
$link = ( new DOMXpath( $pageDoc ) )->query( $xpath );
if ( $link->length === 0 ) {
throw new RsdException( "Unable to find RSD URL in page: $url" );
// Format libxml errors for display.
$libXmlErrorStr = array_reduce( $libXmlErrors, function( $prevErr, $err ) {
return $prevErr . ', ' . $err->message . ' (line '.$err->line . ')';
} );
if ( $libXmlErrorStr ) {
$libXmlErrorStr = "In addition, libxml had the following errors: $libXmlErrorStr";
}
throw new RsdException( "Unable to find RSD URL in page: $url $libXmlErrorStr" );
}
$rsdUrl = $link->item( 0 )->attributes->getnamedItem( 'href' )->nodeValue;

Expand Down
22 changes: 22 additions & 0 deletions tests/Integration/MediawikiApiTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,28 @@ public function testNewFromPageInvalidHtml() {
MediawikiApi::newFromPage( $nonWikiPage );
}

/**
 * Duplicate element IDs break DOMDocument::loadHTML; check that an API object
 * is still returned by newFromPage both without and with such duplicates.
 * @see https://phabricator.wikimedia.org/T163527#3219833
 * @covers Mediawiki\Api\MediawikiApi::newFromPage
 */
public function testNewFromPageWithDuplicateId() {
	$environment = TestEnvironment::newInstance();
	$pageTitle = __METHOD__;
	$pageUrl = str_replace( 'api.php', 'index.php?title=' . $pageTitle, $environment->getApiUrl() );

	// The same page is saved twice: first with a unique ID as a baseline,
	// then with an ID shared by two elements.
	$wikiTexts = [
		'<p id="unique-id"></p>',
		'<p id="duplicated-id"></p><div id="duplicated-id"></div>',
	];
	foreach ( $wikiTexts as $wikiText ) {
		$environment->savePage( $pageTitle, $wikiText );
		$api = MediawikiApi::newFromPage( $pageUrl );
		$this->assertInstanceOf( MediawikiApi::class, $api );
	}
}

/**
* @covers Mediawiki\Api\MediawikiApi::getRequest
* @covers Mediawiki\Api\MediawikiApi::getClientRequestOptions
Expand Down
16 changes: 16 additions & 0 deletions tests/Integration/TestEnvironment.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

use Exception;
use Mediawiki\Api\MediawikiApi;
use Mediawiki\Api\SimpleRequest;

/**
* @author Addshore
Expand Down Expand Up @@ -68,4 +69,19 @@ public function getApi() {
return $this->api;
}

/**
 * Save a wiki page.
 *
 * Posts an 'edit' request through this environment's API object, sending the
 * title, the text, an MD5 hash of the text, and a token obtained from the API.
 *
 * @param string $title
 * @param string $content
 */
public function savePage( $title, $content ) {
	$editRequest = new SimpleRequest( 'edit', [
		'title' => $title,
		'text' => $content,
		'md5' => md5( $content ),
		'token' => $this->api->getToken(),
	] );
	$this->api->postRequest( $editRequest );
}
}

0 comments on commit 0e35e20

Please sign in to comment.