Skip to content

Commit

Permalink
Improve ODText Content Reader
Browse files Browse the repository at this point in the history
Fix PHPOffice#2493. There is much that the ODT Reader ignores. This change adds support for the `text:section`, `text:span`, `text:s`, and `text:tab` tags, thereby handling multiple sections, text runs, tab characters, and multiple spaces. There will still be many omissions (e.g. styles and tables), but you will now often be able to access the text content of valid ODT documents. The issue involves two variations of the input: a simple file created by LibreOffice on its own, and a similar file created by PhpWord. Both are unit-tested.

A `getText` method is added to TextRun to facilitate testing (and can be useful on its own). It will return the concatenated texts of all elements of the text run.
  • Loading branch information
oleibman committed Nov 12, 2023
1 parent a836c32 commit e4d4d99
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 10 deletions.
5 changes: 0 additions & 5 deletions phpstan-baseline.neon
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,6 @@ parameters:
count: 1
path: src/PhpWord/Reader/HTML.php

-
message: "#^Call to an undefined method DOMNode\\:\\:getAttribute\\(\\)\\.$#"
count: 2
path: src/PhpWord/Reader/ODText/Content.php

-
message: "#^Offset 'textNodes' on array\\{changed\\: PhpOffice\\\\PhpWord\\\\Element\\\\TrackChange, textNodes\\: DOMNodeList\\<DOMElement\\>\\} in isset\\(\\) always exists and is not nullable\\.$#"
count: 1
Expand Down
12 changes: 12 additions & 0 deletions src/PhpWord/Element/TextRun.php
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,16 @@ public function setParagraphStyle($style = null)

return $this->paragraphStyle;
}

/**
 * Return the concatenated text of every Text element in this text run.
 *
 * Elements that are not instances of Text are ignored.
 */
public function getText(): string
{
    $pieces = [];
    foreach ($this->getElements() as $item) {
        if (!($item instanceof Text)) {
            continue;
        }
        $pieces[] = $item->getText();
    }

    return implode('', $pieces);
}
}
80 changes: 75 additions & 5 deletions src/PhpWord/Reader/ODText/Content.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
namespace PhpOffice\PhpWord\Reader\ODText;

use DateTime;
use DOMElement;
use DOMNodeList;
use PhpOffice\PhpWord\Element\Section;
use PhpOffice\PhpWord\Element\TrackChange;
use PhpOffice\PhpWord\PhpWord;
use PhpOffice\PhpWord\Shared\XMLReader;
Expand All @@ -29,6 +32,9 @@
*/
class Content extends AbstractPart
{
/** @var ?Section */
private $section;

/**
* Read content.xml.
*/
Expand All @@ -40,18 +46,31 @@ public function read(PhpWord $phpWord): void
$trackedChanges = [];

$nodes = $xmlReader->getElements('office:body/office:text/*');
$this->section = null;
$this->processNodes($nodes, $xmlReader, $phpWord);
$this->section = null;
}

/** @param DOMNodeList<DOMElement> $nodes */
public function processNodes(DOMNodeList $nodes, XMLReader $xmlReader, PhpWord $phpWord): void
{
if ($nodes->length > 0) {
$section = $phpWord->addSection();
foreach ($nodes as $node) {
// $styleName = $xmlReader->getAttribute('text:style-name', $node);
switch ($node->nodeName) {
case 'text:h': // Heading
$depth = $xmlReader->getAttribute('text:outline-level', $node);
$section->addTitle($node->nodeValue, $depth);
$this->getSection($phpWord)->addTitle($node->nodeValue, $depth);

break;
case 'text:p': // Paragraph
$styleName = $xmlReader->getAttribute('text:style-name', $node);
if (substr($styleName, 0, 2) === 'SB') {
break;
}
$children = $node->childNodes;
$spans = false;
/** @var DOMElement $child */
foreach ($children as $child) {
switch ($child->nodeName) {
case 'text:change-start':
Expand All @@ -71,16 +90,50 @@ public function read(PhpWord $phpWord): void
$changed = $trackedChanges[$changeId];
}

break;
case 'text:span':
$spans = true;

break;
}
}

$element = $section->addText($node->nodeValue);
if ($spans) {
$element = $this->getSection($phpWord)->addTextRun();
foreach ($children as $child) {
switch ($child->nodeName) {
case 'text:span':
/** @var DOMElement $child2 */
foreach ($child->childNodes as $child2) {
switch ($child2->nodeName) {
case '#text':
$element->addText($child2->nodeValue);

break;
case 'text:tab':
$element->addText("\t");

break;
case 'text:s':
$spaces = (int) $child2->getAttribute('text:c') ?: 1;
$element->addText(str_repeat(' ', $spaces));

break;
}
}

break;

}
}
} else {
$element = $this->getSection($phpWord)->addText($node->nodeValue);
}
if (isset($changed) && is_array($changed)) {
$element->setTrackChange($changed['changed']);
if (isset($changed['textNodes'])) {
foreach ($changed['textNodes'] as $changedNode) {
$element = $section->addText($changedNode->nodeValue);
$element = $this->getSection($phpWord)->addText($changedNode->nodeValue);
$element->setTrackChange($changed['changed']);
}
}
Expand All @@ -91,7 +144,7 @@ public function read(PhpWord $phpWord): void
$listItems = $xmlReader->getElements('text:list-item/text:p', $node);
foreach ($listItems as $listItem) {
// $listStyleName = $xmlReader->getAttribute('text:style-name', $listItem);
$section->addListItem($listItem->nodeValue, 0);
$this->getSection($phpWord)->addListItem($listItem->nodeValue, 0);
}

break;
Expand All @@ -110,9 +163,26 @@ public function read(PhpWord $phpWord): void
$trackedChanges[$changedRegion->getAttribute('text:id')] = ['changed' => $changed, 'textNodes' => $textNodes];
}

break;
case 'text:section': // Section
// $sectionStyleName = $xmlReader->getAttribute('text:style-name', $listItem);
$this->section = $phpWord->addSection();
$children = $node->childNodes;
$this->processNodes($children, $xmlReader, $phpWord);

break;
}
}
}
}

/**
 * Return the section currently being filled, creating one on first use.
 *
 * When no section is active, a new one is added to the document and
 * remembered so that subsequent calls return the same instance.
 */
private function getSection(PhpWord $phpWord): Section
{
    if ($this->section !== null) {
        return $this->section;
    }

    $created = $phpWord->addSection();
    $this->section = $created;

    return $created;
}
}
83 changes: 83 additions & 0 deletions tests/PhpWordTests/Reader/ODText/ODTextSectionTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
<?php
/**
* This file is part of PHPWord - A pure PHP library for reading and writing
* word processing documents.
*
* PHPWord is free software distributed under the terms of the GNU Lesser
* General Public License version 3 as published by the Free Software Foundation.
*
* For the full copyright and license information, please read the LICENSE
* file that was distributed with this source code. For the full list of
* contributors, visit https://github.com/PHPOffice/PHPWord/contributors.
*
* @see https://github.com/PHPOffice/PHPWord
*
* @license http://www.gnu.org/licenses/lgpl.txt LGPL version 3
*/

namespace PhpOffice\PhpWordTests\Reader\ODText;

use PhpOffice\PhpWord\IOFactory;
use PhpOffice\PhpWord\PhpWord;
use PhpOffice\PhpWord\Settings;

/**
 * Round-trip and read tests for ODText documents with and without
 * explicit sections.
 */
class ODTextSectionTest extends \PHPUnit\Framework\TestCase
{
    /** @var string Path of the temporary file written by a test; '' when there is none. */
    private $filename = '';

    /** @var bool Output-escaping setting that was in effect before the test started. */
    private $outputEscapingEnabled = false;

    protected function setUp(): void
    {
        // Remember the global setting so tearDown can put it back.
        $this->outputEscapingEnabled = Settings::isOutputEscapingEnabled();
    }

    protected function tearDown(): void
    {
        // Settings is global static state; restore it so other tests are unaffected.
        Settings::setOutputEscapingEnabled($this->outputEscapingEnabled);

        if ($this->filename !== '') {
            // The name is recorded before save(), so the file may not exist if
            // save() failed; an unconditional unlink would add a second warning
            // on top of the real failure.
            if (is_file($this->filename)) {
                unlink($this->filename);
            }
            $this->filename = '';
        }
    }

    /**
     * Write a document with the ODText writer, read it back, and check that
     * the element texts survive the round trip.
     */
    public function testWriteThenReadSection(): void
    {
        Settings::setOutputEscapingEnabled(true);
        $phpWord = new PhpWord();
        $section = $phpWord->addSection();
        $inputText = ['days', 'monday', 'tuesday'];
        $inputText[] = "Tab\tthen two spaces then done.";
        foreach ($inputText as $text) {
            $section->addText($text);
        }
        $writer = IOFactory::createWriter($phpWord, 'ODText');
        $this->filename = self::filesDir() . '/sectiontest.odt';
        $writer->save($this->filename);

        self::assertSame($inputText, self::readElementTexts($this->filename));
    }

    /**
     * Read a fixture document and check the element texts that come back.
     */
    public function testReadNoSections(): void
    {
        $inputText = ['days', 'monday', 'tuesday'];
        $filename = self::filesDir() . '/documents/word.2493.nosection.odt';

        self::assertSame($inputText, self::readElementTexts($filename));
    }

    /**
     * Absolute path of the shared test-files directory (tests/PhpWordTests/_files),
     * resolved from this file's location so the tests do not depend on the
     * current working directory.
     */
    private static function filesDir(): string
    {
        return dirname(__DIR__, 2) . '/_files';
    }

    /**
     * Load an ODText file and collect getText() from every section element
     * that exposes such a method, in document order.
     *
     * @return array<int, mixed>
     */
    private static function readElementTexts(string $filename): array
    {
        $reader = IOFactory::createReader('ODText');
        $phpWord = $reader->load($filename);
        $texts = [];
        foreach ($phpWord->getSections() as $section) {
            foreach ($section->getElements() as $element) {
                if (is_object($element) && method_exists($element, 'getText')) {
                    $texts[] = $element->getText();
                }
            }
        }

        return $texts;
    }
}
Binary file not shown.

0 comments on commit e4d4d99

Please sign in to comment.