From c0f2ba9f9c628afcf6714b93eb72e074df6199b9 Mon Sep 17 00:00:00 2001
From: tom-dea
Date: Tue, 28 Feb 2023 14:33:15 +0100
Subject: [PATCH] Add stopAfter setting to limit number of records harvested (#13)

---
 etc/oai.ini                                   |  7 +++-
 src/OaiPmh/Harvester.php                      | 41 +++++++++++++++++--
 src/OaiPmh/HarvesterCommand.php               | 11 +++++
 .../src/VuFindTest/OaiPmh/HarvesterTest.php   | 39 ++++++++++++++++++
 4 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/etc/oai.ini b/etc/oai.ini
index 4cefe39..f7be577 100644
--- a/etc/oai.ini
+++ b/etc/oai.ini
@@ -28,6 +28,7 @@
 ; badXMLLog = bad.log
 ; httpUser = myUsername
 ; httpPass = myPassword
+; stopAfter = 100
 ;
 ; The section_name may be passed to harvest_oai.php as a parameter to harvest only
 ; records from that source. This is also the directory name that records will be
@@ -141,7 +142,11 @@
 ;
 ; httpPass is an optional password in case the OAI repository is behind HTTP basic
 ; authentication. It must be set in combination with httpUser.
-
+;
+; stopAfter may be set to a positive integer 'n' in order to stop harvesting once at
+; least 'n' records have been retrieved (the limit is checked between batches, so a
+; few more may be saved). It is intended for testing with smaller data sets.
+;
 ; SAMPLE CONFIGURATION FOR OPEN JOURNAL SYSTEMS
 ;[OJS]
 ;url = http://ojs.myuniversity.edu/oai
diff --git a/src/OaiPmh/Harvester.php b/src/OaiPmh/Harvester.php
index fcbdfea..fc7a699 100644
--- a/src/OaiPmh/Harvester.php
+++ b/src/OaiPmh/Harvester.php
@@ -106,6 +106,19 @@ class Harvester
      */
     protected $identifyResponse = null;
 
+    /**
+     * Maximum number of records to harvest (null = no limit).
+     * Used only for testing.
+     *
+     * @var ?int
+     */
+    protected $stopAfter = null;
+
+    /**
+     * Running count of the records harvested so far.
+     */
+    protected $recordsCount = 0;
+
     /**
      * Constructor.
      *
@@ -235,6 +248,17 @@ public function launch()
 
             // Keep harvesting as long as a resumption token is provided:
             while ($token !== false) {
+                // If stopAfter is set, stop once the running total reaches it:
+                if (!empty($this->stopAfter)
+                    && $this->recordsCount >= $this->stopAfter
+                ) {
+                    $this->writeLine(
+                        "reached limit of records to harvest: " . $this->stopAfter
+                    );
+                    $this->writeLine("stop harvesting.");
+                    $token = false;
+                    break;
+                }
                 // Save current state in case we need to resume later:
                 $this->stateManager->saveState(
                     $set,
@@ -246,9 +270,11 @@ public function launch()
             }
         }
 
-        // If we made it this far, all was successful. Save last harvest info
-        // and clean up the stored state.
-        $this->stateManager->saveDate($explicitHarvestEndDate);
+        // If we made it this far, all was successful. Clean up the stored state,
+        // and save the harvest date unless stopAfter may have cut the run short.
+        if (empty($this->stopAfter)) {
+            $this->stateManager->saveDate($explicitHarvestEndDate);
+        }
         $this->stateManager->clearState();
     }
 
@@ -312,9 +338,13 @@ protected function getRecords($params)
 
         // Save the records from the response:
         if ($response->ListRecords->record) {
+            $newRecords = count($response->ListRecords->record);
             $this->writeLine(
-                'Processing ' . count($response->ListRecords->record) . " records..."
+                '[' . $this->recordsCount . ' records harvested] Processing '
+                . $newRecords . " records..."
             );
+            // Add this batch to the running total checked against stopAfter:
+            $this->recordsCount += $newRecords;
             $this->writer->write($response->ListRecords->record);
         }
 
@@ -426,5 +456,8 @@ protected function storeMiscSettings($settings)
         if (isset($settings['dateGranularity'])) {
             $this->granularity = $settings['dateGranularity'];
         }
+        if (isset($settings['stopAfter'])) {
+            $this->stopAfter = (int)$settings['stopAfter'];
+        }
     }
 }
diff --git a/src/OaiPmh/HarvesterCommand.php b/src/OaiPmh/HarvesterCommand.php
index 049bf44..35baad5 100644
--- a/src/OaiPmh/HarvesterCommand.php
+++ b/src/OaiPmh/HarvesterCommand.php
@@ -274,6 +274,11 @@ protected function configure()
                 InputOption::VALUE_REQUIRED,
                 'Filename (relative to harvest directory) to log'
                 . ' XML fixed by sanitize setting'
+            )->addOption(
+                'stopAfter',
+                null,
+                InputOption::VALUE_REQUIRED,
+                'Stop harvesting once n records have been retrieved (for testing).'
             );
     }
 
@@ -365,6 +370,12 @@ protected function execute(InputInterface $input, OutputInterface $output)
         }
 
         // All done.
+        if (isset($settings['stopAfter'])) {
+            $this->writeLine(
+                'stopAfter option set; '
+                . 'all sources may not have been fully harvested.'
+            );
+        }
         if ($processed == 0 && $skipped > 0) {
             $this->writeLine(
                 'No valid settings found; '
diff --git a/tests/unit-tests/src/VuFindTest/OaiPmh/HarvesterTest.php b/tests/unit-tests/src/VuFindTest/OaiPmh/HarvesterTest.php
index e3a3add..c5a1a64 100644
--- a/tests/unit-tests/src/VuFindTest/OaiPmh/HarvesterTest.php
+++ b/tests/unit-tests/src/VuFindTest/OaiPmh/HarvesterTest.php
@@ -338,6 +338,45 @@ public function testListRecordsWithResumption()
         $harvester->launch();
     }
 
+    /**
+     * Test that harvesting is stopped after x records
+     * if stopAfter is set to x.
+     *
+     * @return void
+     */
+    public function testListRecordsWithStopAfterOption()
+    {
+        $comm = $this->getMockCommunicator();
+        $expectedSettings0 = [
+            'metadataPrefix' => 'oai_dc', 'set' => 'xyzzy',
+            'from' => '2016-07-01', 'until' => '2016-07-31',
+        ];
+        $comm->expects($this->exactly(1))->method('request')
+            ->withConsecutive(
+                ['ListRecords', $expectedSettings0],
+            )
+            ->willReturnOnConsecutiveCalls(
+                simplexml_load_string($this->getFakeResponse())
+            );
+        $writer = $this->getMockRecordWriter();
+        $writer->expects($this->exactly(1))->method('write')
+            ->with($this->isInstanceOf('SimpleXMLElement'));
+        $sm = $this->getMockStateManager();
+        $sm->expects($this->once())->method('clearState');
+        $sm->expects($this->never())->method('saveState');
+        $harvester = $this->getHarvester(
+            [
+                'set' => 'xyzzy', 'dateGranularity' => 'YYYY-MM-DDThh:mm:ssZ',
+                'from' => '2016-07-01', 'until' => '2016-07-31',
+                'stopAfter' => 100
+            ],
+            $comm,
+            $writer,
+            $sm
+        );
+        $harvester->launch();
+    }
+
     /**
      * Test a bad resumption token error.
      *