Skip to content

Commit

Permalink
Add stopAfter setting to limit number of records harvested (#13)
Browse files Browse the repository at this point in the history
  • Loading branch information
tom-dea authored Feb 28, 2023
1 parent 55413fa commit c0f2ba9
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 5 deletions.
7 changes: 6 additions & 1 deletion etc/oai.ini
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
; badXMLLog = bad.log
; httpUser = myUsername
; httpPass = myPassword
; stopAfter = 100
;
; The section_name may be passed to harvest_oai.php as a parameter to harvest only
; records from that source. This is also the directory name that records will be
Expand Down Expand Up @@ -141,7 +142,11 @@
;
; httpPass is an optional password in case the OAI repository is behind HTTP basic
; authentication. It must be set in combination with httpUser.

;
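; stopAfter may be set to a positive integer 'n' in order to stop harvesting once
; at least 'n' records have been harvested. The limit is checked between requests,
; so the last batch may push the total slightly above 'n'. This option is intended
; for testing purposes: it allows the harvesting of smaller data sets. When it is
; set, the last harvest date is not saved.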
;
; SAMPLE CONFIGURATION FOR OPEN JOURNAL SYSTEMS
;[OJS]
;url = http://ojs.myuniversity.edu/oai
Expand Down
41 changes: 37 additions & 4 deletions src/OaiPmh/Harvester.php
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,19 @@ class Harvester
*/
protected $identifyResponse = null;

/**
 * Maximum number of records to harvest before stopping (null = no limit).
 * Intended for testing with smaller data sets.
 *
 * NOTE(review): the value is copied as-is from the settings array, so it may
 * arrive as a numeric string rather than an int -- confirm against callers.
 *
 * @var ?int
 */
protected $stopAfter = null;

/**
 * Running count of harvested records; compared against $stopAfter to decide
 * when to stop harvesting.
 *
 * @var int
 */
protected $recordsCount = 0;

/**
* Constructor.
*
Expand Down Expand Up @@ -235,6 +248,17 @@ public function launch()

// Keep harvesting as long as a resumption token is provided:
while ($token !== false) {
// If stopAfter is set, stop harvesting after given limit
if (!empty($this->stopAfter)
&& $this->recordsCount >= $this->stopAfter
) {
$this->writeLine(
"reached limit of records to harvest: " . $this->stopAfter
);
$this->writeLine("stop harvesting.");
$token = false;
break;
}
// Save current state in case we need to resume later:
$this->stateManager->saveState(
$set,
Expand All @@ -246,9 +270,11 @@ public function launch()
}
}

// If we made it this far, all was successful. Save last harvest info
// and clean up the stored state.
$this->stateManager->saveDate($explicitHarvestEndDate);
// If we made it this far, all was successful. Save last harvest info and
// clean up the stored state (unless we have a limit imposed by stopAfter)
if (empty($this->stopAfter)) {
$this->stateManager->saveDate($explicitHarvestEndDate);
}
$this->stateManager->clearState();
}

Expand Down Expand Up @@ -312,9 +338,13 @@ protected function getRecords($params)

// Save the records from the response:
if ($response->ListRecords->record) {
$newRecords = count($response->ListRecords->record);
$this->writeLine(
'Processing ' . count($response->ListRecords->record) . " records..."
'[' . $this->recordsCount . ' records harvested] Processing '
. $newRecords . " records..."
);
// count numRecords
$this->recordsCount += $newRecords;
$this->writer->write($response->ListRecords->record);
}

Expand Down Expand Up @@ -426,5 +456,8 @@ protected function storeMiscSettings($settings)
if (isset($settings['dateGranularity'])) {
$this->granularity = $settings['dateGranularity'];
}
if (isset($settings['stopAfter'])) {
$this->stopAfter = $settings['stopAfter'];
}
}
}
11 changes: 11 additions & 0 deletions src/OaiPmh/HarvesterCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,11 @@ protected function configure()
InputOption::VALUE_REQUIRED,
'Filename (relative to harvest directory) to log'
. ' XML fixed by sanitize setting'
)->addOption(
'stopAfter',
null,
InputOption::VALUE_NONE,
'an option to stop harvesting after the first n records of each set.'
);
}

Expand Down Expand Up @@ -365,6 +370,12 @@ protected function execute(InputInterface $input, OutputInterface $output)
}

// All done.
if (isset($settings['stopAfter'])) {
$this->writeLine(
'stopAfter option set; '
. 'all sources may not have been fully harvested.'
);
}
if ($processed == 0 && $skipped > 0) {
$this->writeLine(
'No valid settings found; '
Expand Down
39 changes: 39 additions & 0 deletions tests/unit-tests/src/VuFindTest/OaiPmh/HarvesterTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,45 @@ public function testListRecordsWithResumption()
$harvester->launch();
}

/**
 * Test harvesting with the stopAfter setting enabled.
 *
 * Expects a single ListRecords request and a single write, and verifies the
 * state handling that applies when a limit is configured: stored state is
 * cleared, no intermediate state is saved, and the last harvest date is NOT
 * saved (a limited harvest must not be recorded as a complete one).
 *
 * @return void
 */
public function testListRecordsWithStopAfterOption()
{
    $settings = [
        'set' => 'xyzzy',
        'dateGranularity' => 'YYYY-MM-DDThh:mm:ssZ',
        'from' => '2016-07-01',
        'until' => '2016-07-31',
        'stopAfter' => 100,
    ];
    $expectedParams = [
        'metadataPrefix' => 'oai_dc',
        'set' => 'xyzzy',
        'from' => '2016-07-01',
        'until' => '2016-07-31',
    ];

    // Only one request is expected, so plain with()/willReturn() is enough;
    // this avoids the deprecated withConsecutive() API.
    $comm = $this->getMockCommunicator();
    $comm->expects($this->once())->method('request')
        ->with('ListRecords', $expectedParams)
        ->willReturn(simplexml_load_string($this->getFakeResponse()));

    $writer = $this->getMockRecordWriter();
    $writer->expects($this->once())->method('write')
        ->with($this->isInstanceOf('SimpleXMLElement'));

    $sm = $this->getMockStateManager();
    $sm->expects($this->once())->method('clearState');
    $sm->expects($this->never())->method('saveState');
    // With stopAfter set, the harvest end date must not be persisted.
    $sm->expects($this->never())->method('saveDate');

    $harvester = $this->getHarvester($settings, $comm, $writer, $sm);
    $harvester->launch();
}

/**
* Test a bad resumption token error.
*
Expand Down

0 comments on commit c0f2ba9

Please sign in to comment.