Skip to content

Commit

Permalink
perf: push kb (#831)
Browse files Browse the repository at this point in the history
* perf: PushKB improvements

Initial crack at improving pushKB ingest times. Currently ~0.17 per title (~0.03 for normal ingest)

* chore: Commented out logging for now to see if we get a speed boost by happy-accident

* chore: Removed commented code

Remove old commented out code

* refactor: PackageIngestService lookupOrCreatePackage

Split out lookup/update/create package methods so that we can have more granular decisions on when and where packages get created for pushKB process

* chore: added test1 tenant sample data file

* chore: Comment removal

Removed commented out logdebugs

* chore: Newline
  • Loading branch information
EthanFreestone authored Oct 29, 2024
1 parent 5c83c49 commit 29c282f
Show file tree
Hide file tree
Showing 5 changed files with 165 additions and 81 deletions.
109 changes: 69 additions & 40 deletions service/grails-app/services/org/olf/PackageIngestService.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import static org.springframework.transaction.annotation.Propagation.REQUIRES_NE

import java.util.concurrent.TimeUnit

import org.olf.general.Org

import org.olf.general.StringUtils
import org.olf.general.IngestException

Expand Down Expand Up @@ -135,14 +137,11 @@ class PackageIngestService implements DataBinder {
// ENSURE MDC title is set as early as possible
MDC.put('title', StringUtils.truncate(pc.title.toString()))

// log.debug("Try to resolve ${pc}")

try {
PackageContentItem.withNewSession { tsess ->
PackageContentItem.withNewTransaction { status ->
// Delegate out to TitleIngestService so that any shared steps can move there.
Map titleIngestResult = titleIngestService.upsertTitle(pc, kb, trustedSourceTI)

// titleIngestResult.titleInstanceId will be non-null IFF TitleIngestService managed to find a title with that Id.
if ( titleIngestResult.titleInstanceId != null ) {
// Pass off to new hierarchy method (?)
Expand Down Expand Up @@ -243,14 +242,13 @@ class PackageIngestService implements DataBinder {
}
}

/*
* Separate out the create or lookup pkg code, so that it can
* be used both by the ingest service (via upsert pkg), as well
* as the pushKBService (directly)
*
* This method ALSO provides update information for packages.
/*
* We split out the lookup vs the update vs the create code for packages,
* as there is potentially differences in behaviour between pushKB and harvest.
*/
public Pkg lookupOrCreatePkg(PackageSchema package_data) {

// This WILL NOT update or create a package from the data.
public Pkg lookupPkg(PackageSchema package_data) {
Pkg pkg = null

// header.packageSlug contains the package maintainers authoritative identifier for this package.
Expand All @@ -266,8 +264,16 @@ class PackageIngestService implements DataBinder {
}
}

def vendor = null
if ( ( package_data.header?.packageProvider?.name != null ) && ( package_data.header?.packageProvider?.name.trim().length() > 0 ) ) {
return pkg;
}

public Org getVendorFromPackageData(PackageSchema package_data) {
Org vendor = null;

if (
package_data.header?.packageProvider?.name != null &&
package_data.header?.packageProvider?.name.trim().length() > 0
) {
log.debug("Package contains provider information: ${package_data.header?.packageProvider?.name} -- trying to match to an existing organisation.")
vendor = dependentModuleProxyService.coordinateOrg(package_data.header?.packageProvider?.name)
// reference has been removed at the request of the UI team
Expand All @@ -277,18 +283,66 @@ class PackageIngestService implements DataBinder {
log.warn('Package ingest - no provider information present')
}

return vendor;
}

public Pkg lookupPkgAndUpdate(PackageSchema package_data) {
Pkg pkg = lookupPkg(package_data)
Org vendor = getVendorFromPackageData(package_data)
// Do update step but NOT create step
if (pkg != null) {
pkg.sourceDataUpdated = package_data.header.sourceDataUpdated

if (package_data.header.lifecycleStatus) {
pkg.lifecycleStatusFromString = package_data.header.lifecycleStatus
}

if (package_data.header.availabilityScope) {
pkg.availabilityScopeFromString = package_data.header.availabilityScope
}

pkg.vendor = vendor
pkg.description = package_data.header.description
pkg.name = package_data.header.packageName
pkg.save(failOnError:true)

// Call separate methods for updating collections for code cleanliness
// These methods are responsible for their own saves
updateContentTypes(pkg.id, package_data)
updateAlternateNames(pkg.id, package_data)
updateAvailabilityConstraints(pkg.id, package_data)
updatePackageDescriptionUrls(pkg.id, package_data)
}

return pkg;
}


/*
* Separate out the create or lookup pkg code, so that it can
* be used both by the ingest service (via upsert pkg), as well
* as the pushKBService (directly)
*
* This method ALSO updates information for packages.
*/
public Pkg lookupOrCreatePkg(PackageSchema package_data) {
// This takes care of any updates
Pkg pkg = lookupPkgAndUpdate(package_data);

// If pkg is null, then we can safely create a new one
if ( pkg == null ) {
Org vendor = getVendorFromPackageData(package_data)
pkg = new Pkg(
name: package_data.header.packageName,
source: package_data.header.packageSource,
name: package_data.header.packageName,
source: package_data.header.packageSource,
reference: package_data.header.packageSlug,
description: package_data.header.description,
sourceDataCreated: package_data.header.sourceDataCreated,
sourceDataUpdated: package_data.header.sourceDataUpdated,
sourceTitleCount: package_data.header.sourceTitleCount,
availabilityScope: ( package_data.header.availabilityScope != null ? Pkg.lookupOrCreateAvailabilityScope(package_data.header.availabilityScope) : null ),
lifecycleStatus: Pkg.lookupOrCreateLifecycleStatus(package_data.header.lifecycleStatus != null ? package_data.header.lifecycleStatus : 'Unknown'),
vendor: vendor,
vendor: vendor,
).save(flush:true, failOnError:true)

(package_data?.header?.contentTypes ?: []).each {
Expand All @@ -308,28 +362,6 @@ class PackageIngestService implements DataBinder {
}

pkg.save(failOnError: true)
} else {
pkg.sourceDataUpdated = package_data.header.sourceDataUpdated

if (package_data.header.lifecycleStatus) {
pkg.lifecycleStatusFromString = package_data.header.lifecycleStatus
}

if (package_data.header.availabilityScope) {
pkg.availabilityScopeFromString = package_data.header.availabilityScope
}

pkg.vendor = vendor
pkg.description = package_data.header.description
pkg.name = package_data.header.packageName
pkg.save(failOnError:true)

// Call separate methods for updating collections for code cleanliness
// These methods are responsible for their own saves
updateContentTypes(pkg.id, package_data)
updateAlternateNames(pkg.id, package_data)
updateAvailabilityConstraints(pkg.id, package_data)
updatePackageDescriptionUrls(pkg.id, package_data)
}

return pkg;
Expand Down Expand Up @@ -475,7 +507,6 @@ class PackageIngestService implements DataBinder {
}

Platform platform = Platform.resolve(platform_url_to_use, pc.platformName)
// log.debug("Platform: ${platform}")

if ( platform == null && PROXY_MISSING_PLATFORM ) {
platform = Platform.resolve('http://localhost.localdomain', 'This platform entry is used for error cases')
Expand Down Expand Up @@ -528,8 +559,6 @@ class PackageIngestService implements DataBinder {
pciStatus: 'none' // This should be 'none', 'updated' or 'new'
]

// log.debug("platform ${pc.platformUrl} ${pc.platformName} (item URL is ${pc.url})")

// lets try and work out the platform for the item
Platform platform = lookupOrCreatePlatform(pc);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class TitleIngestService implements DataBinder {
TitleInstance.withNewTransaction {
if (!kb) {
// This KB is created without a Type... so if it was trusted for TI data then it'd fail secondary enrichment
kb = new RemoteKB( name:remotekbname,
kb = new RemoteKB( name:remotekbname,
rectype: RemoteKB.RECTYPE_TITLE,
active: Boolean.TRUE,
readonly:readOnly,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,6 @@ order by pj.dateCreated
org.slf4j.MDC.clear()
org.slf4j.MDC.setContextMap( jobId: '' + jid, tenantId: '' + tid, 'tenant': '' + tenantName)
JobContext.current.set(new JobContext( jobId: jid, tenantId: tid ))
//log.debug("LOGDEBUG TID: ${tId}")
Tenants.withId(tid) {
beginJob(jid)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ class PushKBService implements DataBinder {
updatedAccessEnd: 0,
]
KBIngressType ingressType = kbManagementBean.ingressType

if (ingressType == KBIngressType.PushKB) {
try {
pcis.each { Map record ->
Expand All @@ -133,47 +132,64 @@ class PushKBService implements DataBinder {
if (utilityService.checkValidBinding(pc)) {
try {
Pkg pkg = null;
Pkg.withNewTransaction { status ->
// TODO this will allow the PCI data to update the PKG record... do we want this?
pkg = packageIngestService.lookupOrCreatePackageFromTitle(pc);
log.debug("LOGGING PACKAGE OBTAINED FROM PCI: ${pkg}")

Map titleIngestResult = titleIngestService.upsertTitleDirect(pc)
log.debug("LOGGING titleIngestResult: ${titleIngestResult}")

if ( titleIngestResult.titleInstanceId != null ) {
Map hierarchyResult = packageIngestService.lookupOrCreateTitleHierarchy(
titleIngestResult.titleInstanceId,
pkg.id,
true,
pc,
result.updateTime,
result.titleCount // FIXME not sure about this
)

PackageContentItem pci = PackageContentItem.get(hierarchyResult.pciId)
packageIngestService.hierarchyResultMapLogic(hierarchyResult, result, pci)

/* TODO figure out if use of removedTimestamp
* should be something harvest also needs to do directly
* And whether we should be doing it after all the above
* or before.
*/
if (pc.removedTimestamp) {
try {
log.debug("Removal candidate: pci.id #${pci.id} (Last seen ${pci.lastSeenTimestamp}, thisUpdate ${result.updateTime}) -- Set removed")
pci.removedTimestamp = pc.removedTimestamp
pci.save(failOnError:true)
} catch ( Exception e ) {
log.error("Problem removing ${pci} in package load", e)
Pkg.withSession { currentSess ->
Pkg.withTransaction {
Pkg.withNewSession { newSess ->
Pkg.withTransaction {
// TODO this will allow the PCI data to update the PKG record... do we want this?

pkg = packageIngestService.lookupOrCreatePackageFromTitle(pc);
}
newSess.clear()
}
}
}

TitleInstance.withSession { currentSess ->
TitleInstance.withTransaction {
TitleInstance.withNewSession { newSess ->
TitleInstance.withTransaction {
Map titleIngestResult = titleIngestService.upsertTitleDirect(pc)

if ( titleIngestResult.titleInstanceId != null ) {

Map hierarchyResult = packageIngestService.lookupOrCreateTitleHierarchy(
titleIngestResult.titleInstanceId,
pkg.id,
true,
pc,
result.updateTime,
result.titleCount // Not totally sure this is valuable here
)

PackageContentItem pci = PackageContentItem.get(hierarchyResult.pciId)
packageIngestService.hierarchyResultMapLogic(hierarchyResult, result, pci)

/* TODO figure out if use of removedTimestamp
* should be something harvest also needs to do directly
* And whether we should be doing it after all the above
* or before.
*/
if (pc.removedTimestamp) {
try {
log.debug("Removal candidate: pci.id #${pci.id} (Last seen ${pci.lastSeenTimestamp}, thisUpdate ${result.updateTime}) -- Set removed")
pci.removedTimestamp = pc.removedTimestamp
pci.save(failOnError:true)
} catch ( Exception e ) {
log.error("Problem removing ${pci} in package load", e)
}
result.removedTitles++
}
} else {
String message = "Skipping \"${pc.title}\". Unable to resolve title from ${pc.title} with identifiers ${pc.instanceIdentifiers}"
log.error(message)
}
}
result.removedTitles++
newSess.clear()
}
} else {
String message = "Skipping \"${pc.title}\". Unable to resolve title from ${pc.title} with identifiers ${pc.instanceIdentifiers}"
log.error(message)
}
}

} catch ( IngestException ie ) {
// When we've caught an ingest exception, should have helpful error log message
String message = "Skipping \"${pc.title}\": ${ie.message}"
Expand All @@ -196,7 +212,7 @@ class PushKBService implements DataBinder {
} */
}

long finishedTime = (System.currentTimeMillis()-result.startTime)/1000
long finishedTime = (System.currentTimeMillis()-result.startTime) // Don't divide by 1000 here
result.success = true

// TODO Logging may need tweaking between pushKB and harvest
Expand Down
40 changes: 40 additions & 0 deletions service/src/main/okapi/tenant/sample_data/test1-data.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// DEV NOTE -- This is what will be used for the default rancher-desktop-db/dc environments
// For vagrant development (which I think is deprecated now) there may need to be a different file
// to avoid making changes to diku-data or _data.
log.info "Running test1-data tenant sample data file"

import org.olf.kb.RemoteKB

/* RemoteKB.findByName('GOKb_TEST') ?: (new RemoteKB(
name:'GOKb_TEST',
type:'org.olf.kb.adapters.GOKbOAIAdapter',
uri:'https://gokbt.gbv.de/gokb/oai/index',
fullPrefix:'gokb',
rectype: RemoteKB.RECTYPE_PACKAGE,
active:Boolean.TRUE,
supportsHarvesting:true,
activationEnabled:false,
//cursor: "2022-08-09T19:34:42Z"
).save(failOnError:true)) */

RemoteKB.findByName('GOKb') ?: (new RemoteKB(
name:'GOKb',
type:'org.olf.kb.adapters.GOKbOAIAdapter',
uri:'https://gokb.org/gokb/oai/index',
fullPrefix:'gokb',
rectype: RemoteKB.RECTYPE_PACKAGE,
active:Boolean.TRUE,
supportsHarvesting:true,
activationEnabled:false
).save(failOnError:true))

/* RemoteKB.findByName('DEBUG') ?: (new RemoteKB(
name:'DEBUG',
type:'org.olf.kb.adapters.DebugGoKbAdapter',
// uri can be used to directly force a package from the resources folder
// uri: 'src/integration-test/resources/DebugGoKbAdapter/borked_ids.xml'
rectype: RemoteKB.RECTYPE_PACKAGE,
active:Boolean.TRUE,
supportsHarvesting:true,
activationEnabled:false
).save(failOnError:true)) */

0 comments on commit 29c282f

Please sign in to comment.