From 29c282f4f707ac3f8824876d4c03c8a4b4afeb93 Mon Sep 17 00:00:00 2001 From: EthanFreestone <54310740+EthanFreestone@users.noreply.github.com> Date: Tue, 29 Oct 2024 14:26:03 +0000 Subject: [PATCH] perf: push kb (#831) * perf: PushKB improvements Initial crack at improving pushKB ingest times. Currently ~0.17 per title (~0.03 for normal ingest) * chore: Commented out logging for now to see if we get a speed boost by happy-accident * chore: Removed commented code Remove old commented out code * refactor: PackageIngestService lookupOrCreatePackage Split out lookup/update/create package methods so that we can have more granular decisions on when and where packages get created for pushKB process * chore: added test1 tenant sample data file * chore: Comment removal Removed commented out logdebugs * chore: Newline --- .../org/olf/PackageIngestService.groovy | 109 +++++++++++------- .../org/olf/TitleIngestService.groovy | 2 +- .../olf/general/jobs/JobRunnerService.groovy | 1 - .../olf/general/pushKB/PushKBService.groovy | 94 ++++++++------- .../tenant/sample_data/test1-data.groovy | 40 +++++++ 5 files changed, 165 insertions(+), 81 deletions(-) create mode 100644 service/src/main/okapi/tenant/sample_data/test1-data.groovy diff --git a/service/grails-app/services/org/olf/PackageIngestService.groovy b/service/grails-app/services/org/olf/PackageIngestService.groovy index df2949c3..78e574d9 100644 --- a/service/grails-app/services/org/olf/PackageIngestService.groovy +++ b/service/grails-app/services/org/olf/PackageIngestService.groovy @@ -4,6 +4,8 @@ import static org.springframework.transaction.annotation.Propagation.REQUIRES_NE import java.util.concurrent.TimeUnit +import org.olf.general.Org + import org.olf.general.StringUtils import org.olf.general.IngestException @@ -135,14 +137,11 @@ class PackageIngestService implements DataBinder { // ENSURE MDC title is set as early as possible MDC.put('title', StringUtils.truncate(pc.title.toString())) - // log.debug("Try to resolve ${pc}") - try { PackageContentItem.withNewSession { tsess -> PackageContentItem.withNewTransaction { status -> // Delegate out to TitleIngestService so that any shared steps can move there. Map titleIngestResult = titleIngestService.upsertTitle(pc, kb, trustedSourceTI) - // titleIngestResult.titleInstanceId will be non-null IFF TitleIngestService managed to find a title with that Id. if ( titleIngestResult.titleInstanceId != null ) { // Pass off to new hierarchy method (?) @@ -243,14 +242,13 @@ class PackageIngestService implements DataBinder { } } - /* - * Separate out the create or lookup pkg code, so that it can - * be used both by the ingest service (via upsert pkg), as well - * as the pushKBService (directly) - * - * This method ALSO provides update information for packages. + /* + * We split out the lookup vs the update vs the create code for packages, + * as there is potentially differences in behaviour between pushKB and harvest. */ - public Pkg lookupOrCreatePkg(PackageSchema package_data) { + + // This WILL NOT update or create a package from the data. + public Pkg lookupPkg(PackageSchema package_data) { Pkg pkg = null // header.packageSlug contains the package maintainers authoritative identifier for this package. @@ -266,8 +264,16 @@ class PackageIngestService implements DataBinder { } } - def vendor = null - if ( ( package_data.header?.packageProvider?.name != null ) && ( package_data.header?.packageProvider?.name.trim().length() > 0 ) ) { + return pkg; + } + + public Org getVendorFromPackageData(PackageSchema package_data) { + Org vendor = null; + + if ( + package_data.header?.packageProvider?.name != null && + package_data.header?.packageProvider?.name.trim().length() > 0 + ) { log.debug("Package contains provider information: ${package_data.header?.packageProvider?.name} -- trying to match to an existing organisation.") vendor = dependentModuleProxyService.coordinateOrg(package_data.header?.packageProvider?.name) // reference has been removed at the request of the UI team @@ -277,10 +283,58 @@ class PackageIngestService implements DataBinder { log.warn('Package ingest - no provider information present') } + return vendor; + } + + public Pkg lookupPkgAndUpdate(PackageSchema package_data) { + Pkg pkg = lookupPkg(package_data) + Org vendor = getVendorFromPackageData(package_data) + // Do update step but NOT create step + if (pkg != null) { + pkg.sourceDataUpdated = package_data.header.sourceDataUpdated + + if (package_data.header.lifecycleStatus) { + pkg.lifecycleStatusFromString = package_data.header.lifecycleStatus + } + + if (package_data.header.availabilityScope) { + pkg.availabilityScopeFromString = package_data.header.availabilityScope + } + + pkg.vendor = vendor + pkg.description = package_data.header.description + pkg.name = package_data.header.packageName + pkg.save(failOnError:true) + + // Call separate methods for updating collections for code cleanliness + // These methods are responsible for their own saves + updateContentTypes(pkg.id, package_data) + updateAlternateNames(pkg.id, package_data) + updateAvailabilityConstraints(pkg.id, package_data) + updatePackageDescriptionUrls(pkg.id, package_data) + } + + return pkg; + } + + + /* + * Separate out the create or lookup pkg code, so that it can + * be used both by the ingest service (via upsert pkg), as well + * as the pushKBService (directly) + * + * This method ALSO updates information for packages. + */ + public Pkg lookupOrCreatePkg(PackageSchema package_data) { + // This takes care of any updates + Pkg pkg = lookupPkgAndUpdate(package_data); + + // If pkg is null, then we can safely create a new one if ( pkg == null ) { + Org vendor = getVendorFromPackageData(package_data) pkg = new Pkg( - name: package_data.header.packageName, - source: package_data.header.packageSource, + name: package_data.header.packageName, + source: package_data.header.packageSource, reference: package_data.header.packageSlug, description: package_data.header.description, sourceDataCreated: package_data.header.sourceDataCreated, @@ -288,7 +342,7 @@ class PackageIngestService implements DataBinder { sourceTitleCount: package_data.header.sourceTitleCount, availabilityScope: ( package_data.header.availabilityScope != null ? Pkg.lookupOrCreateAvailabilityScope(package_data.header.availabilityScope) : null ), lifecycleStatus: Pkg.lookupOrCreateLifecycleStatus(package_data.header.lifecycleStatus != null ? package_data.header.lifecycleStatus : 'Unknown'), - vendor: vendor, + vendor: vendor, ).save(flush:true, failOnError:true) (package_data?.header?.contentTypes ?: []).each { @@ -308,28 +362,6 @@ class PackageIngestService implements DataBinder { } pkg.save(failOnError: true) - } else { - pkg.sourceDataUpdated = package_data.header.sourceDataUpdated - - if (package_data.header.lifecycleStatus) { - pkg.lifecycleStatusFromString = package_data.header.lifecycleStatus - } - - if (package_data.header.availabilityScope) { - pkg.availabilityScopeFromString = package_data.header.availabilityScope - } - - pkg.vendor = vendor - pkg.description = package_data.header.description - pkg.name = package_data.header.packageName - pkg.save(failOnError:true) - - // Call separate methods for updating collections for code cleanliness - // These methods are responsible for their own saves - updateContentTypes(pkg.id, package_data) - updateAlternateNames(pkg.id, package_data) - updateAvailabilityConstraints(pkg.id, package_data) - updatePackageDescriptionUrls(pkg.id, package_data) } return pkg; @@ -475,7 +507,6 @@ class PackageIngestService implements DataBinder { } Platform platform = Platform.resolve(platform_url_to_use, pc.platformName) - // log.debug("Platform: ${platform}") if ( platform == null && PROXY_MISSING_PLATFORM ) { platform = Platform.resolve('http://localhost.localdomain', 'This platform entry is used for error cases') @@ -528,8 +559,6 @@ class PackageIngestService implements DataBinder { pciStatus: 'none' // This should be 'none', 'updated' or 'new' ] - // log.debug("platform ${pc.platformUrl} ${pc.platformName} (item URL is ${pc.url})") - // lets try and work out the platform for the item Platform platform = lookupOrCreatePlatform(pc); diff --git a/service/grails-app/services/org/olf/TitleIngestService.groovy b/service/grails-app/services/org/olf/TitleIngestService.groovy index 9d48f256..3ed9e787 100644 --- a/service/grails-app/services/org/olf/TitleIngestService.groovy +++ b/service/grails-app/services/org/olf/TitleIngestService.groovy @@ -36,7 +36,7 @@ class TitleIngestService implements DataBinder { TitleInstance.withNewTransaction { if (!kb) { // This KB is created without a Type... so if it was trusted for TI data then it'd fail secondary enrichment - kb = new RemoteKB( name:remotekbname, + kb = new RemoteKB( name:remotekbname, rectype: RemoteKB.RECTYPE_TITLE, active: Boolean.TRUE, readonly:readOnly, diff --git a/service/grails-app/services/org/olf/general/jobs/JobRunnerService.groovy b/service/grails-app/services/org/olf/general/jobs/JobRunnerService.groovy index 1e942784..8e7b0bcd 100644 --- a/service/grails-app/services/org/olf/general/jobs/JobRunnerService.groovy +++ b/service/grails-app/services/org/olf/general/jobs/JobRunnerService.groovy @@ -537,7 +537,6 @@ order by pj.dateCreated org.slf4j.MDC.clear() org.slf4j.MDC.setContextMap( jobId: '' + jid, tenantId: '' + tid, 'tenant': '' + tenantName) JobContext.current.set(new JobContext( jobId: jid, tenantId: tid )) - //log.debug("LOGDEBUG TID: ${tId}") Tenants.withId(tid) { beginJob(jid) } diff --git a/service/grails-app/services/org/olf/general/pushKB/PushKBService.groovy b/service/grails-app/services/org/olf/general/pushKB/PushKBService.groovy index 62644fca..15070769 100644 --- a/service/grails-app/services/org/olf/general/pushKB/PushKBService.groovy +++ b/service/grails-app/services/org/olf/general/pushKB/PushKBService.groovy @@ -111,7 +111,6 @@ class PushKBService implements DataBinder { updatedAccessEnd: 0, ] KBIngressType ingressType = kbManagementBean.ingressType - if (ingressType == KBIngressType.PushKB) { try { pcis.each { Map record -> @@ -133,47 +132,64 @@ class PushKBService implements DataBinder { if (utilityService.checkValidBinding(pc)) { try { Pkg pkg = null; - Pkg.withNewTransaction { status -> - // TODO this will allow the PCI data to update the PKG record... do we want this? - pkg = packageIngestService.lookupOrCreatePackageFromTitle(pc); - log.debug("LOGGING PACKAGE OBTAINED FROM PCI: ${pkg}") - - Map titleIngestResult = titleIngestService.upsertTitleDirect(pc) - log.debug("LOGGING titleIngestResult: ${titleIngestResult}") - - if ( titleIngestResult.titleInstanceId != null ) { - Map hierarchyResult = packageIngestService.lookupOrCreateTitleHierarchy( - titleIngestResult.titleInstanceId, - pkg.id, - true, - pc, - result.updateTime, - result.titleCount // FIXME not sure about this - ) - - PackageContentItem pci = PackageContentItem.get(hierarchyResult.pciId) - packageIngestService.hierarchyResultMapLogic(hierarchyResult, result, pci) - - /* TODO figure out if use of removedTimestamp - * should be something harvest also needs to do directly - * And whether we should be doing it after all the above - * or before. - */ - if (pc.removedTimestamp) { - try { - log.debug("Removal candidate: pci.id #${pci.id} (Last seen ${pci.lastSeenTimestamp}, thisUpdate ${result.updateTime}) -- Set removed") - pci.removedTimestamp = pc.removedTimestamp - pci.save(failOnError:true) - } catch ( Exception e ) { - log.error("Problem removing ${pci} in package load", e) + Pkg.withSession { currentSess -> + Pkg.withTransaction { + Pkg.withNewSession { newSess -> + Pkg.withTransaction { + // TODO this will allow the PCI data to update the PKG record... do we want this? + + pkg = packageIngestService.lookupOrCreatePackageFromTitle(pc); + } + newSess.clear() + } + } + } + + TitleInstance.withSession { currentSess -> + TitleInstance.withTransaction { + TitleInstance.withNewSession { newSess -> + TitleInstance.withTransaction { + Map titleIngestResult = titleIngestService.upsertTitleDirect(pc) + + if ( titleIngestResult.titleInstanceId != null ) { + + Map hierarchyResult = packageIngestService.lookupOrCreateTitleHierarchy( + titleIngestResult.titleInstanceId, + pkg.id, + true, + pc, + result.updateTime, + result.titleCount // Not totally sure this is valuable here + ) + + PackageContentItem pci = PackageContentItem.get(hierarchyResult.pciId) + packageIngestService.hierarchyResultMapLogic(hierarchyResult, result, pci) + + /* TODO figure out if use of removedTimestamp + * should be something harvest also needs to do directly + * And whether we should be doing it after all the above + * or before. + */ + if (pc.removedTimestamp) { + try { + log.debug("Removal candidate: pci.id #${pci.id} (Last seen ${pci.lastSeenTimestamp}, thisUpdate ${result.updateTime}) -- Set removed") + pci.removedTimestamp = pc.removedTimestamp + pci.save(failOnError:true) + } catch ( Exception e ) { + log.error("Problem removing ${pci} in package load", e) + } + result.removedTitles++ + } + } else { + String message = "Skipping \"${pc.title}\". Unable to resolve title from ${pc.title} with identifiers ${pc.instanceIdentifiers}" + log.error(message) + } } - result.removedTitles++ + newSess.clear() } - } else { - String message = "Skipping \"${pc.title}\". Unable to resolve title from ${pc.title} with identifiers ${pc.instanceIdentifiers}" - log.error(message) } } + } catch ( IngestException ie ) { // When we've caught an ingest exception, should have helpful error log message String message = "Skipping \"${pc.title}\": ${ie.message}" @@ -196,7 +212,7 @@ class PushKBService implements DataBinder { } */ } - long finishedTime = (System.currentTimeMillis()-result.startTime)/1000 + long finishedTime = (System.currentTimeMillis()-result.startTime) // Don't divide by 1000 here result.success = true // TODO Logging may need tweaking between pushKB and harvest diff --git a/service/src/main/okapi/tenant/sample_data/test1-data.groovy b/service/src/main/okapi/tenant/sample_data/test1-data.groovy new file mode 100644 index 00000000..b5c03fb1 --- /dev/null +++ b/service/src/main/okapi/tenant/sample_data/test1-data.groovy @@ -0,0 +1,40 @@ +// DEV NOTE -- This is what will be used for the default rancher-desktop-db/dc environments +// For vagrant development (which I think is deprecated now) there may need to be a different file +// to avoid making changes to diku-data or _data. +log.info "Running test1-data tenant sample data file" + +import org.olf.kb.RemoteKB + +/* RemoteKB.findByName('GOKb_TEST') ?: (new RemoteKB( + name:'GOKb_TEST', + type:'org.olf.kb.adapters.GOKbOAIAdapter', + uri:'https://gokbt.gbv.de/gokb/oai/index', + fullPrefix:'gokb', + rectype: RemoteKB.RECTYPE_PACKAGE, + active:Boolean.TRUE, + supportsHarvesting:true, + activationEnabled:false, + //cursor: "2022-08-09T19:34:42Z" +).save(failOnError:true)) */ + +RemoteKB.findByName('GOKb') ?: (new RemoteKB( + name:'GOKb', + type:'org.olf.kb.adapters.GOKbOAIAdapter', + uri:'https://gokb.org/gokb/oai/index', + fullPrefix:'gokb', + rectype: RemoteKB.RECTYPE_PACKAGE, + active:Boolean.TRUE, + supportsHarvesting:true, + activationEnabled:false +).save(failOnError:true)) + +/* RemoteKB.findByName('DEBUG') ?: (new RemoteKB( + name:'DEBUG', + type:'org.olf.kb.adapters.DebugGoKbAdapter', + // uri can be used to directly force a package from the resources folder + // uri: 'src/integration-test/resources/DebugGoKbAdapter/borked_ids.xml' + rectype: RemoteKB.RECTYPE_PACKAGE, + active:Boolean.TRUE, + supportsHarvesting:true, + activationEnabled:false +).save(failOnError:true)) */