From cf174b218b79073bb52ddf2e3d8eb557662403e4 Mon Sep 17 00:00:00 2001 From: qqmyers Date: Thu, 15 Aug 2024 14:49:43 -0400 Subject: [PATCH] IQSS/7068 Reserve File Pids (#7334) * file pid reservation * add file pid reservation step to publish (analogous to dataset pid register if needed) Conflicts: src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDatasetVersionCommand.java src/main/java/propertyFiles/Bundle.properties * comment change * check if file PIDs used once, use constants - per comments * adding release note * release notes, API doc update * reflecting datasets and files for the PID endpoint * removing release note about pre-reg for file PIDs as this is not supported * file pid pre-reservation Conflicts: src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractDatasetCommand.java src/main/java/propertyFiles/Bundle.properties * avoid problem when GlobalIDServiceBean implicitly merges @kcondon sees a DB error persisting a file with null createtime during the GlobalIDServiceBean.getBean call which uses a set namedQuery to find the :DoiProvider. Create times for files are set above, but not merged prior to calling registerFilePidsIfNeeded. Assuming the namedQuery is forcing the file (without a merge) to persist which triggers the error. In #7337, the code is reworked so there is a merge prior to registerFilePidsIfNeeded. This commit adds one temporarily so this PR works independently of the other. 
* update theDataset * noting that PID reservation can cause old timeouts to be too short * more specifics * release note update * cleanup reformatting * further cleanup * set createTime earlier --------- Co-authored-by: Danny Brooke --- doc/release-notes/7068-reserve-file-pids.md | 9 +++ doc/sphinx-guides/source/api/native-api.rst | 2 +- .../iq/dataverse/DataFileServiceBean.java | 8 -- .../command/impl/AbstractDatasetCommand.java | 80 ++++++++++++++++++- .../FinalizeDatasetPublicationCommand.java | 22 ++--- .../command/impl/PublishDatasetCommand.java | 7 -- .../command/impl/ReservePidCommand.java | 30 ++----- .../impl/UpdateDatasetVersionCommand.java | 3 +- .../iq/dataverse/util/SystemConfig.java | 4 +- src/main/java/propertyFiles/Bundle.properties | 7 +- 10 files changed, 110 insertions(+), 62 deletions(-) create mode 100644 doc/release-notes/7068-reserve-file-pids.md diff --git a/doc/release-notes/7068-reserve-file-pids.md b/doc/release-notes/7068-reserve-file-pids.md new file mode 100644 index 00000000000..182a0d7f67b --- /dev/null +++ b/doc/release-notes/7068-reserve-file-pids.md @@ -0,0 +1,9 @@ +## Release Highlights + +### Pre-Publish File DOI Reservation with DataCite + +Dataverse installations using DataCite (or other persistent identifier (PID) Providers that support reserving PIDs) will be able to reserve PIDs for files when they are uploaded (rather than at publication time). Note that reserving file DOIs can slow uploads with large numbers of files so administrators may need to adjust timeouts (specifically any Apache "``ProxyPass / ajp://localhost:8009/ timeout=``" setting in the recommended Dataverse configuration). + +## Major Use Cases + +- Users will have DOIs/PIDs reserved for their files as part of file upload instead of at publication time. 
(Issue #7068, PR #7334) diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index 97cc1acc5c3..bf790c47778 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -5171,7 +5171,7 @@ The fully expanded example above (without environment variables) looks like this Reserve a PID ~~~~~~~~~~~~~ -Reserved a PID for a dataset. A superuser API token is required. +Reserve a PID for a dataset if not yet registered, and, if FilePIDs are enabled, reserve any file PIDs that are not yet registered. A superuser API token is required. .. note:: See :ref:`curl-examples-and-environment-variables` if you are unfamiliar with the use of export below. diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java index 21f925f8981..9331ec67d12 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java @@ -1248,14 +1248,6 @@ public List selectFilesWithMissingOriginalSizes() { } - /** - * Check that a identifier entered by the user is unique (not currently used - * for any other study in this Dataverse Network). Also check for duplicate - * in the remote PID service if needed - * @param datafileId - * @param storageLocation - * @return {@code true} iff the global identifier is unique. 
- */ public void finalizeFileDelete(Long dataFileId, String storageLocation) throws IOException { // Verify that the DataFile no longer exists: if (find(dataFileId) != null) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractDatasetCommand.java index 1a1f4f9318b..bd38245d334 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/AbstractDatasetCommand.java @@ -1,5 +1,6 @@ package edu.harvard.iq.dataverse.engine.command.impl; +import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetField; import edu.harvard.iq.dataverse.DatasetFieldServiceBean; @@ -18,9 +19,11 @@ import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException; import edu.harvard.iq.dataverse.pidproviders.PidProvider; import edu.harvard.iq.dataverse.pidproviders.PidUtil; +import edu.harvard.iq.dataverse.pidproviders.doi.fake.FakeDOIProvider; import edu.harvard.iq.dataverse.util.BundleUtil; import java.sql.Timestamp; +import java.util.Arrays; import java.util.Date; import java.util.Set; import java.util.logging.Level; @@ -169,13 +172,12 @@ protected void registerExternalIdentifier(Dataset theDataset, CommandContext ctx } while (pidProvider.alreadyRegistered(theDataset) && attempts <= FOOLPROOF_RETRIAL_ATTEMPTS_LIMIT); } if(!retry) { - logger.warning("Reserving PID for: " + getDataset().getId() + " during publication failed."); - throw new IllegalCommandException(BundleUtil.getStringFromBundle("publishDatasetCommand.pidNotReserved"), this); + logger.warning("Reserving PID for: " + getDataset().getId() + " failed."); + throw new CommandExecutionException(BundleUtil.getStringFromBundle("abstractDatasetCommand.pidNotReserved", Arrays.asList(theDataset.getIdentifier())), this); } if(attempts > 
FOOLPROOF_RETRIAL_ATTEMPTS_LIMIT) { //Didn't work - we existed the loop with too many tries - throw new CommandExecutionException("This dataset may not be published because its identifier is already in use by another dataset; " - + "gave up after " + attempts + " attempts. Current (last requested) identifier: " + theDataset.getIdentifier(), this); + throw new CommandExecutionException(BundleUtil.getStringFromBundle("abstractDatasetCommand.pidReservationRetryExceeded", Arrays.asList(Integer.toString(attempts), theDataset.getIdentifier())), this); } } // Invariant: Dataset identifier does not exist in the remote registry @@ -188,6 +190,9 @@ protected void registerExternalIdentifier(Dataset theDataset, CommandContext ctx } } catch (Throwable e) { + if (e instanceof CommandException) { + throw (CommandException) e; + } throw new CommandException(BundleUtil.getStringFromBundle("dataset.publish.error", pidProvider.getProviderInformation()), this); } } else { @@ -217,6 +222,73 @@ protected Timestamp getTimestamp() { return timestamp; } + protected void registerFilePidsIfNeeded(Dataset theDataset, CommandContext ctxt, boolean b) throws CommandException { + // Register file PIDs if needed + PidProvider pidGenerator = ctxt.dvObjects().getEffectivePidGenerator(getDataset()); + boolean shouldRegister = !pidGenerator.registerWhenPublished() && + ctxt.systemConfig().isFilePIDsEnabledForCollection(getDataset().getOwner()) && + pidGenerator.canCreatePidsLike(getDataset().getGlobalId()); + if (shouldRegister) { + for (DataFile dataFile : theDataset.getFiles()) { + logger.fine(dataFile.getId() + " is registered?: " + dataFile.isIdentifierRegistered()); + if (!dataFile.isIdentifierRegistered()) { + // pre-register a persistent id + registerFileExternalIdentifier(dataFile, pidGenerator, ctxt, true); + } + } + } + } + + private void registerFileExternalIdentifier(DataFile dataFile, PidProvider pidProvider, CommandContext ctxt, boolean retry) throws CommandException { + + if 
(!dataFile.isIdentifierRegistered()) { + + if (pidProvider instanceof FakeDOIProvider) { + retry = false; // No reason to allow a retry with the FakeProvider (even if it allows + // pre-registration someday), so set false for efficiency + } + try { + if (pidProvider.alreadyRegistered(dataFile)) { + int attempts = 0; + if (retry) { + do { + pidProvider.generatePid(dataFile); + logger.log(Level.INFO, "Attempting to register external identifier for datafile {0} (trying: {1}).", + new Object[] { dataFile.getId(), dataFile.getIdentifier() }); + attempts++; + } while (pidProvider.alreadyRegistered(dataFile) && attempts <= FOOLPROOF_RETRIAL_ATTEMPTS_LIMIT); + } + if (!retry) { + logger.warning("Reserving File PID for: " + getDataset().getId() + ", fileId: " + dataFile.getId() + ", during publication failed."); + throw new CommandExecutionException(BundleUtil.getStringFromBundle("abstractDatasetCommand.filePidNotReserved", Arrays.asList(getDataset().getIdentifier())), this); + } + if (attempts > FOOLPROOF_RETRIAL_ATTEMPTS_LIMIT) { + // Didn't work - we exited the loop with too many tries + throw new CommandExecutionException("This dataset may not be published because its identifier is already in use by another dataset; " + + "gave up after " + attempts + " attempts. 
Current (last requested) identifier: " + dataFile.getIdentifier(), this); + } + } + // Invariant: DataFile identifier does not exist in the remote registry + try { + pidProvider.createIdentifier(dataFile); + dataFile.setGlobalIdCreateTime(getTimestamp()); + dataFile.setIdentifierRegistered(true); + } catch (Throwable ex) { + logger.info("Call to globalIdServiceBean.createIdentifier failed: " + ex); + } + + } catch (Throwable e) { + if (e instanceof CommandException) { + throw (CommandException) e; + } + throw new CommandException(BundleUtil.getStringFromBundle("file.register.error", pidProvider.getProviderInformation()), this); + } + } else { + throw new IllegalCommandException("This datafile may not have a PID because its id registry service is not supported.", this); + } + + } + protected void checkSystemMetadataKeyIfNeeded(DatasetVersion newVersion, DatasetVersion persistedVersion) throws IllegalCommandException { Set changedMDBs = DatasetVersionDifference.getBlocksWithChanges(newVersion, persistedVersion); for (MetadataBlock mdb : changedMDBs) { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java index 287e877f6e0..69ebe6feed8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java @@ -102,13 +102,13 @@ public Dataset execute(CommandContext ctxt) throws CommandException { try { // This can potentially throw a CommandException, so let's make // sure we exit cleanly: - - registerExternalIdentifier(theDataset, ctxt, false); + registerExternalIdentifier(theDataset, ctxt, false); + registerFilePidsIfNeeded(theDataset, ctxt, false); } catch (CommandException comEx) { - logger.warning("Failed to reserve the identifier 
"+theDataset.getGlobalId().asString()+"; notifying the user(s), unlocking the dataset"); - // Send failure notification to the user: + logger.warning("Failed to reserve the identifier " + theDataset.getGlobalId().asString() + "; notifying the user(s), unlocking the dataset"); + // Send failure notification to the user: notifyUsersDatasetPublishStatus(ctxt, theDataset, UserNotification.Type.PUBLISHFAILED_PIDREG); - // Remove the dataset lock: + // Remove the dataset lock: ctxt.datasets().removeDatasetLocks(theDataset, DatasetLock.Reason.finalizePublication); // re-throw the exception: throw comEx; @@ -395,8 +395,7 @@ private void publicizeExternalIdentifier(Dataset dataset, CommandContext ctxt) t // we can't get "dependent" DOIs assigned to files in a dataset // with the registered id that is a handle; or even a DOI, but in // an authority that's different from what's currently configured. - // Additionaly in 4.9.3 we have added a system variable to disable - // registering file PIDs on the installation level. + // File PIDs may be enabled/disabled per collection. boolean registerGlobalIdsForFiles = ctxt.systemConfig().isFilePIDsEnabledForCollection( getDataset().getOwner()) && pidProvider.canCreatePidsLike(dataset.getGlobalId()); @@ -422,8 +421,8 @@ private void publicizeExternalIdentifier(Dataset dataset, CommandContext ctxt) t // pidProvider. 
dataset.setIdentifierRegistered(true); } catch (Throwable e) { - logger.warning("Failed to register the identifier " + dataset.getGlobalId().asString() - + ", or to register a file in the dataset; notifying the user(s), unlocking the dataset"); + logger.warning("Failed to publicize the identifier " + dataset.getGlobalId().asString() + + ", or to publicize a file in the dataset; notifying the user(s), unlocking the dataset"); // Send failure notification to the user: notifyUsersDatasetPublishStatus(ctxt, dataset, UserNotification.Type.PUBLISHFAILED_PIDREG); @@ -440,8 +439,9 @@ private void updateFiles(Timestamp updateTime, CommandContext ctxt) throws Comma if (dataFile.getPublicationDate() == null) { // this is a new, previously unpublished file, so publish by setting date dataFile.setPublicationDate(updateTime); - - // check if any prexisting roleassignments have file download and send notifications + + // check if any pre-existing role assignments have file download and send + // notifications notifyUsersFileDownload(ctxt, dataFile); } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java index 6b95f3b6de1..1ac41105237 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java @@ -4,20 +4,13 @@ import edu.harvard.iq.dataverse.DatasetLock; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser; -import edu.harvard.iq.dataverse.engine.command.Command; import edu.harvard.iq.dataverse.engine.command.CommandContext; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import 
edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException; -import edu.harvard.iq.dataverse.pidproviders.PidProvider; -import edu.harvard.iq.dataverse.privateurl.PrivateUrl; -import edu.harvard.iq.dataverse.settings.SettingsServiceBean; -import edu.harvard.iq.dataverse.util.BundleUtil; import edu.harvard.iq.dataverse.workflow.Workflow; import edu.harvard.iq.dataverse.workflow.WorkflowContext.TriggerType; -import java.util.Date; -import java.util.List; import java.util.Optional; import java.util.logging.Logger; import static java.util.stream.Collectors.joining; diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/ReservePidCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/ReservePidCommand.java index b7e3ddd8ce6..77b06e4e152 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/ReservePidCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/ReservePidCommand.java @@ -3,27 +3,21 @@ import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser; -import edu.harvard.iq.dataverse.engine.command.AbstractVoidCommand; import edu.harvard.iq.dataverse.engine.command.CommandContext; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.engine.command.exception.CommandException; -import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException; import edu.harvard.iq.dataverse.engine.command.exception.PermissionException; -import edu.harvard.iq.dataverse.pidproviders.PidProvider; -import edu.harvard.iq.dataverse.pidproviders.PidUtil; -import edu.harvard.iq.dataverse.settings.SettingsServiceBean; import edu.harvard.iq.dataverse.util.BundleUtil; -import java.util.Arrays; import java.util.Collections; -import java.util.Date; import 
java.util.logging.Logger; /** * No required permissions because we check for superuser status. + * @param */ @RequiredPermissions({}) -public class ReservePidCommand extends AbstractVoidCommand { +public class ReservePidCommand extends AbstractDatasetCommand { private static final Logger logger = Logger.getLogger(ReservePidCommand.class.getCanonicalName()); @@ -35,27 +29,15 @@ public ReservePidCommand(DataverseRequest request, Dataset dataset) { } @Override - protected void executeImpl(CommandContext ctxt) throws CommandException { + public Dataset execute(CommandContext ctxt) throws CommandException { if (!(getUser() instanceof AuthenticatedUser) || !getUser().isSuperuser()) { throw new PermissionException(BundleUtil.getStringFromBundle("admin.api.auth.mustBeSuperUser"), this, Collections.singleton(Permission.EditDataset), dataset); } - - PidProvider pidProvider = ctxt.dvObjects().getEffectivePidGenerator(dataset); - - try { - String returnString = pidProvider.createIdentifier(dataset); - logger.fine(returnString); - // No errors caught, so mark PID as reserved. - dataset.setGlobalIdCreateTime(new Date()); - // We don't setIdentifierRegistered(true) yet. 
- ctxt.datasets().merge(dataset); - } catch (Throwable ex) { - String message = BundleUtil.getStringFromBundle("pids.commands.reservePid.failure", Arrays.asList(dataset.getId().toString(), ex.getLocalizedMessage())); - logger.info(message); - throw new IllegalCommandException(message, this); - } + registerExternalIdentifier(getDataset(), ctxt, true); + registerFilePidsIfNeeded(getDataset(), ctxt, true); + return dataset; } } diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDatasetVersionCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDatasetVersionCommand.java index 994f4c7dfb6..768bb88fd43 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDatasetVersionCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/UpdateDatasetVersionCommand.java @@ -154,7 +154,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException { throw e; } } - + //Set creator and create date for files if needed for (DataFile dataFile : theDataset.getFiles()) { if (dataFile.getCreateDate() == null) { dataFile.setCreateDate(getTimestamp()); @@ -259,6 +259,7 @@ public Dataset execute(CommandContext ctxt) throws CommandException { for(FileMetadata fmd: theDataset.getOrCreateEditVersion().getFileMetadatas()) { logger.fine("FMD: " + fmd.getId() + " for file: " + fmd.getDataFile().getId() + "is in final draft version"); } + registerFilePidsIfNeeded(theDataset, ctxt, true); if (recalculateUNF) { ctxt.ingest().recalculateDatasetVersionUNF(theDataset.getOrCreateEditVersion()); diff --git a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java index f9801419e47..5cc28e4b225 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java @@ -986,7 +986,7 @@ public boolean isFilePIDsEnabledForCollection(Dataverse collection) { Dataverse 
thisCollection = collection; // If neither enabled nor disabled specifically for this collection, - // the parent collection setting is inhereted (recursively): + // the parent collection setting is inherited (recursively): while (thisCollection.getFilePIDsEnabled() == null) { if (thisCollection.getOwner() == null) { // We've reached the root collection, and file PIDs registration @@ -1002,8 +1002,6 @@ public boolean isFilePIDsEnabledForCollection(Dataverse collection) { // takes precedent: return thisCollection.getFilePIDsEnabled(); } - - public String getMDCLogPath() { String mDCLogPath = settingsService.getValueForKey(SettingsServiceBean.Key.MDCLogPath, null); diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index edf0206ab5d..a092b0f456b 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -3008,12 +3008,13 @@ pids.api.reservePid.success=PID reserved for {0} pids.api.deletePid.success=PID deleted for {0} pids.deletePid.failureExpected=Unable to delete PID {0}. Status code: {1}. pids.deletePid.failureOther=Problem deleting PID {0}: {1} -pids.commands.reservePid.failure=Problem reserving PID for dataset id {0}: {1}. pids.datacite.errors.noResponseCode=Problem getting HTTP status code from {0}. Is it in DNS? Is doi.dataciterestapiurlstring configured properly? pids.datacite.errors.DoiOnly=Only doi: is supported. -#PublishDatasetCommand -publishDatasetCommand.pidNotReserved=Cannot publish dataset because its persistent identifier has not been reserved. +#AbstractDatasetCommand +abstractDatasetCommand.pidNotReserved=Unable to reserve a persistent identifier for the dataset: {0}. +abstractDatasetCommand.filePidNotReserved=Unable to reserve a persistent identifier for one or more files in the dataset: {0}. 
+abstractDatasetCommand.pidReservationRetryExceeded=This dataset may not be registered because its identifier is already in use by another dataset: gave up after {0} attempts. Current (last requested) identifier: {1} # APIs api.errors.invalidApiToken=Invalid API token.