diff --git a/doc/release-notes/10977-globus-filesize-lookup.md b/doc/release-notes/10977-globus-filesize-lookup.md
new file mode 100644
index 00000000000..49fd10d9ffe
--- /dev/null
+++ b/doc/release-notes/10977-globus-filesize-lookup.md
@@ -0,0 +1,6 @@
+## A new Globus optimization setting
+
+An optimization has been added for the Globus upload workflow, with a corresponding new database setting: `:GlobusBatchLookupSize`
+
+
+See the [Database Settings](https://guides.dataverse.org/en/6.5/installation/config.html#GlobusBatchLookupSize) section of the Guides for more information.
\ No newline at end of file
diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst
index e3965e3cd7c..30a36da9499
--- a/doc/sphinx-guides/source/installation/config.rst
+++ b/doc/sphinx-guides/source/installation/config.rst
@@ -4849,6 +4849,13 @@ The URL where the `dataverse-globus
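For reference, the new threshold uses the standard database-settings API, so it can be adjusted at runtime. A minimal illustration (the value 50 simply matches the compiled-in default introduced in SystemConfig.java further down; the host is a placeholder):

    curl -X PUT -d 50 http://localhost:8080/api/admin/settings/:GlobusBatchLookupSize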
 initialFileList;
 List finalFileList;
+    private boolean trustSuppliedFileSizes;
 
 // -----------------------------------
 // Ingested files
@@ -610,15 +609,9 @@ private boolean runAddReplacePhase1(Dataset owner,
             return false;
         }
 
-        if(optionalFileParams != null) {
-            if(optionalFileParams.hasCheckSum()) {
-                newCheckSum = optionalFileParams.getCheckSum();
-                newCheckSumType = optionalFileParams.getCheckSumType();
-            }
-        }
 
         msgt("step_030_createNewFilesViaIngest");
-        if (!this.step_030_createNewFilesViaIngest()){
+        if (!this.step_030_createNewFilesViaIngest(optionalFileParams)){
             return false;
         }
@@ -1191,7 +1184,7 @@ private boolean step_007_auto_isReplacementInLatestVersion(DataFile existingFile
     }
 
-    private boolean step_030_createNewFilesViaIngest(){
+    private boolean step_030_createNewFilesViaIngest(OptionalFileParams optionalFileParams){
         
         if (this.hasError()){
             return false;
@@ -1203,21 +1196,28 @@ private boolean step_030_createNewFilesViaIngest(){
             //Don't repeatedly update the clone (losing changes) in multifile case
             clone = workingVersion.cloneDatasetVersion();
         }
+
+        Long suppliedFileSize = null;
+        String newCheckSum = null;
+        ChecksumType newCheckSumType = null;
+
+
+        if (optionalFileParams != null) {
+            if (optionalFileParams.hasCheckSum()) {
+                newCheckSum = optionalFileParams.getCheckSum();
+                newCheckSumType = optionalFileParams.getCheckSumType();
+            }
+            if (trustSuppliedFileSizes && optionalFileParams.hasFileSize()) {
+                suppliedFileSize = optionalFileParams.getFileSize();
+            }
+        }
+
         try {
-            /*CreateDataFileResult result = FileUtil.createDataFiles(workingVersion,
-                    this.newFileInputStream,
-                    this.newFileName,
-                    this.newFileContentType,
-                    this.newStorageIdentifier,
-                    this.newCheckSum,
-                    this.newCheckSumType,
-                    this.systemConfig);*/
-
             UploadSessionQuotaLimit quota = null;
             if (systemConfig.isStorageQuotasEnforced()) {
                 quota = fileService.getUploadSessionQuotaLimit(dataset);
             }
-            Command cmd = new CreateNewDataFilesCommand(dvRequest, workingVersion, newFileInputStream, newFileName, newFileContentType, newStorageIdentifier, quota, newCheckSum, newCheckSumType);
+            Command cmd = new CreateNewDataFilesCommand(dvRequest, workingVersion, newFileInputStream, newFileName, newFileContentType, newStorageIdentifier, quota, newCheckSum, newCheckSumType, suppliedFileSize);
             CreateDataFileResult createDataFilesResult = commandEngine.submit(cmd);
             initialFileList = createDataFilesResult.getDataFiles();
 
@@ -2033,9 +2033,15 @@ public void setDuplicateFileWarning(String duplicateFileWarning) {
      * @param jsonData - an array of jsonData entries (one per file) using the single add file jsonData format
      * @param dataset
      * @param authUser
+     * @param trustSuppliedFileSizes - whether to accept the fileSize values passed
+     *                           in jsonData (we don't want to trust the users of the S3 direct
+     *                           upload API with that information - we will verify the status of
+     *                           the files in the S3 bucket and confirm the sizes in the process.
+     *                           We do want GlobusService to be able to pass the file sizes, since
+     *                           they are obtained and verified via a Globus API lookup).
      * @return
      */
-    public Response addFiles(String jsonData, Dataset dataset, User authUser) {
+    public Response addFiles(String jsonData, Dataset dataset, User authUser, boolean trustSuppliedFileSizes) {
         msgt("(addFilesToDataset) jsonData: " + jsonData.toString());
 
         JsonArrayBuilder jarr = Json.createArrayBuilder();
@@ -2044,6 +2050,7 @@ public Response addFiles(String jsonData, Dataset dataset, User authUser) {
         int totalNumberofFiles = 0;
         int successNumberofFiles = 0;
+        this.trustSuppliedFileSizes = trustSuppliedFileSizes;
         // -----------------------------------------------------------
         // Read jsonData and Parse files information from jsondata :
         // -----------------------------------------------------------
@@ -2176,6 +2183,10 @@ public Response addFiles(String jsonData, Dataset dataset, User authUser) {
             .add("data", Json.createObjectBuilder().add("Files", jarr).add("Result", result)).build()
         ).build();
     }
+    public Response addFiles(String jsonData, Dataset dataset, User authUser) {
+        return addFiles(jsonData, dataset, authUser, false);
+    }
+
     /**
      * Replace multiple files with prepositioned replacements as listed in the
      * jsonData. Works with direct upload, Globus, and other out-of-band methods.
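To illustrate the new overload: a minimal sketch of how a trusted, out-of-band caller (the Globus workflow below is the intended one) might invoke it. The jsonData attribute names other than mimeType, md5Hash and fileSize follow the existing /addFiles format and the values are placeholders, not taken from this changeset:

    // Sketch only: assumes an AddReplaceFileHelper ("addFileHelper"), a Dataset and a
    // User are already in scope, as elsewhere in this class. Values are placeholders.
    String jsonData = """
            [
              {
                "fileName": "example.csv",
                "storageIdentifier": "globusm://<storage-identifier>",
                "mimeType": "text/csv",
                "md5Hash": "<checksum-value>",
                "fileSize": 1048576
              }
            ]
            """;
    // true = trust the supplied fileSize values (the Globus case, where sizes are
    // verified via a Globus API lookup); the existing three-argument overload
    // delegates here with trustSuppliedFileSizes = false.
    Response addFilesResponse = addFileHelper.addFiles(jsonData, dataset, authUser, true);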
diff --git a/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java b/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java
index 959dbc4e262..54844160163
--- a/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java
+++ b/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java
@@ -39,6 +39,12 @@
  *  - Provenance related information
  *
  * @author rmp553
+ * @todo (?) We may want to consider renaming this class to DataFileParams or
+ *       DataFileInfo... it was originally created to encode some bits of info -
+ *       the file "tags" specifically, that didn't fit in elsewhere in the normal
+ *       workflow; but it's been expanded to cover pretty much everything else associated
+ *       with DataFiles and it's not really "optional" anymore when, for example, used
+ *       in the direct upload workflow. (?)
  */
 public class OptionalFileParams {
@@ -76,6 +82,8 @@ public class OptionalFileParams {
     public static final String MIME_TYPE_ATTR_NAME = "mimeType";
     private String checkSumValue;
     private ChecksumType checkSumType;
+    public static final String FILE_SIZE_ATTR_NAME = "fileSize";
+    private Long fileSize;
     public static final String LEGACY_CHECKSUM_ATTR_NAME = "md5Hash";
     public static final String CHECKSUM_OBJECT_NAME = "checksum";
     public static final String CHECKSUM_OBJECT_TYPE = "@type";
@@ -268,6 +276,18 @@ public String getCheckSum() {
     public ChecksumType getCheckSumType() {
         return checkSumType;
     }
+
+    public boolean hasFileSize() {
+        return fileSize != null;
+    }
+
+    public Long getFileSize() {
+        return fileSize;
+    }
+
+    public void setFileSize(long fileSize) {
+        this.fileSize = fileSize;
+    }
 
     /**
      *  Set tags
@@ -416,7 +436,13 @@ else if ((jsonObj.has(CHECKSUM_OBJECT_NAME)) && (!jsonObj.get(CHECKSUM_OBJECT_NA
             this.checkSumType = ChecksumType.fromString(((JsonObject) jsonObj.get(CHECKSUM_OBJECT_NAME)).get(CHECKSUM_OBJECT_TYPE).getAsString());
         }
 
-
+        // -------------------------------
+        // get file size as a Long, if supplied
+        // -------------------------------
+        if ((jsonObj.has(FILE_SIZE_ATTR_NAME)) && (!jsonObj.get(FILE_SIZE_ATTR_NAME).isJsonNull())){
+
+            this.fileSize = jsonObj.get(FILE_SIZE_ATTR_NAME).getAsLong();
+        }
         // -------------------------------
         // get tags
         // -------------------------------
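A minimal sketch of the new attribute in isolation, assuming the existing constructor that takes the per-file JSON as a String (exception handling omitted; only the fileSize handling is introduced by this changeset):

    // Sketch; assumes OptionalFileParams keeps its existing JSON-string constructor.
    OptionalFileParams params = new OptionalFileParams(
            "{\"mimeType\": \"text/csv\", \"fileSize\": 1048576}");
    if (params.hasFileSize()) {
        Long size = params.getFileSize(); // 1048576
    }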
diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java
index 76939751899..e9a2025b112
--- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java
+++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java
@@ -93,6 +93,10 @@ public CreateNewDataFilesCommand(DataverseRequest aRequest, DatasetVersion versi
         this(aRequest, version, inputStream, fileName, suppliedContentType, newStorageIdentifier, quota, newCheckSum, newCheckSumType, null, null);
     }
 
+    public CreateNewDataFilesCommand(DataverseRequest aRequest, DatasetVersion version, InputStream inputStream, String fileName, String suppliedContentType, String newStorageIdentifier, UploadSessionQuotaLimit quota, String newCheckSum, DataFile.ChecksumType newCheckSumType, Long newFileSize) {
+        this(aRequest, version, inputStream, fileName, suppliedContentType, newStorageIdentifier, quota, newCheckSum, newCheckSumType, newFileSize, null);
+    }
+
     // This version of the command must be used when files are created in the
     // context of creating a brand new dataset (from the Add Dataset page):
diff --git a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java
index ac3c81622fc..58992805dc8
--- a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java
@@ -74,6 +74,7 @@
 import edu.harvard.iq.dataverse.util.URLTokenUtil;
 import edu.harvard.iq.dataverse.util.UrlSignerUtil;
 import edu.harvard.iq.dataverse.util.json.JsonUtil;
+import jakarta.json.JsonNumber;
 import jakarta.json.JsonReader;
 import jakarta.persistence.EntityManager;
 import jakarta.persistence.PersistenceContext;
@@ -284,6 +285,52 @@ private int makeDir(GlobusEndpoint endpoint, String dir) {
         return result.status;
     }
 
+    private Map<String, Long> lookupFileSizes(GlobusEndpoint endpoint, String dir) {
+        MakeRequestResponse result;
+
+        try {
+            logger.fine("Attempting to look up the contents of the Globus folder "+dir);
+            URL url = new URL(
+                    "https://transfer.api.globusonline.org/v0.10/operation/endpoint/" + endpoint.getId()
+                            + "/ls?path=" + dir);
+            result = makeRequest(url, "Bearer", endpoint.getClientToken(), "GET", null);
+
+            switch (result.status) {
+            case 200:
+                logger.fine("Looked up directory " + dir + " successfully.");
+                break;
+            default:
+                logger.warning("Status " + result.status + " received when looking up dir " + dir);
+                logger.fine("Response: " + result.jsonResponse);
+                return null;
+            }
+        } catch (MalformedURLException ex) {
+            // Misconfiguration
+            logger.warning("Failed to list the contents of the directory "+ dir + " on endpoint " + endpoint.getId());
+            return null;
+        }
+
+        Map<String, Long> ret = new HashMap<>();
+
+        JsonObject listObject = JsonUtil.getJsonObject(result.jsonResponse);
+        JsonArray dataArray = listObject.getJsonArray("DATA");
+
+        if (dataArray != null && !dataArray.isEmpty()) {
+            for (int i = 0; i < dataArray.size(); i++) {
+                String dataType = dataArray.getJsonObject(i).getString("DATA_TYPE", null);
+                if (dataType != null && dataType.equals("file")) {
+                    // is it safe to assume that any entry with a valid "DATA_TYPE": "file"
+                    // will also have valid "name" and "size" entries?
+                    String fileName = dataArray.getJsonObject(i).getString("name");
+                    long fileSize = dataArray.getJsonObject(i).getJsonNumber("size").longValueExact();
+                    ret.put(fileName, fileSize);
+                }
+            }
+        }
+
+        return ret;
+    }
+
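For reference, the parsing in lookupFileSizes assumes a response of roughly this shape from the Globus Transfer API ls operation; only the fields the code actually reads (DATA, DATA_TYPE, name, size) are shown, and the outer DATA_TYPE value and the dir entry are illustrative:

    {
      "DATA_TYPE": "file_list",
      "DATA": [
        { "DATA_TYPE": "file", "name": "example.csv", "size": 1048576 },
        { "DATA_TYPE": "dir",  "name": "subfolder" }
      ]
    }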
     private int requestPermission(GlobusEndpoint endpoint, Dataset dataset, Permissions permissions) {
         Gson gson = new GsonBuilder().create();
         MakeRequestResponse result = null;
@@ -938,9 +985,20 @@ private void processUploadedFiles(JsonArray filesJsonArray, Dataset dataset, Aut
             inputList.add(fileId + "IDsplit" + fullPath + "IDsplit" + fileName);
         }
 
+        Map<String, Long> fileSizeMap = null;
+
+        if (filesJsonArray.size() >= systemConfig.getGlobusBatchLookupSize()) {
+            // Look up the sizes of all the files in the dataset folder, to avoid
+            // looking them up one by one later:
+            // @todo: we should only be doing this if this is a managed store, probably (?)
+            GlobusEndpoint endpoint = getGlobusEndpoint(dataset);
+            fileSizeMap = lookupFileSizes(endpoint, endpoint.getBasePath());
+        }
 
         // calculateMissingMetadataFields: checksum, mimetype
         JsonObject newfilesJsonObject = calculateMissingMetadataFields(inputList, myLogger);
+
         JsonArray newfilesJsonArray = newfilesJsonObject.getJsonArray("files");
         logger.fine("Size: " + newfilesJsonArray.size());
         logger.fine("Val: " + JsonUtil.prettyPrint(newfilesJsonArray.getJsonObject(0)));
@@ -964,20 +1022,26 @@ private void processUploadedFiles(JsonArray filesJsonArray, Dataset dataset, Aut
                 if (newfileJsonObject != null) {
                     logger.fine("List Size: " + newfileJsonObject.size());
                     // if (!newfileJsonObject.get(0).getString("hash").equalsIgnoreCase("null")) {
-                    JsonPatch path = Json.createPatchBuilder()
+                    JsonPatch patch = Json.createPatchBuilder()
                             .add("/md5Hash", newfileJsonObject.get(0).getString("hash")).build();
-                    fileJsonObject = path.apply(fileJsonObject);
-                    path = Json.createPatchBuilder()
+                    fileJsonObject = patch.apply(fileJsonObject);
+                    patch = Json.createPatchBuilder()
                             .add("/mimeType", newfileJsonObject.get(0).getString("mime")).build();
-                    fileJsonObject = path.apply(fileJsonObject);
+                    fileJsonObject = patch.apply(fileJsonObject);
+                    // If we already know the size of this file on the Globus end,
+                    // we'll pass it to /addFiles, to avoid looking up file sizes
+                    // one by one:
+                    if (fileSizeMap != null && fileSizeMap.get(fileId) != null) {
+                        Long uploadedFileSize = fileSizeMap.get(fileId);
+                        myLogger.info("Found size for file " + fileId + ": " + uploadedFileSize + " bytes");
+                        patch = Json.createPatchBuilder()
+                                .add("/fileSize", Json.createValue(uploadedFileSize)).build();
+                        fileJsonObject = patch.apply(fileJsonObject);
+                    } else {
+                        logger.fine("No file size entry found for file "+fileId);
+                    }
                     addFilesJsonData.add(fileJsonObject);
                     countSuccess++;
-                    // } else {
-                    //     globusLogger.info(fileName
-                    //             + " will be skipped from adding to dataset by second API due to missing
-                    //     values ");
-                    //     countError++;
-                    // }
                 } else {
                     myLogger.info(fileName
                             + " will be skipped from adding to dataset in the final AddReplaceFileHelper.addFiles() call. ");
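The three patches above leave each surviving entry of addFilesJsonData with the checksum, MIME type and, when the bulk lookup found it, the size filled in. Roughly, for one entry (all field names except the three patched ones are placeholders):

    before:  { "storageIdentifier": "globusm://<id>", "fileName": "example.csv" }
    after:   { "storageIdentifier": "globusm://<id>", "fileName": "example.csv",
               "md5Hash": "<hash>", "mimeType": "text/csv", "fileSize": 1048576 }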
@@ -1029,7 +1093,7 @@ private void processUploadedFiles(JsonArray filesJsonArray, Dataset dataset, Aut
             // The old code had 2 sec. of sleep, so ...
             Thread.sleep(2000);
 
-            Response addFilesResponse = addFileHelper.addFiles(newjsonData, dataset, authUser);
+            Response addFilesResponse = addFileHelper.addFiles(newjsonData, dataset, authUser, true);
 
             if (addFilesResponse == null) {
                 logger.info("null response from addFiles call");
@@ -1211,7 +1275,7 @@ private GlobusTaskState globusStatusCheck(GlobusEndpoint endpoint, String taskId
         return task;
     }
 
-    public JsonObject calculateMissingMetadataFields(List<String> inputList, Logger globusLogger)
+    private JsonObject calculateMissingMetadataFields(List<String> inputList, Logger globusLogger)
             throws InterruptedException, ExecutionException, IOException {
 
         List<CompletableFuture<FileDetailsHolder>> hashvalueCompletableFutures = inputList.stream()
@@ -1230,7 +1294,7 @@
         });
 
         JsonArrayBuilder filesObject = (JsonArrayBuilder) completableFuture.get();
-
+
         JsonObject output = Json.createObjectBuilder().add("files", filesObject).build();
 
         return output;
diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
index b42fd950528..71c498a4d0b
--- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java
@@ -344,10 +344,20 @@ public List saveAndAddFilesToDataset(DatasetVersion version,
                 try {
                     StorageIO dataAccess = DataAccess.getStorageIO(dataFile);
                     //Populate metadata
-                    dataAccess.open(DataAccessOption.READ_ACCESS);
-                    // (the .open() above makes a remote call to check if
-                    // the file exists and obtains its size)
-                    confirmedFileSize = dataAccess.getSize();
+
+                    // There are direct upload sub-cases where the file size
+                    // is already known at this point. For example, direct uploads
+                    // to S3 that go through the jsf dataset page. Or the Globus
+                    // uploads, where the file sizes are looked up in bulk on
+                    // the completion of the remote upload task.
+                    if (dataFile.getFilesize() >= 0) {
+                        confirmedFileSize = dataFile.getFilesize();
+                    } else {
+                        dataAccess.open(DataAccessOption.READ_ACCESS);
+                        // (the .open() above makes a remote call to check if
+                        // the file exists and obtains its size)
+                        confirmedFileSize = dataAccess.getSize();
+                    }
 
                     // For directly-uploaded files, we will perform the file size
                     // limit and quota checks here. Perform them *again*, in
@@ -362,13 +372,16 @@ public List saveAndAddFilesToDataset(DatasetVersion version,
 
                     if (fileSizeLimit == null || confirmedFileSize < fileSizeLimit) {
 
                         //set file size
-                        logger.fine("Setting file size: " + confirmedFileSize);
-                        dataFile.setFilesize(confirmedFileSize);
+                        if (dataFile.getFilesize() < 0) {
+                            logger.fine("Setting file size: " + confirmedFileSize);
+                            dataFile.setFilesize(confirmedFileSize);
+                        }
 
                         if (dataAccess instanceof S3AccessIO) {
                             ((S3AccessIO) dataAccess).removeTempTag();
                         }
                         savedSuccess = true;
+                        logger.info("directly uploaded file successfully saved. file size: "+dataFile.getFilesize());
                     }
                 } catch (IOException ioex) {
                     logger.warning("Failed to get file size, storage id, or failed to remove the temp tag on the saved S3 object" + dataFile.getStorageIdentifier() + " ("
diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java
index 8ed96690e84..b5eb483c2c8
--- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java
@@ -539,6 +539,12 @@ Whether Harvesting (OAI) service is enabled
          * */
         GlobusSingleFileTransfer,
+        /** Lower limit of the number of files in a Globus upload task where
+         * the batch mode should be utilized in looking up the file information
+         * on the remote end node (file sizes, primarily), instead of individual
+         * lookups.
+         */
+        GlobusBatchLookupSize,
         /**
          * Optional external executables to run on the metadata for dataverses
          * and datasets being published; as an extra validation step, to
diff --git a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java
index 434b3bd8f8f..e769cacfdb1
--- a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java
+++ b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java
@@ -78,6 +78,7 @@ public class SystemConfig {
     public static final long defaultZipDownloadLimit = 104857600L; // 100MB
     private static final int defaultMultipleUploadFilesLimit = 1000;
     private static final int defaultLoginSessionTimeout = 480; // = 8 hours
+    private static final int defaultGlobusBatchLookupSize = 50;
 
     private String buildNumber = null;
@@ -954,6 +955,11 @@ public boolean isGlobusFileDownload() {
         return (isGlobusDownload() && settingsService.isTrueForKey(SettingsServiceBean.Key.GlobusSingleFileTransfer, false));
     }
 
+    public int getGlobusBatchLookupSize() {
+        String batchSizeOption = settingsService.getValueForKey(SettingsServiceBean.Key.GlobusBatchLookupSize);
+        return getIntLimitFromStringOrDefault(batchSizeOption, defaultGlobusBatchLookupSize);
+    }
+
     private Boolean getMethodAvailable(String method, boolean upload) {
         String methods = settingsService.getValueForKey(
                 upload ? SettingsServiceBean.Key.UploadMethods : SettingsServiceBean.Key.DownloadMethods);
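Putting the settings-related pieces together, a short illustration of the effective threshold (the file counts are examples; 50 is the compiled-in default above):

    // Inside the Globus upload workflow (see processUploadedFiles above), the bulk
    // lookup is used only when the number of files in the task reaches the threshold.
    int threshold = systemConfig.getGlobusBatchLookupSize(); // 50 unless :GlobusBatchLookupSize is set
    boolean useBatchLookup = filesJsonArray.size() >= threshold;
    // e.g. a 10-file task keeps the old per-file behavior; a 200-file task issues a
    // single Globus "ls" call and passes the 200 sizes through /addFiles.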
diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties
index 012b389ce32..02bc19f86cf
--- a/src/main/java/propertyFiles/Bundle.properties
+++ b/src/main/java/propertyFiles/Bundle.properties
@@ -307,7 +307,13 @@ notification.typeDescription.WORKFLOW_FAILURE=External workflow run has failed
 notification.typeDescription.STATUSUPDATED=Status of dataset has been updated
 notification.typeDescription.DATASETCREATED=Dataset was created by user
 notification.typeDescription.DATASETMENTIONED=Dataset was referenced in remote system
-
+notification.typeDescription.GLOBUSUPLOADCOMPLETED=Globus upload is completed
+notification.typeDescription.GLOBUSUPLOADCOMPLETEDWITHERRORS=Globus upload completed with errors
+notification.typeDescription.GLOBUSDOWNLOADCOMPLETED=Globus download is completed
+notification.typeDescription.GLOBUSDOWNLOADCOMPLETEDWITHERRORS=Globus download completed with errors
+notification.typeDescription.GLOBUSUPLOADLOCALFAILURE=Globus upload failed, internal error
+notification.typeDescription.GLOBUSUPLOADREMOTEFAILURE=Globus upload failed, remote transfer error
+notification.typeDescription.REQUESTEDFILEACCESS=File access requested
 groupAndRoles.manageTips=Here is where you can access and manage all the groups you belong to, and the roles you have been assigned.
 user.message.signup.label=Create Account
 user.message.signup.tip=Why have a Dataverse account? To create your own dataverse and customize it, add datasets, or request access to restricted files.
@@ -837,7 +843,8 @@ notification.email.datasetWasMentioned=Hello {0},<br><br>The {1} has just been
 notification.email.datasetWasMentioned.subject={0}: A Dataset Relationship has been reported!
 notification.email.globus.uploadCompleted.subject={0}: Files uploaded successfully via Globus and verified
 notification.email.globus.downloadCompleted.subject={0}: Files downloaded successfully via Globus
-notification.email.globus.uploadCompletedWithErrors.subject={0}: Uploaded files via Globus with errors
+notification.email.globus.downloadCompletedWithErrors.subject={0}: Globus download task completed, errors encountered
+notification.email.globus.uploadCompletedWithErrors.subject={0}: Globus upload task completed with errors
 notification.email.globus.uploadFailedRemotely.subject={0}: Failed to upload files via Globus
 notification.email.globus.uploadFailedLocally.subject={0}: Failed to add files uploaded via Globus to dataset
 # dataverse.xhtml