From c35dc5babb06f4af6be0b66962a2517f6ffd9082 Mon Sep 17 00:00:00 2001 From: eskander Date: Thu, 19 Oct 2023 14:15:56 +0300 Subject: [PATCH] [CST-7695] detect the duplication and return list of matched objects into ExternalDataObject --- .../service/impl/SolrDedupServiceImpl.java | 16 +++ .../utils/MD5ValueSignature.java | 100 ++++++++++++++++++ .../app/deduplication/utils/Signature.java | 5 + .../external/model/ExternalDataObject.java | 15 +++ .../service/impl/ExternalDataServiceImpl.java | 64 ++++++++++- .../ExternalSourceEntryRestConverter.java | 37 +++++++ .../rest/model/ExternalSourceEntryRest.java | 11 ++ .../rest/ExternalSourcesRestControllerIT.java | 59 +++++++++++ .../provider/impl/MockDataProvider.java | 1 + 9 files changed, 307 insertions(+), 1 deletion(-) diff --git a/dspace-api/src/main/java/org/dspace/app/deduplication/service/impl/SolrDedupServiceImpl.java b/dspace-api/src/main/java/org/dspace/app/deduplication/service/impl/SolrDedupServiceImpl.java index 6f719ff85f2c..cc5c0f2bc861 100644 --- a/dspace-api/src/main/java/org/dspace/app/deduplication/service/impl/SolrDedupServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/app/deduplication/service/impl/SolrDedupServiceImpl.java @@ -314,6 +314,22 @@ private void fillSignature(Context ctx, DSpaceObject iu, Map plainSignatures = algo.getPlainSignature(iu, ctx); + for (String signature : plainSignatures) { + if (StringUtils.isNotEmpty(signature)) { + String key = "plain_" + algo.getSignatureType() + "_signature"; + if (tmpMapFilter.containsKey(key)) { + List obj = tmpMapFilter.get(key); + obj.add(signature); + tmpMapFilter.put(key, obj); + } else { + List obj = new ArrayList(); + obj.add(signature); + tmpMapFilter.put(key, obj); + } + } + } } } diff --git a/dspace-api/src/main/java/org/dspace/app/deduplication/utils/MD5ValueSignature.java b/dspace-api/src/main/java/org/dspace/app/deduplication/utils/MD5ValueSignature.java index 8b047584bc1b..aacc7aa9ae0e 100644 --- a/dspace-api/src/main/java/org/dspace/app/deduplication/utils/MD5ValueSignature.java +++ b/dspace-api/src/main/java/org/dspace/app/deduplication/utils/MD5ValueSignature.java @@ -6,6 +6,7 @@ * http://www.dspace.org/license/ */ package org.dspace.app.deduplication.utils; + import java.io.UnsupportedEncodingException; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -13,6 +14,7 @@ import java.util.List; import java.util.Locale; import java.util.Objects; +import java.util.stream.Collectors; import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; @@ -22,12 +24,15 @@ import org.apache.logging.log4j.Logger; import org.dspace.content.DSpaceObject; import org.dspace.content.Item; +import org.dspace.content.MetadataFieldName; import org.dspace.content.MetadataValue; import org.dspace.content.WorkspaceItem; +import org.dspace.content.dto.MetadataValueDTO; import org.dspace.content.factory.ContentServiceFactory; import org.dspace.content.service.ItemService; import org.dspace.content.service.WorkspaceItemService; import org.dspace.core.Context; +import org.dspace.external.model.ExternalDataObject; import org.dspace.workflow.WorkflowItem; import org.dspace.workflow.WorkflowItemService; import org.dspace.workflow.factory.WorkflowServiceFactory; @@ -95,6 +100,37 @@ public List getSignature(DSpaceObject item, Context context) { } } + public List getPlainSignature(DSpaceObject item, Context context) { + List result = new ArrayList(); + try { + MessageDigest digester = MessageDigest.getInstance("MD5"); + List values = getMultiValue(item, metadata); + if (values != null) { + for (String value : values) { + if (StringUtils.isNotEmpty(value)) { + String valueNorm = normalize(item, value); + digester.update(valueNorm.getBytes("UTF-8")); + byte[] signature = digester.digest(); + char[] arr = new char[signature.length << 1]; + for (int i = 0; i < signature.length; i++) { + int b = signature[i]; + int idx = i << 1; + arr[idx] = HEX_DIGITS[(b >> 4) & 0xf]; + arr[idx + 1] = HEX_DIGITS[b & 0xf]; + } + String sigString = new String(arr); + result.add(sigString); + } + } + } + return result; + } catch (NoSuchAlgorithmException e) { + throw new RuntimeException(e.getMessage(), e); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + protected String normalize(DSpaceObject item, Context context, String value) { if (value != null) { String temp = StringUtils.EMPTY; @@ -210,6 +246,70 @@ protected List getMultiValue(DSpaceObject item, String metadata) { return retValue; } + public List getSignature(ExternalDataObject object) { + List result = new ArrayList(); + try { + MessageDigest digester = MessageDigest.getInstance("MD5"); + List values = getMultiValue(object, metadata); + if (values != null) { + for (String value : values) { + if (StringUtils.isNotEmpty(value)) { + String valueNorm = normalize(object, value); + digester.update(valueNorm.getBytes("UTF-8")); + byte[] signature = digester.digest(); + char[] arr = new char[signature.length << 1]; + for (int i = 0; i < signature.length; i++) { + int b = signature[i]; + int idx = i << 1; + arr[idx] = HEX_DIGITS[(b >> 4) & 0xf]; + arr[idx + 1] = HEX_DIGITS[b & 0xf]; + } + String sigString = new String(arr); + result.add(sigString); + } + } + } + return result; + } catch (NoSuchAlgorithmException e) { + throw new RuntimeException(e.getMessage(), e); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException(e.getMessage(), e); + } + } + + protected List getMultiValue(ExternalDataObject object, String metadata) { + return object.getMetadata() + .stream() + .filter(metadataValueDTO -> + new MetadataFieldName(metadataValueDTO.getSchema(), metadataValueDTO.getElement(), + metadataValueDTO.getQualifier()).toString().equals(metadata)) + .map(MetadataValueDTO::getValue) + .collect(Collectors.toList()); + } + + protected String normalize(ExternalDataObject object, String value) { + String result = value; + if (StringUtils.isEmpty(value)) { + if (StringUtils.isNotEmpty(prefix)) { + result = prefix + object.getId(); + } else { + result = "entity:" + object.getId(); + } + } else { + for (String prefix : ignorePrefix) { + if (value.startsWith(prefix)) { + result = value.substring(prefix.length()); + break; + } + } + if (StringUtils.isNotEmpty(prefix)) { + result = prefix + result; + } + } + + return result; + } + public String getMetadata() { return metadata; } diff --git a/dspace-api/src/main/java/org/dspace/app/deduplication/utils/Signature.java b/dspace-api/src/main/java/org/dspace/app/deduplication/utils/Signature.java index 2bf662b39d75..81a0fb228911 100644 --- a/dspace-api/src/main/java/org/dspace/app/deduplication/utils/Signature.java +++ b/dspace-api/src/main/java/org/dspace/app/deduplication/utils/Signature.java @@ -11,10 +11,15 @@ import org.dspace.content.DSpaceObject; import org.dspace.core.Context; +import org.dspace.external.model.ExternalDataObject; public interface Signature { public List getSignature(/* BrowsableDSpaceObject */DSpaceObject item, Context context); + public List getPlainSignature(DSpaceObject item, Context context); + + public List getSignature(ExternalDataObject object); + public int getResourceTypeID(); public String getSignatureType(); diff --git a/dspace-api/src/main/java/org/dspace/external/model/ExternalDataObject.java b/dspace-api/src/main/java/org/dspace/external/model/ExternalDataObject.java index eac9921df6cc..44ad6a70953e 100644 --- a/dspace-api/src/main/java/org/dspace/external/model/ExternalDataObject.java +++ b/dspace-api/src/main/java/org/dspace/external/model/ExternalDataObject.java @@ -9,6 +9,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.UUID; import org.dspace.content.dto.MetadataValueDTO; @@ -38,6 +39,8 @@ public class ExternalDataObject { */ private String displayValue; + private List matchUUIDs; + /** * Default constructor */ @@ -143,4 +146,16 @@ public String getValue() { public void setValue(String value) { this.value = value; } + + public List getMatchUUIDs() { + return matchUUIDs; + } + + public void setMatchUUIDs(List matchUUIDs) { + this.matchUUIDs = matchUUIDs; + } + + public boolean isDuplicated() { + return !matchUUIDs.isEmpty(); + } } diff --git a/dspace-api/src/main/java/org/dspace/external/service/impl/ExternalDataServiceImpl.java b/dspace-api/src/main/java/org/dspace/external/service/impl/ExternalDataServiceImpl.java index 7804dfa5689f..76e4fff4f527 100644 --- a/dspace-api/src/main/java/org/dspace/external/service/impl/ExternalDataServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/external/service/impl/ExternalDataServiceImpl.java @@ -7,12 +7,24 @@ */ package org.dspace.external.service.impl; +import static org.dspace.app.deduplication.service.impl.SolrDedupServiceImpl.RESOURCE_FLAG_FIELD; +import static org.dspace.app.deduplication.service.impl.SolrDedupServiceImpl.RESOURCE_IDS_FIELD; +import static org.dspace.app.deduplication.service.impl.SolrDedupServiceImpl.RESOURCE_SIGNATURE_FIELD; + import java.sql.SQLException; +import java.util.ArrayList; import java.util.List; import java.util.Optional; +import java.util.UUID; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.Logger; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.dspace.app.deduplication.service.DedupService; +import org.dspace.app.deduplication.service.impl.SolrDedupServiceImpl; +import org.dspace.app.deduplication.utils.Signature; import org.dspace.app.suggestion.SuggestionProvider; import org.dspace.app.suggestion.SuggestionService; import org.dspace.authorize.AuthorizeException; @@ -22,11 +34,14 @@ import org.dspace.content.dto.MetadataValueDTO; import org.dspace.content.service.ItemService; import org.dspace.content.service.WorkspaceItemService; +import org.dspace.core.Constants; import org.dspace.core.Context; import org.dspace.core.LogHelper; +import org.dspace.discovery.SearchServiceException; import org.dspace.external.model.ExternalDataObject; import org.dspace.external.provider.ExternalDataProvider; import org.dspace.external.service.ExternalDataService; +import org.dspace.utils.DSpace; import org.springframework.beans.factory.annotation.Autowired; /** @@ -49,6 +64,9 @@ public class ExternalDataServiceImpl implements ExternalDataService { @Autowired private SuggestionService suggestionService; + @Autowired + private DedupService dedupService; + @Override public Optional getExternalDataObject(String source, String id) { ExternalDataProvider provider = getExternalDataProvider(source); @@ -64,9 +82,53 @@ public List searchExternalDataObjects(String source, String if (provider == null) { throw new IllegalArgumentException("Provider for: " + source + " couldn't be found"); } - return provider.searchExternalDataObjects(query, start, limit); + + List externalDataObjects = provider.searchExternalDataObjects(query, start, limit); + appendMatchedUUIDs(externalDataObjects); + + return externalDataObjects; + } + + private void appendMatchedUUIDs(List externalDataObjects) { + for (ExternalDataObject externalDataObject : externalDataObjects) { + List uuids = new ArrayList<>(); + try { + QueryResponse response = dedupService.find("*:*", buildFilters(externalDataObject)); + for (SolrDocument resultDoc : response.getResults()) { + uuids.addAll(resultDoc.getFieldValues(RESOURCE_IDS_FIELD) + .stream() + .map(id -> + UUID.fromString(String.valueOf(id))) + .collect(Collectors.toList())); + } + externalDataObject.setMatchUUIDs(uuids); + } catch (SearchServiceException e) { + throw new RuntimeException(e); + } + } + } + + private String[] buildFilters(ExternalDataObject object) { + List filters = new ArrayList<>(); + List allSignatures = getAllSignatures(object); + + if (!allSignatures.isEmpty()) { + filters.add(RESOURCE_FLAG_FIELD + ":" + SolrDedupServiceImpl.DeduplicationFlag.FAKE.getDescription()); + filters.add(RESOURCE_SIGNATURE_FIELD + ":(" + + StringUtils.joinWith(" OR ", allSignatures.stream().toArray(String[]::new)) + ")"); + } + + return filters.toArray(new String[filters.size()]); } + private List getAllSignatures(ExternalDataObject iu) { + List signAlgo = new DSpace().getServiceManager().getServicesByType(Signature.class); + return signAlgo.stream() + .filter(algo -> Constants.ITEM == algo.getResourceTypeID()) + .flatMap(algo -> algo.getSignature(iu).stream()) + .filter(signature -> StringUtils.isNotEmpty(signature)) + .collect(Collectors.toList()); + } @Override public List getExternalDataProviders() { diff --git a/dspace-server-webapp/src/main/java/org/dspace/app/rest/converter/ExternalSourceEntryRestConverter.java b/dspace-server-webapp/src/main/java/org/dspace/app/rest/converter/ExternalSourceEntryRestConverter.java index 585de2a99a57..35921c16d254 100644 --- a/dspace-server-webapp/src/main/java/org/dspace/app/rest/converter/ExternalSourceEntryRestConverter.java +++ b/dspace-server-webapp/src/main/java/org/dspace/app/rest/converter/ExternalSourceEntryRestConverter.java @@ -7,9 +7,19 @@ */ package org.dspace.app.rest.converter; +import java.sql.SQLException; +import java.util.List; +import java.util.Objects; +import java.util.UUID; +import java.util.stream.Collectors; + import org.dspace.app.rest.model.ExternalSourceEntryRest; +import org.dspace.app.rest.model.ItemRest; import org.dspace.app.rest.projection.Projection; +import org.dspace.content.service.ItemService; +import org.dspace.core.Context; import org.dspace.external.model.ExternalDataObject; +import org.dspace.web.ContextUtil; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; @@ -22,6 +32,12 @@ public class ExternalSourceEntryRestConverter implements DSpaceConverter convertToItemRests(List uuids, Projection projection) { + + if (uuids == null) { + return List.of(); + } + + Context context = ContextUtil.obtainCurrentRequestContext(); + return uuids.stream() + .map(uuid -> { + try { + return itemService.find(context, uuid); + } catch (SQLException e) { + throw new RuntimeException(e); + } + }) + .filter(item -> Objects.nonNull(item)) + .map(item -> itemConverter.convert(item, projection)) + .collect(Collectors.toList()); + } + public Class getModelClass() { return ExternalDataObject.class; } diff --git a/dspace-server-webapp/src/main/java/org/dspace/app/rest/model/ExternalSourceEntryRest.java b/dspace-server-webapp/src/main/java/org/dspace/app/rest/model/ExternalSourceEntryRest.java index 06af7e222713..4e578c313870 100644 --- a/dspace-server-webapp/src/main/java/org/dspace/app/rest/model/ExternalSourceEntryRest.java +++ b/dspace-server-webapp/src/main/java/org/dspace/app/rest/model/ExternalSourceEntryRest.java @@ -7,6 +7,8 @@ */ package org.dspace.app.rest.model; +import java.util.List; + import org.dspace.app.rest.ExternalSourcesRestController; /** @@ -38,6 +40,7 @@ public String getType() { private String value; private String externalSource; private MetadataRest metadata = new MetadataRest(); + private List matchObjects; /** * Generic getter for the id @@ -118,4 +121,12 @@ public MetadataRest getMetadata() { public void setMetadata(MetadataRest metadata) { this.metadata = metadata; } + + public List getMatchObjects() { + return matchObjects; + } + + public void setMatchObjects(List matchObjects) { + this.matchObjects = matchObjects; + } } diff --git a/dspace-server-webapp/src/test/java/org/dspace/app/rest/ExternalSourcesRestControllerIT.java b/dspace-server-webapp/src/test/java/org/dspace/app/rest/ExternalSourcesRestControllerIT.java index 565a4d003f78..7c396e803537 100644 --- a/dspace-server-webapp/src/test/java/org/dspace/app/rest/ExternalSourcesRestControllerIT.java +++ b/dspace-server-webapp/src/test/java/org/dspace/app/rest/ExternalSourcesRestControllerIT.java @@ -19,15 +19,18 @@ import org.dspace.app.rest.matcher.EntityTypeMatcher; import org.dspace.app.rest.matcher.ExternalSourceEntryMatcher; import org.dspace.app.rest.matcher.ExternalSourceMatcher; +import org.dspace.app.rest.matcher.ItemMatcher; import org.dspace.app.rest.matcher.PageMatcher; import org.dspace.app.rest.test.AbstractControllerIntegrationTest; import org.dspace.builder.CollectionBuilder; import org.dspace.builder.CommunityBuilder; import org.dspace.builder.EntityTypeBuilder; +import org.dspace.builder.ItemBuilder; import org.dspace.builder.WorkflowItemBuilder; import org.dspace.content.Collection; import org.dspace.content.Community; import org.dspace.content.EntityType; +import org.dspace.content.Item; import org.dspace.core.CrisConstants; import org.dspace.external.provider.AbstractExternalDataProvider; import org.dspace.external.provider.ExternalDataProvider; @@ -485,4 +488,60 @@ public void findSupportedEntityTypesOfAnExternalDataProviderPaginationTest() thr } } + @Test + public void findOneExternalSourceEntriesDuplicationTest() throws Exception { + context.turnOffAuthorisationSystem(); + + parentCommunity = CommunityBuilder.createCommunity(context) + .withName("Parent Community") + .build(); + + Community child1 = CommunityBuilder.createSubCommunity(context, parentCommunity) + .withName("Sub Community") + .build(); + Collection col1 = CollectionBuilder.createCollection(context, child1).withName("Collection 1").build(); + + // create item withDoiIdentifier equals 10.1016/j.procs.2017.03.031 + Item itemOne = ItemBuilder.createItem(context, col1) + .withFullName("Public item one") + .withIssueDate("2023-10-17") + .withDoiIdentifier("10.1016/j.procs.2017.03.031") + .withEntityType("Publication") + .build(); + + // create another item withDoiIdentifier equals 10.1016/j.procs.2017.03.031 + Item itemTwo = ItemBuilder.createItem(context, col1) + .withFullName("Public item two") + .withIssueDate("2023-10-17") + .withDoiIdentifier("10.1016/j.procs.2017.03.031") + .withEntityType("Publication") + .build(); + + context.restoreAuthSystemState(); + + getClient().perform(get("/api/integration/externalsources/mock/entries") + .param("query", "one").param("size", "1")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$._embedded.externalSourceEntries", Matchers.hasItem( + ExternalSourceEntryMatcher.matchExternalSourceEntry("onetwo", "onetwo", "onetwo", "mock") + ))) + .andExpect(jsonPath("$._embedded.externalSourceEntries[0].matchObjects", containsInAnyOrder( + ItemMatcher.matchItemProperties(itemOne), + ItemMatcher.matchItemProperties(itemTwo) + ))) + .andExpect(jsonPath("$.page", PageMatcher.pageEntryWithTotalPagesAndElements(0, 1, 2, 2))); + + getClient().perform(get("/api/integration/externalsources/mock/entries") + .param("query", "one").param("size", "1").param("page", "1")) + .andExpect(status().isOk()) + .andExpect(jsonPath("$._embedded.externalSourceEntries", Matchers.hasItem( + ExternalSourceEntryMatcher.matchExternalSourceEntry("one", "one", "one", "mock") + ))) + .andExpect(jsonPath("$._embedded.externalSourceEntries[0].matchObjects", containsInAnyOrder( + ItemMatcher.matchItemProperties(itemOne), + ItemMatcher.matchItemProperties(itemTwo) + ))) + .andExpect(jsonPath("$.page", PageMatcher.pageEntryWithTotalPagesAndElements(1, 1, 2, 2))); + } + } diff --git a/dspace-server-webapp/src/test/java/org/dspace/external/provider/impl/MockDataProvider.java b/dspace-server-webapp/src/test/java/org/dspace/external/provider/impl/MockDataProvider.java index 894b8e409a4f..0a0b4f062d31 100644 --- a/dspace-server-webapp/src/test/java/org/dspace/external/provider/impl/MockDataProvider.java +++ b/dspace-server-webapp/src/test/java/org/dspace/external/provider/impl/MockDataProvider.java @@ -85,6 +85,7 @@ public void init() throws IOException { externalDataObject.setDisplayValue(id); List list = new LinkedList<>(); list.add(new MetadataValueDTO("dc", "contributor", "author", null, "Donald, Smith")); + list.add(new MetadataValueDTO("dc", "identifier", "doi", null, "10.1016/j.procs.2017.03.031")); externalDataObject.setMetadata(list); mockLookupMap.put(id, externalDataObject);