Skip to content

Commit

Permalink
Merged in CST-7695 (pull request DSpace#1214)
Browse files Browse the repository at this point in the history
[CST-7695] detect the duplication and return list of matched objects into ExternalDataObject

Approved-by: Stefano Maffei
  • Loading branch information
eskander17 authored and steph-ieffam committed Oct 26, 2023
2 parents c060a18 + c35dc5b commit 458b710
Show file tree
Hide file tree
Showing 9 changed files with 307 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,22 @@ private void fillSignature(Context ctx, DSpaceObject iu, Map<String, List<String
}
}
}

// Collect the "plain" signatures produced by the current algorithm under a
// dedicated per-type key of the form "plain_<signatureType>_signature".
List<String> plainSignatures = algo.getPlainSignature(iu, ctx);
if (plainSignatures != null) {
    for (String signature : plainSignatures) {
        // Empty signatures carry no information and are not stored.
        if (StringUtils.isNotEmpty(signature)) {
            String key = "plain_" + algo.getSignatureType() + "_signature";
            // computeIfAbsent creates the bucket on first use, replacing the
            // previous containsKey/get/put sequence with a single lookup.
            tmpMapFilter.computeIfAbsent(key, plainKey -> new ArrayList<String>()).add(signature);
        }
    }
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
* http://www.dspace.org/license/
*/
package org.dspace.app.deduplication.utils;

import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.stream.Collectors;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
Expand All @@ -22,12 +24,15 @@
import org.apache.logging.log4j.Logger;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.content.MetadataFieldName;
import org.dspace.content.MetadataValue;
import org.dspace.content.WorkspaceItem;
import org.dspace.content.dto.MetadataValueDTO;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.ItemService;
import org.dspace.content.service.WorkspaceItemService;
import org.dspace.core.Context;
import org.dspace.external.model.ExternalDataObject;
import org.dspace.workflow.WorkflowItem;
import org.dspace.workflow.WorkflowItemService;
import org.dspace.workflow.factory.WorkflowServiceFactory;
Expand Down Expand Up @@ -95,6 +100,37 @@ public List<String> getSignature(DSpaceObject item, Context context) {
}
}

/**
 * Builds one hexadecimal MD5 signature for every non-empty value of the
 * configured {@code metadata} field found on the given object.
 * <p>
 * Each value is passed through {@code normalize(item, value)} before being
 * hashed; the {@code context} argument is not used by this method.
 *
 * @param item    the object whose metadata values are hashed
 * @param context the current context (unused here)
 * @return the computed signatures, possibly empty, never {@code null}
 * @throws RuntimeException if MD5 or UTF-8 is not available on this JVM
 */
public List<String> getPlainSignature(DSpaceObject item, Context context) {
    List<String> signatures = new ArrayList<>();
    try {
        MessageDigest md5 = MessageDigest.getInstance("MD5");
        List<String> values = getMultiValue(item, metadata);
        if (values == null) {
            return signatures;
        }
        for (String value : values) {
            if (StringUtils.isEmpty(value)) {
                continue;
            }
            byte[] normalized = normalize(item, value).getBytes("UTF-8");
            // digest(byte[]) performs the final update and then resets the
            // digester, so the same instance can be reused for the next value.
            byte[] hash = md5.digest(normalized);
            char[] hex = new char[hash.length * 2];
            int pos = 0;
            for (byte b : hash) {
                hex[pos++] = HEX_DIGITS[(b >> 4) & 0xf];
                hex[pos++] = HEX_DIGITS[b & 0xf];
            }
            signatures.add(new String(hex));
        }
        return signatures;
    } catch (NoSuchAlgorithmException | UnsupportedEncodingException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
}

protected String normalize(DSpaceObject item, Context context, String value) {
if (value != null) {
String temp = StringUtils.EMPTY;
Expand Down Expand Up @@ -210,6 +246,70 @@ protected List<String> getMultiValue(DSpaceObject item, String metadata) {
return retValue;
}

/**
 * Builds one hexadecimal MD5 signature for every non-empty value of the
 * configured {@code metadata} field carried by the given external object.
 * <p>
 * Each value is normalized with
 * {@link #normalize(ExternalDataObject, String)} before being hashed.
 *
 * @param object the external data object whose metadata values are hashed
 * @return the computed signatures, possibly empty, never {@code null}
 * @throws RuntimeException if MD5 or UTF-8 is not available on this JVM
 */
public List<String> getSignature(ExternalDataObject object) {
    List<String> signatures = new ArrayList<>();
    try {
        MessageDigest md5 = MessageDigest.getInstance("MD5");
        List<String> values = getMultiValue(object, metadata);
        if (values == null) {
            return signatures;
        }
        for (String value : values) {
            if (StringUtils.isEmpty(value)) {
                continue;
            }
            String normalized = normalize(object, value);
            md5.update(normalized.getBytes("UTF-8"));
            // digest() also resets the digester for the next value.
            byte[] hash = md5.digest();
            char[] hex = new char[2 * hash.length];
            for (int i = 0, out = 0; i < hash.length; i++, out += 2) {
                int current = hash[i];
                hex[out] = HEX_DIGITS[(current >> 4) & 0xf];
                hex[out + 1] = HEX_DIGITS[current & 0xf];
            }
            signatures.add(new String(hex));
        }
        return signatures;
    } catch (NoSuchAlgorithmException | UnsupportedEncodingException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
}

/**
 * Returns the values of all metadata entries of the external object whose
 * field name, as rendered by {@link MetadataFieldName#toString()}, equals
 * the requested one.
 *
 * @param object   the external data object to read metadata from
 * @param metadata the field name to match (this parameter shadows the
 *                 {@code metadata} field of the class)
 * @return the matching values in their original order
 */
protected List<String> getMultiValue(ExternalDataObject object, String metadata) {
    return object.getMetadata()
        .stream()
        .filter((MetadataValueDTO dto) -> {
            MetadataFieldName fieldName =
                new MetadataFieldName(dto.getSchema(), dto.getElement(), dto.getQualifier());
            return fieldName.toString().equals(metadata);
        })
        .map(dto -> dto.getValue())
        .collect(Collectors.toList());
}

/**
 * Normalizes a metadata value of an external object before it is hashed.
 * <ul>
 * <li>For an empty value the result is the configured {@code prefix}
 * (or {@code "entity:"} when no prefix is configured) followed by the
 * object id.</li>
 * <li>Otherwise the first entry of {@code ignorePrefix} that the value
 * starts with is stripped, and the configured {@code prefix}, if any, is
 * prepended.</li>
 * </ul>
 *
 * @param object the external data object the value belongs to
 * @param value  the raw metadata value, may be empty
 * @return the normalized value
 */
protected String normalize(ExternalDataObject object, String value) {
    boolean hasPrefix = StringUtils.isNotEmpty(prefix);

    if (StringUtils.isEmpty(value)) {
        // No usable value: fall back to an id-based pseudo value.
        if (hasPrefix) {
            return prefix + object.getId();
        }
        return "entity:" + object.getId();
    }

    String stripped = value;
    for (String ignorable : ignorePrefix) {
        if (value.startsWith(ignorable)) {
            // Only the first matching ignorable prefix is removed.
            stripped = value.substring(ignorable.length());
            break;
        }
    }

    if (hasPrefix) {
        return prefix + stripped;
    }
    return stripped;
}

public String getMetadata() {
return metadata;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@

import org.dspace.content.DSpaceObject;
import org.dspace.core.Context;
import org.dspace.external.model.ExternalDataObject;

public interface Signature {
public List<String> getSignature(/* BrowsableDSpaceObject */DSpaceObject item, Context context);

/**
 * Computes the "plain" signatures of the given object.
 * NOTE(review): how a plain signature differs from the one returned by
 * {@code getSignature(DSpaceObject, Context)} is up to the implementation —
 * confirm against the implementing classes.
 *
 * @param item    the object to compute the signatures for
 * @param context the current context
 * @return the list of signatures
 */
public List<String> getPlainSignature(DSpaceObject item, Context context);

/**
 * Computes the signatures of an external data object, i.e. an object that
 * is described by an {@link ExternalDataObject} rather than by a
 * {@link DSpaceObject}.
 *
 * @param object the external data object to compute the signatures for
 * @return the list of signatures
 */
public List<String> getSignature(ExternalDataObject object);

public int getResourceTypeID();

public String getSignatureType();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

import org.dspace.content.dto.MetadataValueDTO;

Expand Down Expand Up @@ -38,6 +39,8 @@ public class ExternalDataObject {
*/
private String displayValue;

private List<UUID> matchUUIDs;

/**
* Default constructor
*/
Expand Down Expand Up @@ -143,4 +146,16 @@ public String getValue() {
public void setValue(String value) {
this.value = value;
}

/**
 * Returns the UUIDs recorded as matches for this external object.
 *
 * @return the stored list of matching UUIDs; {@code null} if none has been
 *         set
 */
public List<UUID> getMatchUUIDs() {
    return this.matchUUIDs;
}

/**
 * Stores the UUIDs recorded as matches for this external object.
 * The given list is kept by reference, not copied.
 *
 * @param matchUUIDs the list of matching UUIDs
 */
public void setMatchUUIDs(List<UUID> matchUUIDs) {
    this.matchUUIDs = matchUUIDs;
}

/**
 * Tells whether at least one match has been recorded for this external
 * object.
 * <p>
 * The match list has no default value, so an object on which
 * {@link #setMatchUUIDs(List)} was never called (or was called with
 * {@code null}) is reported as not duplicated instead of raising a
 * {@link NullPointerException}.
 *
 * @return {@code true} if the match list is non-null and non-empty
 */
public boolean isDuplicated() {
    return matchUUIDs != null && !matchUUIDs.isEmpty();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,24 @@
*/
package org.dspace.external.service.impl;

import static org.dspace.app.deduplication.service.impl.SolrDedupServiceImpl.RESOURCE_FLAG_FIELD;
import static org.dspace.app.deduplication.service.impl.SolrDedupServiceImpl.RESOURCE_IDS_FIELD;
import static org.dspace.app.deduplication.service.impl.SolrDedupServiceImpl.RESOURCE_SIGNATURE_FIELD;

import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.UUID;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.Logger;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.dspace.app.deduplication.service.DedupService;
import org.dspace.app.deduplication.service.impl.SolrDedupServiceImpl;
import org.dspace.app.deduplication.utils.Signature;
import org.dspace.app.suggestion.SuggestionProvider;
import org.dspace.app.suggestion.SuggestionService;
import org.dspace.authorize.AuthorizeException;
Expand All @@ -22,11 +34,14 @@
import org.dspace.content.dto.MetadataValueDTO;
import org.dspace.content.service.ItemService;
import org.dspace.content.service.WorkspaceItemService;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.core.LogHelper;
import org.dspace.discovery.SearchServiceException;
import org.dspace.external.model.ExternalDataObject;
import org.dspace.external.provider.ExternalDataProvider;
import org.dspace.external.service.ExternalDataService;
import org.dspace.utils.DSpace;
import org.springframework.beans.factory.annotation.Autowired;

/**
Expand All @@ -49,6 +64,9 @@ public class ExternalDataServiceImpl implements ExternalDataService {
@Autowired
private SuggestionService suggestionService;

@Autowired
private DedupService dedupService;

@Override
public Optional<ExternalDataObject> getExternalDataObject(String source, String id) {
ExternalDataProvider provider = getExternalDataProvider(source);
Expand All @@ -64,9 +82,53 @@ public List<ExternalDataObject> searchExternalDataObjects(String source, String
if (provider == null) {
throw new IllegalArgumentException("Provider for: " + source + " couldn't be found");
}
return provider.searchExternalDataObjects(query, start, limit);

List<ExternalDataObject> externalDataObjects = provider.searchExternalDataObjects(query, start, limit);
appendMatchedUUIDs(externalDataObjects);

return externalDataObjects;
}

/**
 * Looks up, for every given external object, the resources in the
 * deduplication index that share one of its signatures and stores their
 * UUIDs on the object via {@link ExternalDataObject#setMatchUUIDs(List)}.
 * <p>
 * Objects for which no filter can be built (no signature available) get an
 * empty match list and are not searched at all: running the bare
 * {@code *:*} query without any filter would not be restricted by any
 * signature and would report unrelated index entries as matches.
 *
 * @param externalDataObjects the objects to enrich with their matches
 * @throws RuntimeException wrapping the {@link SearchServiceException}
 *                          raised by the deduplication search
 */
private void appendMatchedUUIDs(List<ExternalDataObject> externalDataObjects) {
    for (ExternalDataObject externalDataObject : externalDataObjects) {
        List<UUID> uuids = new ArrayList<>();
        String[] filters = buildFilters(externalDataObject);
        if (filters.length == 0) {
            // Nothing to match on: record "no matches" and skip the query.
            externalDataObject.setMatchUUIDs(uuids);
            continue;
        }
        try {
            QueryResponse response = dedupService.find("*:*", filters);
            for (SolrDocument resultDoc : response.getResults()) {
                // getFieldValues yields null when the document lacks the field.
                Iterable<Object> ids = resultDoc.getFieldValues(RESOURCE_IDS_FIELD);
                if (ids == null) {
                    continue;
                }
                for (Object id : ids) {
                    uuids.add(UUID.fromString(String.valueOf(id)));
                }
            }
            externalDataObject.setMatchUUIDs(uuids);
        } catch (SearchServiceException e) {
            throw new RuntimeException(e);
        }
    }
}

/**
 * Builds the filter queries used to look up possible duplicates of the
 * given external object.
 * <p>
 * When the object has at least one signature, two filters are returned:
 * one on the resource flag (value taken from
 * {@code DeduplicationFlag.FAKE}) and one matching any of the signatures,
 * joined with {@code OR}. Without signatures the result is an empty array.
 *
 * @param object the external data object to build the filters for
 * @return the filter queries, possibly empty
 */
private String[] buildFilters(ExternalDataObject object) {
    List<String> signatures = getAllSignatures(object);
    if (signatures.isEmpty()) {
        return new String[0];
    }

    String flagFilter = RESOURCE_FLAG_FIELD + ":"
        + SolrDedupServiceImpl.DeduplicationFlag.FAKE.getDescription();
    String signatureFilter = RESOURCE_SIGNATURE_FIELD + ":(" + String.join(" OR ", signatures) + ")";

    return new String[] { flagFilter, signatureFilter };
}

/**
 * Collects the non-empty signatures of the given external object produced
 * by every registered {@link Signature} service whose resource type is
 * {@code Constants.ITEM}.
 *
 * @param iu the external data object to compute the signatures for
 * @return all non-empty signatures, in the order they were produced
 */
private List<String> getAllSignatures(ExternalDataObject iu) {
    List<Signature> algorithms = new DSpace().getServiceManager().getServicesByType(Signature.class);
    return algorithms.stream()
        .filter(algorithm -> algorithm.getResourceTypeID() == Constants.ITEM)
        .map(algorithm -> algorithm.getSignature(iu))
        .flatMap(List::stream)
        .filter(StringUtils::isNotEmpty)
        .collect(Collectors.toList());
}

@Override
public List<ExternalDataProvider> getExternalDataProviders() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,19 @@
*/
package org.dspace.app.rest.converter;

import java.sql.SQLException;
import java.util.List;
import java.util.Objects;
import java.util.UUID;
import java.util.stream.Collectors;

import org.dspace.app.rest.model.ExternalSourceEntryRest;
import org.dspace.app.rest.model.ItemRest;
import org.dspace.app.rest.projection.Projection;
import org.dspace.content.service.ItemService;
import org.dspace.core.Context;
import org.dspace.external.model.ExternalDataObject;
import org.dspace.web.ContextUtil;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

Expand All @@ -22,6 +32,12 @@ public class ExternalSourceEntryRestConverter implements DSpaceConverter<Externa
@Autowired
private MetadataValueDTOListConverter metadataConverter;

@Autowired
private ItemService itemService;

@Autowired
private ItemConverter itemConverter;

public ExternalSourceEntryRest convert(ExternalDataObject modelObject, Projection projection) {
ExternalSourceEntryRest externalSourceEntryRest = new ExternalSourceEntryRest();
externalSourceEntryRest.setId(modelObject.getId());
Expand All @@ -30,9 +46,30 @@ public ExternalSourceEntryRest convert(ExternalDataObject modelObject, Projectio
externalSourceEntryRest.setValue(modelObject.getValue());
externalSourceEntryRest.setExternalSource(modelObject.getSource());
externalSourceEntryRest.setMetadata(metadataConverter.convert(modelObject.getMetadata()));
externalSourceEntryRest.setMatchObjects(convertToItemRests(modelObject.getMatchUUIDs(), projection));
return externalSourceEntryRest;
}

/**
 * Resolves the given UUIDs to items and converts each item found to its
 * REST representation. UUIDs for which the item service returns
 * {@code null} are silently dropped.
 *
 * @param uuids      the UUIDs to resolve, may be {@code null}
 * @param projection the projection passed on to the item converter
 * @return the converted items; an empty list when {@code uuids} is
 *         {@code null}
 * @throws RuntimeException wrapping any {@link SQLException} raised while
 *                          looking up an item
 */
private List<ItemRest> convertToItemRests(List<UUID> uuids, Projection projection) {
    if (uuids == null) {
        return List.of();
    }

    final Context requestContext = ContextUtil.obtainCurrentRequestContext();
    return uuids.stream()
        .map(matchId -> {
            try {
                return itemService.find(requestContext, matchId);
            } catch (SQLException e) {
                throw new RuntimeException(e);
            }
        })
        .filter(Objects::nonNull)
        .map(found -> itemConverter.convert(found, projection))
        .collect(Collectors.toList());
}

/**
 * Returns the model class handled by this converter.
 *
 * @return {@code ExternalDataObject.class}
 */
public Class<ExternalDataObject> getModelClass() {
    return ExternalDataObject.class;
}
Expand Down
Loading

0 comments on commit 458b710

Please sign in to comment.