Skip to content

Commit

Permalink
Merge pull request DSpace#9265 from the-library-code/DSpace_duplicate…
Browse files Browse the repository at this point in the history
…_detection_PR

Basic Duplicate Detection in submission and workflow
  • Loading branch information
tdonohue authored Mar 4, 2024
2 parents 5b76b17 + 9a5427e commit e90ab9e
Show file tree
Hide file tree
Showing 26 changed files with 2,368 additions and 33 deletions.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import org.dspace.content.service.CommunityService;
import org.dspace.content.service.DSpaceObjectLegacySupportService;
import org.dspace.content.service.DSpaceObjectService;
import org.dspace.content.service.DuplicateDetectionService;
import org.dspace.content.service.EntityService;
import org.dspace.content.service.EntityTypeService;
import org.dspace.content.service.InProgressSubmissionService;
Expand Down Expand Up @@ -113,6 +114,13 @@ public InProgressSubmissionService getInProgressSubmissionService(InProgressSubm
}
}

/**
* Return the implementation of the DuplicateDetectionService interface.
* This method is abstract, so each concrete factory subclass decides which instance to supply.
*
* @return the DuplicateDetectionService provided by the concrete factory
*/
public abstract DuplicateDetectionService getDuplicateDetectionService();

/**
 * Look up the DSpaceObjectService for the given object by delegating to the
 * type-based lookup, using the value reported by {@code dso.getType()}.
 *
 * @param dso the DSpace object whose service is wanted; it is dereferenced, so it must not be null
 * @param <T> the concrete DSpaceObject type
 * @return whatever the type-based {@code getDSpaceObjectService} lookup returns for that type
 */
public <T extends DSpaceObject> DSpaceObjectService<T> getDSpaceObjectService(T dso) {
    final int objectType = dso.getType();
    return getDSpaceObjectService(objectType);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import org.dspace.content.service.CommunityService;
import org.dspace.content.service.DSpaceObjectLegacySupportService;
import org.dspace.content.service.DSpaceObjectService;
import org.dspace.content.service.DuplicateDetectionService;
import org.dspace.content.service.EntityService;
import org.dspace.content.service.EntityTypeService;
import org.dspace.content.service.InstallItemService;
Expand Down Expand Up @@ -81,6 +82,8 @@ public class ContentServiceFactoryImpl extends ContentServiceFactory {
private EntityTypeService entityTypeService;
// EntityService dependency, injected via @Autowired; marked required, so the container must supply one
@Autowired(required = true)
private EntityService entityService;
// DuplicateDetectionService dependency, injected via @Autowired (required);
// exposed to callers through getDuplicateDetectionService()
@Autowired(required = true)
private DuplicateDetectionService duplicateDetectionService;

@Override
public List<DSpaceObjectService<? extends DSpaceObject>> getDSpaceObjectServices() {
Expand Down Expand Up @@ -181,4 +184,9 @@ public EntityService getEntityService() {
public RelationshipMetadataService getRelationshipMetadataService() {
    // Plain accessor: hand back the instance stored on this factory
    return this.relationshipMetadataService;
}

/**
 * Return the DuplicateDetectionService instance held by this factory.
 *
 * @return the autowired DuplicateDetectionService
 */
@Override
public DuplicateDetectionService getDuplicateDetectionService() {
    // Plain accessor for the autowired field
    return this.duplicateDetectionService;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.content.service;

import java.sql.SQLException;
import java.util.List;
import java.util.Optional;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.DuplicateDetectionServiceImpl;
import org.dspace.content.Item;
import org.dspace.content.virtual.PotentialDuplicate;
import org.dspace.core.Context;
import org.dspace.discovery.DiscoverResult;
import org.dspace.discovery.IndexableObject;
import org.dspace.discovery.SearchServiceException;

/**
* Duplicate Detection Service handles get, search and validation operations for duplicate detection.
* It declares four operations: fetching preview wrappers for potential duplicates of an item, validating a
* single discovery search result, running the discovery search itself, and building the comparison string
* used for indexing and querying.
* @see DuplicateDetectionServiceImpl for implementation details
*
* @author Kim Shepherd
*/
public interface DuplicateDetectionService {

/**
* Shared logger. As an interface field it is implicitly public, static and final, so it is visible
* to every implementing class under the name {@code log}.
*/
Logger log = LogManager.getLogger(DuplicateDetectionService.class);

/**
* Get a list of PotentialDuplicate objects (wrappers with some metadata included for previewing) that
* are identified as potential duplicates of the given item.
*
* @param context DSpace context
* @param item Item to check
* @return List of potential duplicates (empty if none found)
* @throws SearchServiceException if an error occurs performing the discovery search
*/
List<PotentialDuplicate> getPotentialDuplicates(Context context, Item item)
throws SearchServiceException;

/**
* Validate an indexable object (returned by discovery search) to ensure it is permissible, readable and valid
* and can be added to a list of results.
* An Optional is returned; if it is empty then the result was invalid or did not pass validation.
*
* @param context The DSpace context
* @param indexableObject The discovery search result
* @param original The original item (to compare IDs, submitters, etc)
* @return An Optional holding the potential duplicate, or an empty Optional if the result was rejected
* @throws SQLException if a database error occurs during validation
* @throws AuthorizeException if an authorisation failure is raised during validation
* (the exact conditions are defined by the implementation)
*/
Optional<PotentialDuplicate> validateDuplicateResult(Context context, IndexableObject indexableObject,
Item original) throws SQLException, AuthorizeException;

/**
* Search discovery for potential duplicates of a given item. The search uses levenshtein distance (configurable)
* and a single-term "comparison value" constructed out of the item title.
*
* @param context DSpace context
* @param item The item to check
* @return DiscoverResult as a result of performing search. May be null (the "invalid" case), so callers
* should check for null before using the result.
*
* @throws SearchServiceException if an error was encountered during the discovery search itself.
*/
DiscoverResult searchDuplicates(Context context, Item item) throws SearchServiceException;

/**
* Build a comparison value string made up of values of configured fields, used when indexing and querying
* items for deduplication.
*
* @param context DSpace context
* @param item The DSpace item
* @return a constructed, normalised string
*/
String buildComparisonValue(Context context, Item item);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.content.virtual;

import java.util.LinkedList;
import java.util.List;
import java.util.UUID;

import org.dspace.content.Item;
import org.dspace.content.MetadataValue;

/**
 * Model of a potential duplicate item. Provides as little data as possible, but enough to be useful
 * about the context / state of the duplicate, and metadata for preview purposes.
 * This class lives in the virtual package because it is not stored, addressable data; it is a stub / preview
 * based on an item's search result and metadata.
 *
 * The metadata value list is never null: it starts out empty and the setter replaces a null argument
 * with a fresh empty list.
 *
 * @author Kim Shepherd
 */
public class PotentialDuplicate {
    /**
     * Title of duplicate object
     */
    private String title;
    /**
     * UUID of duplicate object
     */
    private UUID uuid;
    /**
     * Owning collection name (title) for duplicate item
     */
    private String owningCollectionName;
    /**
     * Workspace item ID, if the duplicate is a workspace item
     */
    private Integer workspaceItemId;
    /**
     * Workflow item ID, if the duplicate is a workflow item
     */
    private Integer workflowItemId;

    /**
     * List of configured metadata values copied across from the duplicate item.
     * Initialised here once so that every constructor starts with an empty, non-null list.
     */
    private List<MetadataValue> metadataValueList = new LinkedList<>();

    /**
     * Default constructor. All scalar fields are left null; the metadata value list is empty.
     */
    public PotentialDuplicate() {
    }

    /**
     * Constructor that accepts an item and copies its name, ID and (if present) owning collection name.
     *
     * @param item the potential duplicate item
     * @throws NullPointerException if item is null
     */
    public PotentialDuplicate(Item item) {
        // Fail fast with a descriptive message rather than on the first dereference below
        if (item == null) {
            throw new NullPointerException("Null item passed to potential duplicate constructor");
        }
        // Set title
        this.title = item.getName();
        // Set UUID
        this.uuid = item.getID();
        // Set owning collection name; left null when the item has no owning collection
        if (item.getOwningCollection() != null) {
            this.owningCollectionName = item.getOwningCollection().getName();
        }
    }

    /**
     * Get UUID of duplicate item
     * @return UUID of duplicate item
     */
    public UUID getUuid() {
        return uuid;
    }

    /**
     * Set UUID of duplicate item
     * @param uuid UUID of duplicate item
     */
    public void setUuid(UUID uuid) {
        this.uuid = uuid;
    }

    /**
     * Get title of duplicate item
     * @return title of duplicate item
     */
    public String getTitle() {
        return title;
    }

    /**
     * Set title of duplicate item
     * @param title title of duplicate item
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * Get owning collection name (title) of duplicate item
     * @return owning collection name (title) of duplicate item, or null if none was set
     */
    public String getOwningCollectionName() {
        return owningCollectionName;
    }

    /**
     * Set owning collection name (title) of duplicate item
     * @param owningCollectionName owning collection name (title) of duplicate item
     */
    public void setOwningCollectionName(String owningCollectionName) {
        this.owningCollectionName = owningCollectionName;
    }

    /**
     * Get workspace ID for duplicate item, if any
     * @return workspace item ID or null
     */
    public Integer getWorkspaceItemId() {
        return workspaceItemId;
    }

    /**
     * Set workspace ID for duplicate item
     * @param workspaceItemId workspace item ID
     */
    public void setWorkspaceItemId(Integer workspaceItemId) {
        this.workspaceItemId = workspaceItemId;
    }

    /**
     * Get workflow ID for duplicate item, if any
     * @return workflow item ID or null
     */
    public Integer getWorkflowItemId() {
        return workflowItemId;
    }

    /**
     * Set workflow ID for duplicate item
     * @param workflowItemId workflow item ID
     */
    public void setWorkflowItemId(Integer workflowItemId) {
        this.workflowItemId = workflowItemId;
    }

    /**
     * Get the metadata values held for the duplicate item
     * @return the list of metadata values; never null, possibly empty
     */
    public List<MetadataValue> getMetadataValueList() {
        return metadataValueList;
    }

    /**
     * Set the metadata values held for the duplicate item. The list is stored as given (not copied).
     * @param metadataValueList list of metadata values; a null argument is stored as a new empty list
     */
    public void setMetadataValueList(List<MetadataValue> metadataValueList) {
        // Preserve the non-null invariant established at construction so that callers
        // iterating the getter's result never hit a NullPointerException
        if (metadataValueList == null) {
            this.metadataValueList = new LinkedList<>();
        } else {
            this.metadataValueList = metadataValueList;
        }
    }

}
Loading

0 comments on commit e90ab9e

Please sign in to comment.