forked from DSpace/DSpace
-
Notifications
You must be signed in to change notification settings - Fork 63
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request DSpace#9265 from the-library-code/DSpace_duplicate…
…_detection_PR Basic Duplicate Detection in submission and workflow
- Loading branch information
Showing
26 changed files
with
2,368 additions
and
33 deletions.
There are no files selected for viewing
362 changes: 362 additions & 0 deletions
362
dspace-api/src/main/java/org/dspace/content/DuplicateDetectionServiceImpl.java
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
85 changes: 85 additions & 0 deletions
85
dspace-api/src/main/java/org/dspace/content/service/DuplicateDetectionService.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
/** | ||
* The contents of this file are subject to the license and copyright | ||
* detailed in the LICENSE and NOTICE files at the root of the source | ||
* tree and available online at | ||
* | ||
* http://www.dspace.org/license/ | ||
*/ | ||
package org.dspace.content.service; | ||
|
||
import java.sql.SQLException; | ||
import java.util.List; | ||
import java.util.Optional; | ||
|
||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.dspace.authorize.AuthorizeException; | ||
import org.dspace.content.DuplicateDetectionServiceImpl; | ||
import org.dspace.content.Item; | ||
import org.dspace.content.virtual.PotentialDuplicate; | ||
import org.dspace.core.Context; | ||
import org.dspace.discovery.DiscoverResult; | ||
import org.dspace.discovery.IndexableObject; | ||
import org.dspace.discovery.SearchServiceException; | ||
|
||
/** | ||
* Duplicate Detection Service handles get, search and validation operations for duplicate detection. | ||
* @see DuplicateDetectionServiceImpl for implementation details | ||
* | ||
* @author Kim Shepherd | ||
*/ | ||
public interface DuplicateDetectionService { | ||
|
||
/** | ||
* Logger | ||
*/ | ||
Logger log = LogManager.getLogger(DuplicateDetectionService.class); | ||
|
||
/** | ||
* Get a list of PotentialDuplicate objects (wrappers with some metadata included for previewing) that | ||
* are identified as potential duplicates of the given item | ||
* | ||
* @param context DSpace context | ||
* @param item Item to check | ||
* @return List of potential duplicates (empty if none found) | ||
* @throws SearchServiceException if an error occurs performing the discovery search | ||
*/ | ||
List<PotentialDuplicate> getPotentialDuplicates(Context context, Item item) | ||
throws SearchServiceException; | ||
|
||
/** | ||
* Validate an indexable object (returned by discovery search) to ensure it is permissible, readable and valid | ||
* and can be added to a list of results. | ||
* An Optional is returned, if it is empty then it was invalid or did not pass validation. | ||
* | ||
* @param context The DSpace context | ||
* @param indexableObject The discovery search result | ||
* @param original The original item (to compare IDs, submitters, etc) | ||
* @return An Optional potential duplicate | ||
* @throws SQLException | ||
* @throws AuthorizeException | ||
*/ | ||
Optional<PotentialDuplicate> validateDuplicateResult(Context context, IndexableObject indexableObject, | ||
Item original) throws SQLException, AuthorizeException; | ||
|
||
/** | ||
* Search discovery for potential duplicates of a given item. The search uses levenshtein distance (configurable) | ||
* and a single-term "comparison value" constructed out of the item title | ||
* | ||
* @param context DSpace context | ||
* @param item The item to check | ||
* @return DiscoverResult as a result of performing search. Null if invalid. | ||
* | ||
* @throws SearchServiceException if an error was encountered during the discovery search itself. | ||
*/ | ||
DiscoverResult searchDuplicates(Context context, Item item) throws SearchServiceException; | ||
|
||
/** | ||
* Build a comparison value string made up of values of configured fields, used when indexing and querying | ||
* items for deduplication | ||
* @param context DSpace context | ||
* @param item The DSpace item | ||
* @return a constructed, normalised string | ||
*/ | ||
String buildComparisonValue(Context context, Item item); | ||
} |
176 changes: 176 additions & 0 deletions
176
dspace-api/src/main/java/org/dspace/content/virtual/PotentialDuplicate.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
/** | ||
* The contents of this file are subject to the license and copyright | ||
* detailed in the LICENSE and NOTICE files at the root of the source | ||
* tree and available online at | ||
* | ||
* http://www.dspace.org/license/ | ||
*/ | ||
package org.dspace.content.virtual; | ||
|
||
import java.util.LinkedList; | ||
import java.util.List; | ||
import java.util.UUID; | ||
|
||
import org.dspace.content.Item; | ||
import org.dspace.content.MetadataValue; | ||
|
||
/** | ||
* Model of potential duplicate item. Provides as little data as possible, but enough to be useful | ||
* about the context / state of the duplicate, and metadata for preview purposes. | ||
* This class lives in the virtual package because it is not stored, addressable data, it's a stub / preview | ||
* based on an items' search result and metadata. | ||
* | ||
* @author Kim Shepherd | ||
*/ | ||
public class PotentialDuplicate { | ||
/** | ||
* Title of duplicate object | ||
*/ | ||
private String title; | ||
/** | ||
* UUID of duplicate object | ||
*/ | ||
private UUID uuid; | ||
/** | ||
* Owning collection name (title) for duplicate item | ||
*/ | ||
private String owningCollectionName; | ||
/** | ||
* Workspace item ID, if the duplicate is a workspace item | ||
*/ | ||
private Integer workspaceItemId; | ||
/** | ||
* Workflow item ID, if the duplicate is a workflow item | ||
*/ | ||
private Integer workflowItemId; | ||
|
||
/** | ||
* List of configured metadata values copied across from the duplicate item | ||
*/ | ||
private List<MetadataValue> metadataValueList; | ||
|
||
/** | ||
* Default constructor | ||
*/ | ||
public PotentialDuplicate() { | ||
this.metadataValueList = new LinkedList<>(); | ||
} | ||
|
||
/** | ||
* Constructor that accepts an item and sets some values accordingly | ||
* @param item the potential duplicate item | ||
*/ | ||
public PotentialDuplicate(Item item) { | ||
// Throw error if item is null | ||
if (item == null) { | ||
throw new NullPointerException("Null item passed to potential duplicate constructor"); | ||
} | ||
// Instantiate metadata value list | ||
this.metadataValueList = new LinkedList<>(); | ||
// Set title | ||
this.title = item.getName(); | ||
// Set UUID | ||
this.uuid = item.getID(); | ||
// Set owning collection name | ||
if (item.getOwningCollection() != null) { | ||
this.owningCollectionName = item.getOwningCollection().getName(); | ||
} | ||
} | ||
|
||
/** | ||
* Get UUID of duplicate item | ||
* @return UUID of duplicate item | ||
*/ | ||
public UUID getUuid() { | ||
return uuid; | ||
} | ||
|
||
/** | ||
* Set UUID of duplicate item | ||
* @param uuid UUID of duplicate item | ||
*/ | ||
public void setUuid(UUID uuid) { | ||
this.uuid = uuid; | ||
} | ||
|
||
/** | ||
* Get title of duplicate item | ||
* @return title of duplicate item | ||
*/ | ||
public String getTitle() { | ||
return title; | ||
} | ||
|
||
/** | ||
* Set title of duplicate item | ||
* @param title of duplicate item | ||
*/ | ||
public void setTitle(String title) { | ||
this.title = title; | ||
} | ||
|
||
/** | ||
* Get owning collection name (title) of duplicate item | ||
* @return owning collection name (title) of duplicate item | ||
*/ | ||
public String getOwningCollectionName() { | ||
return owningCollectionName; | ||
} | ||
|
||
/** | ||
* Set owning collection name (title) of duplicate item | ||
* @param owningCollectionName owning collection name (title) of duplicate item | ||
*/ | ||
public void setOwningCollectionName(String owningCollectionName) { | ||
this.owningCollectionName = owningCollectionName; | ||
} | ||
|
||
/** | ||
* Get workspace ID for duplicate item, if any | ||
* @return workspace item ID or null | ||
*/ | ||
public Integer getWorkspaceItemId() { | ||
return workspaceItemId; | ||
} | ||
|
||
/** | ||
* Set workspace ID for duplicate item | ||
* @param workspaceItemId workspace item ID | ||
*/ | ||
public void setWorkspaceItemId(Integer workspaceItemId) { | ||
this.workspaceItemId = workspaceItemId; | ||
} | ||
|
||
/** | ||
* Get workflow ID for duplicate item, if anh | ||
* @return workflow item ID or null | ||
*/ | ||
public Integer getWorkflowItemId() { | ||
return workflowItemId; | ||
} | ||
|
||
/** | ||
* Set workflow ID for duplicate item | ||
* @param workflowItemId workspace item ID | ||
*/ | ||
public void setWorkflowItemId(Integer workflowItemId) { | ||
this.workflowItemId = workflowItemId; | ||
} | ||
|
||
/** | ||
* Get metadata (sorted, field->value list) for duplicate item | ||
* @return (sorted, field->value list) for duplicate item | ||
*/ | ||
public List<MetadataValue> getMetadataValueList() { | ||
return metadataValueList; | ||
} | ||
|
||
/** | ||
* Set metadata (sorted, field->value list) for duplicate item | ||
* @param metadataValueList MetadataRest list of values mapped to field keys | ||
*/ | ||
public void setMetadataValueList(List<MetadataValue> metadataValueList) { | ||
this.metadataValueList = metadataValueList; | ||
} | ||
|
||
} |
Oops, something went wrong.