Skip to content

Commit

Permalink
Merge pull request #11003 from GlobalDataverseCommunityConsortium/Har…
Browse files Browse the repository at this point in the history
…vestDatasetUsingPID

Fix lookups of Harvested datasets with lower-case versions of persistent identifiers in the database
  • Loading branch information
landreev authored Nov 8, 2024
2 parents b28812b + b8c0c40 commit 61b8046
Show file tree
Hide file tree
Showing 8 changed files with 34 additions and 10 deletions.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
MDC Citation retrieval with the PID settings has been fixed.
DOI parsing in Dataverse is case insensitive, improving interaction with services that may change the case.
PID parsing in Dataverse is now case insensitive, improving interaction with services that may change the case of PIDs.
Warnings related to managed/excluded PID lists for PID providers have been reduced.
4 changes: 4 additions & 0 deletions doc/sphinx-guides/source/installation/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,10 @@ Dataverse automatically manages assigning PIDs and making them findable when dat
allow updating the PID target URLs and metadata of already-published datasets manually if needed <send-metadata-to-pid-provider>`, e.g. if a Dataverse instance is
moved to a new URL or when the software is updated to generate additional metadata or address schema changes at the PID service.

Note that while some forms of PIDs (Handles, PermaLinks) are technically case sensitive, common practice is to avoid creating PIDs that differ only by case.
Dataverse treats PIDs of all types as case-insensitive (as DOIs are by definition). This means that Dataverse will find datasets (in search, to display dataset pages, etc.)
even when the PIDs entered do not match the case of the original, but will have a problem if two PIDs that differ only by case exist in one instance.

Testing PID Providers
+++++++++++++++++++++

Expand Down
9 changes: 5 additions & 4 deletions src/main/java/edu/harvard/iq/dataverse/DvObject.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,17 @@
@NamedQuery(name = "DvObject.ownedObjectsById",
query="SELECT COUNT(obj) FROM DvObject obj WHERE obj.owner.id=:id"),
@NamedQuery(name = "DvObject.findByGlobalId",
query = "SELECT o FROM DvObject o WHERE o.identifier=:identifier and o.authority=:authority and o.protocol=:protocol and o.dtype=:dtype"),
query = "SELECT o FROM DvObject o WHERE UPPER(o.identifier)=UPPER(:identifier) and o.authority=:authority and o.protocol=:protocol and o.dtype=:dtype"),
@NamedQuery(name = "DvObject.findIdByGlobalId",
query = "SELECT o.id FROM DvObject o WHERE o.identifier=:identifier and o.authority=:authority and o.protocol=:protocol and o.dtype=:dtype"),
query = "SELECT o.id FROM DvObject o WHERE UPPER(o.identifier)=UPPER(:identifier) and o.authority=:authority and o.protocol=:protocol and o.dtype=:dtype"),

@NamedQuery(name = "DvObject.findByAlternativeGlobalId",
query = "SELECT o FROM DvObject o, AlternativePersistentIdentifier a WHERE o.id = a.dvObject.id and a.identifier=:identifier and a.authority=:authority and a.protocol=:protocol and o.dtype=:dtype"),
@NamedQuery(name = "DvObject.findIdByAlternativeGlobalId",
query = "SELECT o.id FROM DvObject o, AlternativePersistentIdentifier a WHERE o.id = a.dvObject.id and a.identifier=:identifier and a.authority=:authority and a.protocol=:protocol and o.dtype=:dtype"),

@NamedQuery(name = "DvObject.findByProtocolIdentifierAuthority",
query = "SELECT o FROM DvObject o WHERE o.identifier=:identifier and o.authority=:authority and o.protocol=:protocol"),
query = "SELECT o FROM DvObject o WHERE UPPER(o.identifier)=UPPER(:identifier) and o.authority=:authority and o.protocol=:protocol"),
@NamedQuery(name = "DvObject.findByOwnerId",
query = "SELECT o FROM DvObject o WHERE o.owner.id=:ownerId order by o.dtype desc, o.id"),
@NamedQuery(name = "DvObject.findByAuthenticatedUserId",
Expand All @@ -53,7 +53,8 @@
@Table(indexes = {@Index(columnList="dtype")
, @Index(columnList="owner_id")
, @Index(columnList="creator_id")
, @Index(columnList="releaseuser_id")},
, @Index(columnList="releaseuser_id")
, @Index(columnList="authority,protocol, UPPER(identifier)", name="INDEX_DVOBJECT_authority_protocol_upper_identifier")},
uniqueConstraints = {@UniqueConstraint(columnNames = {"authority,protocol,identifier"}),@UniqueConstraint(columnNames = {"owner_id,storageidentifier"})})
public abstract class DvObject extends DataverseEntity implements java.io.Serializable {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
// Creating a new dataset from scratch:

harvestedDataset = parser.parseDataset(obj);

harvestedDataset.setHarvestedFrom(harvestingClient);
harvestedDataset.setHarvestIdentifier(harvestIdentifier);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@
* service.
* As of now, it only does the registration updates, to accommodate
* the modifyRegistration datasets API sub-command.
*
* Note that while Handles are nominally case sensitive, handle.net is
* configured to be case-insensitive and Dataverse makes case-insensitive
* database look-ups to find Handles (See #11003). That said, database
* entries are stored in the case matching the configuration of the provider.
*/
public class HandlePidProvider extends AbstractPidProvider {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
* overridable by a configurable parameter to support use of an external
* resolver.
*
* Note that while PermaLinks are nominally case sensitive, Dataverse makes
* case-insensitive database look-ups to find them (See #11003). That said, database
* entries are stored in the case matching the configuration of the provider.
*/
public class PermaLinkPidProvider extends AbstractPidProvider {

Expand Down
4 changes: 4 additions & 0 deletions src/main/resources/db/migration/V6.4.0.1.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
-- Migration related to #11003: adds a case-insensitive index for PID lookups.
--
-- The index covers (authority, protocol, UPPER(identifier)) on the dvobject table.
-- Because it is UNIQUE, two dvobject rows with the same authority and protocol
-- whose identifiers differ only by case cannot coexist once it is in place.
-- IF NOT EXISTS makes this statement a no-op when an index of this name already exists.

CREATE UNIQUE INDEX IF NOT EXISTS INDEX_DVOBJECT_authority_protocol_upper_identifier ON dvobject (authority, protocol, UPPER(identifier));
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
@JvmSetting(key = JvmSettings.PID_PROVIDER_LABEL, value = "FAKE 1", varArgs = "fake1")
@JvmSetting(key = JvmSettings.PID_PROVIDER_TYPE, value = FakeDOIProvider.TYPE, varArgs = "fake1")
@JvmSetting(key = JvmSettings.PID_PROVIDER_AUTHORITY, value = "10.5074", varArgs = "fake1")
@JvmSetting(key = JvmSettings.PID_PROVIDER_SHOULDER, value = "FK", varArgs = "fake1")
@JvmSetting(key = JvmSettings.PID_PROVIDER_SHOULDER, value = "fk", varArgs = "fake1")
@JvmSetting(key = JvmSettings.PID_PROVIDER_MANAGED_LIST, value = "doi:10.5073/FK3ABCDEF", varArgs ="fake1")

//HANDLE 1
Expand Down Expand Up @@ -315,6 +315,13 @@ public void testUnmanagedParsing() throws IOException {
GlobalId pid6 = PidUtil.parseAsGlobalID(pid6String);
assertEquals(pid6String, pid6.asString());
assertEquals(UnmanagedPermaLinkPidProvider.ID, pid6.getProviderId());

//Lowercase test for unmanaged DOIs
String pid7String = "doi:10.5281/zenodo.6381129";
GlobalId pid7 = PidUtil.parseAsGlobalID(pid7String);
assertEquals(UnmanagedDOIProvider.ID, pid5.getProviderId());
assertEquals(pid7String.toUpperCase().replace("DOI", "doi"), pid7.asString());


}

Expand Down Expand Up @@ -353,15 +360,15 @@ public void testExcludedSetParsing() throws IOException {
@Test
public void testManagedSetParsing() throws IOException {

String pid1String = "doi:10.5073/FK3ABCDEF";
String pid1String = "doi:10.5073/fk3ABCDEF";
GlobalId pid2 = PidUtil.parseAsGlobalID(pid1String);
assertEquals(pid1String, pid2.asString());
assertEquals(pid1String.toUpperCase().replace("DOI", "doi"), pid2.asString());
assertEquals("fake1", pid2.getProviderId());
assertEquals("https://doi.org/" + pid2.getAuthority() + PidUtil.getPidProvider(pid2.getProviderId()).getSeparator() + pid2.getIdentifier(),pid2.asURL());
assertEquals("10.5073", pid2.getAuthority());
assertEquals(AbstractDOIProvider.DOI_PROTOCOL, pid2.getProtocol());
GlobalId pid3 = PidUtil.parseAsGlobalID(pid2.asURL());
assertEquals(pid1String, pid3.asString());
assertEquals(pid1String.toUpperCase().replace("DOI", "doi"), pid3.asString());
assertEquals("fake1", pid3.getProviderId());
assertFalse(PidUtil.getPidProvider(pid3.getProviderId()).canCreatePidsLike(pid3));

Expand Down

0 comments on commit 61b8046

Please sign in to comment.