diff --git a/.github/workflows/deploy_beta_testing.yml b/.github/workflows/deploy_beta_testing.yml index 028f0140cc9..efe3e0d8621 100644 --- a/.github/workflows/deploy_beta_testing.yml +++ b/.github/workflows/deploy_beta_testing.yml @@ -45,7 +45,7 @@ jobs: - uses: actions/checkout@v3 - name: Download war artifact - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.7 with: name: built-app path: ./ diff --git a/.github/workflows/maven_unit_test.yml b/.github/workflows/maven_unit_test.yml index a94b17a67ba..102fb1d5882 100644 --- a/.github/workflows/maven_unit_test.yml +++ b/.github/workflows/maven_unit_test.yml @@ -107,7 +107,7 @@ jobs: cache: maven # Get the build output from the unit test job - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4.1.7 with: name: java-artifacts - run: | @@ -140,7 +140,7 @@ jobs: cache: maven # Get the build output from the integration test job - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4.1.7 with: name: java-reportdir - run: tar -xvf java-reportdir.tar diff --git a/doc/release-notes/10708 - MDC Citation and DOI parsing improvements.md b/doc/release-notes/10708 - MDC Citation and DOI parsing improvements.md new file mode 100644 index 00000000000..1dcd293df77 --- /dev/null +++ b/doc/release-notes/10708 - MDC Citation and DOI parsing improvements.md @@ -0,0 +1,3 @@ +MDC Citation retrieval with the PID settings has been fixed. +DOI parsing in Dataverse is case insensitive, improving interaction with services that may change the case. +Warnings related to managed/excluded PID lists for PID providers have been reduced diff --git a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java index 1f2f1039327..306b863c9e4 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/MakeDataCountApi.java @@ -19,6 +19,9 @@ import java.io.IOException; import java.io.InputStream; import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.util.Iterator; import java.util.List; @@ -152,10 +155,17 @@ public Response updateCitationsForDataset(@PathParam("id") String id) throws IOE // DataCite wants "doi=", not "doi:". String authorityPlusIdentifier = persistentId.replaceFirst("doi:", ""); // Request max page size and then loop to handle multiple pages - URL url = new URL(JvmSettings.DATACITE_REST_API_URL.lookup() + + URL url = null; + try { + url = new URI(JvmSettings.DATACITE_REST_API_URL.lookup(pidProvider.getId()) + "/events?doi=" + authorityPlusIdentifier + - "&source=crossref&page[size]=1000"); + "&source=crossref&page[size]=1000").toURL(); + } catch (URISyntaxException e) { + //Nominally this means a config error/ bad DATACITE_REST_API_URL for this provider + logger.warning("Unable to create URL for " + persistentId + ", pidProvider " + pidProvider.getId()); + return error(Status.INTERNAL_SERVER_ERROR, "Unable to create DataCite URL to retrieve citations."); + } logger.fine("Retrieving Citations from " + url.toString()); boolean nextPage = true; JsonArrayBuilder dataBuilder = Json.createArrayBuilder(); @@ -178,7 +188,12 @@ public Response updateCitationsForDataset(@PathParam("id") String id) throws IOE dataBuilder.add(iter.next()); } if (links.containsKey("next")) { - url = new URL(links.getString("next")); + try { + url = new URI(links.getString("next")).toURL(); + } catch (URISyntaxException e) { + logger.warning("Unable to create URL from DataCite response: " + links.getString("next")); + return error(Status.INTERNAL_SERVER_ERROR, "Unable to retrieve all results from DataCite"); + } } else { nextPage = false; } @@ -187,7 +202,7 @@ public Response updateCitationsForDataset(@PathParam("id") String id) throws IOE JsonArray allData = dataBuilder.build(); List datasetExternalCitations = datasetExternalCitationsService.parseCitations(allData); /* - * ToDo: If this is the only source of citations, we should remove all the existing ones for the dataset and repopuate them. + * ToDo: If this is the only source of citations, we should remove all the existing ones for the dataset and repopulate them. * As is, this call doesn't remove old citations if there are now none (legacy issue if we decide to stop counting certain types of citation * as we've done for 'hasPart'). * If there are some, this call individually checks each one and if a matching item exists, it removes it and adds it back. Faster and better to delete all and diff --git a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java index 50c24274bb2..fa56432cc3c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/makedatacount/DatasetExternalCitationsServiceBean.java @@ -7,6 +7,9 @@ import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetServiceBean; +import edu.harvard.iq.dataverse.GlobalId; +import edu.harvard.iq.dataverse.pidproviders.PidUtil; + import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -40,7 +43,8 @@ public class DatasetExternalCitationsServiceBean implements java.io.Serializable Arrays.asList( "cites", "references", - "supplements")); + "supplements", + "is-supplement-to")); static ArrayList outboundRelationships = new ArrayList( Arrays.asList( "is-cited-by", @@ -59,12 +63,11 @@ public List parseCitations(JsonArray citations) { if (inboundRelationships.contains(relationship)) { Dataset localDs = null; if (objectUri.contains("doi")) { - String globalId = objectUri.replace("https://", "").replace("doi.org/", "doi:").toUpperCase().replace("DOI:", "doi:"); - localDs = datasetService.findByGlobalId(globalId); + localDs = datasetService.findByGlobalId(objectUri); exCit.setDataset(localDs); } exCit.setCitedByUrl(subjectUri); - + if (localDs != null && !exCit.getCitedByUrl().isEmpty()) { datasetExternalCitations.add(exCit); } @@ -72,9 +75,9 @@ public List parseCitations(JsonArray citations) { if (outboundRelationships.contains(relationship)) { Dataset localDs = null; if (subjectUri.contains("doi")) { - String globalId = subjectUri.replace("https://", "").replace("doi.org/", "doi:").toUpperCase().replace("DOI:", "doi:"); - localDs = datasetService.findByGlobalId(globalId); + localDs = datasetService.findByGlobalId(subjectUri); exCit.setDataset(localDs); + } exCit.setCitedByUrl(objectUri); diff --git a/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java b/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java index f6d142aac96..250eae7e5fc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java +++ b/src/main/java/edu/harvard/iq/dataverse/pidproviders/AbstractPidProvider.java @@ -36,9 +36,9 @@ public abstract class AbstractPidProvider implements PidProvider { private String datafilePidFormat = null; - private HashSet managedSet; + protected HashSet managedSet = new HashSet(); - private HashSet excludedSet; + protected HashSet excludedSet = new HashSet(); private String id; private String label; @@ -47,8 +47,6 @@ protected AbstractPidProvider(String id, String label, String protocol) { this.id = id; this.label = label; this.protocol = protocol; - this.managedSet = new HashSet(); - this.excludedSet = new HashSet(); } protected AbstractPidProvider(String id, String label, String protocol, String authority, String shoulder, @@ -60,8 +58,12 @@ protected AbstractPidProvider(String id, String label, String protocol, String a this.shoulder = shoulder; this.identifierGenerationStyle = identifierGenerationStyle; this.datafilePidFormat = datafilePidFormat; - this.managedSet = new HashSet(Arrays.asList(managedList.split(",\\s"))); - this.excludedSet = new HashSet(Arrays.asList(excludedList.split(",\\s"))); + if(!managedList.isEmpty()) { + this.managedSet.addAll(Arrays.asList(managedList.split(",\\s"))); + } + if(!excludedList.isEmpty()) { + this.excludedSet.addAll(Arrays.asList(excludedList.split(",\\s"))); + } if (logger.isLoggable(Level.FINE)) { Iterator iter = managedSet.iterator(); while (iter.hasNext()) { @@ -313,10 +315,17 @@ protected GlobalId parsePersistentId(String protocol, String identifierString) { } public GlobalId parsePersistentId(String protocol, String authority, String identifier) { + return parsePersistentId(protocol, authority, identifier, false); + } + + public GlobalId parsePersistentId(String protocol, String authority, String identifier, boolean isCaseInsensitive) { logger.fine("Parsing: " + protocol + ":" + authority + getSeparator() + identifier + " in " + getId()); if (!PidProvider.isValidGlobalId(protocol, authority, identifier)) { return null; } + if(isCaseInsensitive) { + identifier = identifier.toUpperCase(); + } // Check authority/identifier if this is a provider that manages specific // identifiers // /is not one of the unmanaged providers that has null authority @@ -333,7 +342,7 @@ public GlobalId parsePersistentId(String protocol, String authority, String iden logger.fine("managed in " + getId() + ": " + getManagedSet().contains(cleanIdentifier)); logger.fine("excluded from " + getId() + ": " + getExcludedSet().contains(cleanIdentifier)); - if (!(((authority.equals(getAuthority()) && identifier.startsWith(getShoulder())) + if (!(((authority.equals(getAuthority()) && identifier.startsWith(getShoulder().toUpperCase())) || getManagedSet().contains(cleanIdentifier)) && !getExcludedSet().contains(cleanIdentifier))) { return null; } diff --git a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/AbstractDOIProvider.java b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/AbstractDOIProvider.java index 02a7dedce47..70ce1ec4c14 100644 --- a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/AbstractDOIProvider.java +++ b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/AbstractDOIProvider.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse.pidproviders.doi; import java.util.Arrays; +import java.util.HashSet; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; @@ -26,9 +27,24 @@ public abstract class AbstractDOIProvider extends AbstractPidProvider { public AbstractDOIProvider(String id, String label, String providerAuthority, String providerShoulder, String identifierGenerationStyle, String datafilePidFormat, String managedList, String excludedList) { super(id, label, DOI_PROTOCOL, providerAuthority, providerShoulder, identifierGenerationStyle, datafilePidFormat, managedList, excludedList); + //Create case insensitive (converted toUpperCase) managedSet and excludedSet + managedSet = clean(managedSet, "managed"); + excludedSet = clean(excludedSet, "excluded"); } - //For Unmanged provider + private HashSet clean(HashSet originalSet, String setName) { + HashSet cleanSet = new HashSet(); + for(String entry: originalSet) { + if(entry.startsWith(DOI_PROTOCOL)) { + cleanSet.add(DOI_PROTOCOL + entry.substring(DOI_PROTOCOL.length()).toUpperCase()); + } else { + logger.warning("Non-DOI found in " + setName + " set of pidProvider id: " + getId() + ": " + entry + ". Entry is being dropped."); + } + } + return cleanSet; + } + + //For Unmanaged provider public AbstractDOIProvider(String name, String label) { super(name, label, DOI_PROTOCOL); } @@ -67,7 +83,7 @@ public GlobalId parsePersistentId(String protocol, String authority, String iden if (!DOI_PROTOCOL.equals(protocol)) { return null; } - return super.parsePersistentId(protocol, authority, identifier); + return super.parsePersistentId(protocol, authority, identifier, true); } public String getUrlPrefix() { diff --git a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java index 5630844fb32..b07cd027a01 100644 --- a/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java +++ b/src/main/java/edu/harvard/iq/dataverse/pidproviders/doi/datacite/DataCiteDOIProvider.java @@ -226,8 +226,7 @@ protected String getProviderKeyName() { @Override public String getProviderType() { - // TODO Auto-generated method stub - return null; + return TYPE; } public String getMdsUrl() { diff --git a/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java index 58d69da743b..ecf18e6b1ca 100644 --- a/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/pidproviders/PidUtilTest.java @@ -250,9 +250,12 @@ public void testDOIParsing() throws IOException { assertEquals(pid1String, pid3.asString()); assertEquals("dc1", pid3.getProviderId()); - String pid4String = "doi:10.5072/FK3ABCDEF"; + //Also test case insensitive + String pid4String = "doi:10.5072/fk3ABCDEF"; GlobalId pid4 = PidUtil.parseAsGlobalID(pid4String); - assertEquals(pid4String, pid4.asString()); + // Lower case is recognized by converting to upper case internally, so we need to test vs. the upper case identifier + // I.e. we are verifying that the lower case string is parsed the same as the upper case string, both give an internal upper case PID representation + assertEquals("doi:10.5072/FK3ABCDEF", pid4.asString()); assertEquals("dc2", pid4.getProviderId()); String pid5String = "doi:10.5072/FK2ABCDEF";