From 70580d02b6b88c19a1a4fd0ba1693692d8d1fad4 Mon Sep 17 00:00:00 2001 From: chemicus Date: Mon, 26 Sep 2022 12:21:37 +0200 Subject: [PATCH 1/2] MCR-2742 New ids found during enrichment are added to the idPool before the mods merge, which can lead to duplicate source requests --- .../mycore/mods/enrichment/MCRIdentifier.java | 16 ++++++++++++++-- .../mycore/mods/merger/MCRIdentifierMerger.java | 9 +++++---- .../mods/merger/MCRIdentifierMergerTest.java | 10 ++++++++++ 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/mycore-mods/src/main/java/org/mycore/mods/enrichment/MCRIdentifier.java b/mycore-mods/src/main/java/org/mycore/mods/enrichment/MCRIdentifier.java index 1354b6c50a..a9502d18a5 100644 --- a/mycore-mods/src/main/java/org/mycore/mods/enrichment/MCRIdentifier.java +++ b/mycore-mods/src/main/java/org/mycore/mods/enrichment/MCRIdentifier.java @@ -18,6 +18,10 @@ package org.mycore.mods.enrichment; +import java.nio.charset.StandardCharsets; +import java.net.URLDecoder; +import java.util.Locale; + import org.jaxen.JaxenException; import org.jdom2.Element; import org.mycore.common.MCRException; @@ -47,14 +51,22 @@ public String getValue() { return value; } + public static String simplifyID(String id) { + return URLDecoder.decode(id.toLowerCase(Locale.ENGLISH),StandardCharsets.UTF_8).replace("-",""); + } + + public String simplifiedID() { + return simplifyID(toString()); + } + @Override public boolean equals(Object other) { - return (other instanceof MCRIdentifier && this.toString().equals(other.toString())); + return (other instanceof MCRIdentifier && this.simplifiedID().equals(((MCRIdentifier)other).simplifiedID())); } @Override public int hashCode() { - return toString().hashCode(); + return simplifiedID().hashCode(); } @Override diff --git a/mycore-mods/src/main/java/org/mycore/mods/merger/MCRIdentifierMerger.java b/mycore-mods/src/main/java/org/mycore/mods/merger/MCRIdentifierMerger.java index 31d72ac803..87e564a923 100644 --- a/mycore-mods/src/main/java/org/mycore/mods/merger/MCRIdentifierMerger.java +++ b/mycore-mods/src/main/java/org/mycore/mods/merger/MCRIdentifierMerger.java @@ -20,9 +20,9 @@ import java.nio.charset.StandardCharsets; import java.net.URLDecoder; -import java.util.Locale; import org.jdom2.Element; +import org.mycore.mods.enrichment.MCRIdentifier; /** * Compares and merges mods:identifier elements. @@ -43,8 +43,7 @@ private String getType() { } private String getSimplifiedID() { - return URLDecoder.decode(this.element.getTextNormalize().toLowerCase(Locale.ENGLISH),StandardCharsets.UTF_8) - .replace("-",""); + return MCRIdentifier.simplifyID(this.element.getTextNormalize()); } @Override @@ -60,7 +59,9 @@ public boolean isProbablySameAs(MCRMerger other) { @Override public void mergeFrom(MCRMerger other) { - if (!this.element.getText().contains("-") && other.element.getText().contains("-")) { + if ((!this.element.getText().contains("-") && other.element.getText().contains("-")) || + (!URLDecoder.decode(this.element.getText(),StandardCharsets.UTF_8).equals(this.element.getText()) + && URLDecoder.decode(other.element.getText(),StandardCharsets.UTF_8).equals(other.element.getText()))) { this.element.setText(other.element.getText()); } } diff --git a/mycore-mods/src/test/java/org/mycore/mods/merger/MCRIdentifierMergerTest.java b/mycore-mods/src/test/java/org/mycore/mods/merger/MCRIdentifierMergerTest.java index 469d4d2233..9d220dc34b 100644 --- a/mycore-mods/src/test/java/org/mycore/mods/merger/MCRIdentifierMergerTest.java +++ b/mycore-mods/src/test/java/org/mycore/mods/merger/MCRIdentifierMergerTest.java @@ -36,6 +36,15 @@ public void testMergeSame() throws Exception { String a = "[mods:identifier[@type='issn']='12345678']"; String b = "[mods:identifier[@type='issn']='1234-5678']"; MCRMergerTest.test(a, b, b); + MCRMergerTest.test(b, a, b); + } + + @Test + public void testMergeURLEncoded() throws Exception { + String a = "[mods:identifier[@type='doi']='10.1002/%28issn%291521-3765']"; + String b = "[mods:identifier[@type='doi']='10.1002/(issn)1521-3765']"; + MCRMergerTest.test(a, b, b); + MCRMergerTest.test(b, a, b); } @Test @@ -51,5 +60,6 @@ public void testCaseInsensitiveDOIs() throws Exception { String a = "[mods:identifier[@type='doi']='10.1530/EJE-21-1086']"; String b = "[mods:identifier[@type='doi']='10.1530/eje-21-1086']"; MCRMergerTest.test(a, b, a); + MCRMergerTest.test(b, a, b); } } From a21d0c4bb7fbb56e716c2f698c0ae213532e871c Mon Sep 17 00:00:00 2001 From: chemicus Date: Tue, 11 Oct 2022 11:53:37 +0200 Subject: [PATCH 2/2] MCR-2742 Fix JUnit test --- .../testBasicEnrichment-debug.xml | 18 +++++++++--------- .../testBasicEnrichment-result.xml | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mycore-mods/src/test/resources/MCREnrichmentTest/testBasicEnrichment-debug.xml b/mycore-mods/src/test/resources/MCREnrichmentTest/testBasicEnrichment-debug.xml index 6f52245ebe..9c37c799e9 100644 --- a/mycore-mods/src/test/resources/MCREnrichmentTest/testBasicEnrichment-debug.xml +++ b/mycore-mods/src/test/resources/MCREnrichmentTest/testBasicEnrichment-debug.xml @@ -67,14 +67,14 @@ - 1234-5678 - 1234-6789 1000-9999 + 1234-6789 + 1234-5678 - from data source C for ISSN 1234-5678 - 1234-5678 + from data source C for ISSN 1234-6789 + 1234-6789 @@ -84,13 +84,13 @@ host from data source B 1234-5678 1234-6789 - from data source C for ISSN 1234-5678 + from data source C for ISSN 1234-6789 - from data source C for ISSN 1234-6789 - 1234-6789 + from data source C for ISSN 1234-5678 + 1234-5678 @@ -100,9 +100,9 @@ host from data source B 1234-5678 1234-6789 - from data source C for ISSN 1234-5678 from data source C for ISSN 1234-6789 + from data source C for ISSN 1234-5678 - \ No newline at end of file + diff --git a/mycore-mods/src/test/resources/MCREnrichmentTest/testBasicEnrichment-result.xml b/mycore-mods/src/test/resources/MCREnrichmentTest/testBasicEnrichment-result.xml index 1035df2563..399ced4d46 100644 --- a/mycore-mods/src/test/resources/MCREnrichmentTest/testBasicEnrichment-result.xml +++ b/mycore-mods/src/test/resources/MCREnrichmentTest/testBasicEnrichment-result.xml @@ -4,8 +4,8 @@ host from data source A host from data source B - from data source C for ISSN 1234-5678 from data source C for ISSN 1234-6789 + from data source C for ISSN 1234-5678 1000-9999 1234-5678 1234-6789