From 71bb2aa2fbfd97c813f62568e749622928454a7c Mon Sep 17 00:00:00 2001 From: Martin Ledvinka Date: Fri, 29 Nov 2024 17:46:45 +0100 Subject: [PATCH] [kbss-cvut/termit-ui#581] Implement importing of term translations from Excel. --- .../service/importer/excel/ExcelImporter.java | 223 +++++++++++++----- .../excel/LocalizedSheetImporter.java | 2 +- .../importer/excel/ExcelImporterTest.java | 132 +++++++++-- 3 files changed, 280 insertions(+), 77 deletions(-) diff --git a/src/main/java/cz/cvut/kbss/termit/service/importer/excel/ExcelImporter.java b/src/main/java/cz/cvut/kbss/termit/service/importer/excel/ExcelImporter.java index a86d0a6c6..f55e2ef75 100644 --- a/src/main/java/cz/cvut/kbss/termit/service/importer/excel/ExcelImporter.java +++ b/src/main/java/cz/cvut/kbss/termit/service/importer/excel/ExcelImporter.java @@ -1,6 +1,7 @@ package cz.cvut.kbss.termit.service.importer.excel; import cz.cvut.kbss.jopa.model.EntityManager; +import cz.cvut.kbss.jopa.model.MultilingualString; import cz.cvut.kbss.termit.exception.NotFoundException; import cz.cvut.kbss.termit.exception.importing.VocabularyDoesNotExistException; import cz.cvut.kbss.termit.exception.importing.VocabularyImportException; @@ -21,6 +22,7 @@ import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.config.ConfigurableBeanFactory; @@ -119,43 +121,8 @@ public Vocabulary importVocabulary(@Nonnull ImportConfiguration config, @Nonnull terms = sheetImporter.resolveTermsFromSheet(sheet); rawDataToInsert.addAll(sheetImporter.getRawDataToInsert()); } - terms.stream().peek(t -> t.setUri(resolveTermIdentifier(targetVocabulary, t))) - .peek(t -> t.getLabel().getValue().forEach((lang, value) -> { - final Optional existingUri = termService.findIdentifierByLabel(value, - targetVocabulary, - lang); - if (existingUri.isPresent() && !existingUri.get().equals(t.getUri())) { - throw new VocabularyImportException( - "Vocabulary already contains a term with label '" + value + "' with a different identifier than the imported one.", - "error.vocabulary.import.excel.labelWithDifferentIdentifierExists") - .addParameter("label", value) - .addParameter("existingUri", Utils.uriToString(existingUri.get())); - } - })) - .filter(t -> termService.exists(t.getUri())).forEach(t -> { - LOG.trace("Term {} already exists. Removing old version.", t); - termService.forceRemove(termService.findRequired(t.getUri())); - // Flush changes to prevent EntityExistsExceptions when term is already managed in PC as different type (Term vs TermInfo) - em.flush(); - }); - // Ensure all parents are saved before we start adding children - terms.stream().filter(t -> Utils.emptyIfNull(t.getParentTerms()).isEmpty()) - .forEach(root -> { - LOG.trace("Persisting root term {}.", root); - termService.addRootTermToVocabulary(root, targetVocabulary); - root.setVocabulary(targetVocabulary.getUri()); - }); - terms.stream().filter(t -> !Utils.emptyIfNull(t.getParentTerms()).isEmpty()) - .forEach(t -> { - t.setVocabulary(targetVocabulary.getUri()); - LOG.trace("Persisting child term {}.", t); - termService.addChildTerm(t, t.getParentTerms().iterator().next()); - }); - // Insert term relationships as raw data because of possible object conflicts in the persistence context - - // the same term being as multiple types (Term, TermInfo) in the same persistence context - dataDao.insertRawData(rawDataToInsert.stream().map(tr -> new Quad(tr.subject().getUri(), tr.property(), - tr.object().getUri(), - targetVocabulary.getUri())).toList()); + prepareTermsForPersist(terms, targetVocabulary); + persistNewTerms(terms, targetVocabulary, rawDataToInsert); } } catch (IOException e) { throw new VocabularyImportException("Unable to read input as Excel.", e); @@ -174,30 +141,17 @@ private PrefixMap resolvePrefixMap(Workbook excel) { } /** - * Resolves namespace for identifiers of terms in the specified vocabulary. - *

- * It uses the vocabulary identifier and the configured term namespace separator. - * - * @param vocabulary Vocabulary whose term identifier namespace to resolve - * @return Resolved namespace - */ - private String resolveVocabularyTermNamespace(Vocabulary vocabulary) { - return idResolver.buildNamespace(vocabulary.getUri().toString(), - config.getNamespace().getTerm().getSeparator()); - } - - /** - * Resolves term identifier. + * Resolves term identifier w.r.t. the target vocabulary. *

* If the term does not have an identifier, it is generated so that existing instance can be removed before * inserting the imported term. If the term has an identifier, but it does not match the expected vocabulary-based * namespace, it is adjusted so that it does. Otherwise, the identifier is used. * - * @param vocabulary Vocabulary into which the term will be added * @param term The imported term + * @param vocabulary Vocabulary into which the term will be added * @return Term identifier */ - private URI resolveTermIdentifier(Vocabulary vocabulary, Term term) { + private URI resolveTermIdentifierWrtVocabulary(Term term, Vocabulary vocabulary) { final String termNamespace = resolveVocabularyTermNamespace(vocabulary); if (term.getUri() == null) { return idResolver.generateDerivedIdentifier(vocabulary.getUri(), @@ -215,10 +169,169 @@ private URI resolveTermIdentifier(Vocabulary vocabulary, Term term) { return term.getUri(); } + /** + * Resolves namespace for identifiers of terms in the specified vocabulary. + *

+ * It uses the vocabulary identifier and the configured term namespace separator. + * + * @param vocabulary Vocabulary whose term identifier namespace to resolve + * @return Resolved namespace + */ + private String resolveVocabularyTermNamespace(Vocabulary vocabulary) { + return idResolver.buildNamespace(vocabulary.getUri().toString(), + config.getNamespace().getTerm().getSeparator()); + } + + /** + * Prepares terms for persist by: + *

+ * + * @param terms Terms to process + * @param targetVocabulary Target vocabulary + */ + private void prepareTermsForPersist(List terms, Vocabulary targetVocabulary) { + terms.stream().peek(t -> t.setUri(resolveTermIdentifierWrtVocabulary(t, targetVocabulary))) + .peek(t -> t.getLabel().getValue().forEach((lang, value) -> { + final Optional existingUri = termService.findIdentifierByLabel(value, + targetVocabulary, + lang); + if (existingUri.isPresent() && !existingUri.get().equals(t.getUri())) { + throw new VocabularyImportException( + "Vocabulary already contains a term with label '" + value + "' with a different identifier than the imported one.", + "error.vocabulary.import.excel.labelWithDifferentIdentifierExists") + .addParameter("label", value) + .addParameter("existingUri", Utils.uriToString(existingUri.get())); + } + })) + .filter(t -> termService.exists(t.getUri())).forEach(t -> { + LOG.trace("Term {} already exists. Removing old version.", t); + termService.forceRemove(termService.findRequired(t.getUri())); + // Flush changes to prevent EntityExistsExceptions when term is already managed in PC as different type (Term vs TermInfo) + em.flush(); + }); + } + + private void persistNewTerms(List terms, Vocabulary targetVocabulary, Set rawDataToInsert) { + // Ensure all parents are saved before we start adding children + terms.stream().filter(t -> Utils.emptyIfNull(t.getParentTerms()).isEmpty()) + .forEach(root -> { + LOG.trace("Persisting root term {}.", root); + termService.addRootTermToVocabulary(root, targetVocabulary); + root.setVocabulary(targetVocabulary.getUri()); + }); + terms.stream().filter(t -> !Utils.emptyIfNull(t.getParentTerms()).isEmpty()) + .forEach(t -> { + t.setVocabulary(targetVocabulary.getUri()); + LOG.trace("Persisting child term {}.", t); + termService.addChildTerm(t, t.getParentTerms().iterator().next()); + }); + // Insert term relationships as raw data because of possible object conflicts in the persistence context - + // the same term being as multiple types (Term, TermInfo) in the same persistence context + dataDao.insertRawData(rawDataToInsert.stream().map(tr -> new Quad(tr.subject().getUri(), tr.property(), + tr.object().getUri(), + targetVocabulary.getUri())).toList()); + } + @Override public Vocabulary importTermTranslations(@Nonnull URI vocabularyIri, @Nonnull ImportInput data) { - // TODO - return null; + Objects.requireNonNull(vocabularyIri); + Objects.requireNonNull(data); + final Vocabulary targetVocabulary = vocabularyDao.find(vocabularyIri).orElseThrow( + () -> NotFoundException.create(Vocabulary.class, vocabularyIri)); + LOG.debug("Importing translations for terms in vocabulary {}.", vocabularyIri); + try { + final List terms = readTermsFromSheet(data); + terms.forEach(t -> { + identifyTermByLabelIfNecessary(t, targetVocabulary); + final Optional existingTerm = termService.find(t.getUri()); + if (existingTerm.isEmpty() || !existingTerm.get().getVocabulary().equals(vocabularyIri)) { + LOG.warn( + "Term with identifier '{}' not found in vocabulary '{}'. Skipping record resolved from Excel file.", + t.getUri(), vocabularyIri); + return; + } + mergeTranslations(t, existingTerm.get()); + termService.update(existingTerm.get()); + }); + } catch (IOException e) { + throw new VocabularyImportException("Unable to read input as Excel.", e); + } + return targetVocabulary; + } + + private void identifyTermByLabelIfNecessary(Term t, Vocabulary targetVocabulary) { + if (t.getUri() == null) { + final String termLabel = t.getLabel().get(config.getPersistence().getLanguage()); + if (termLabel == null) { + throw new VocabularyImportException( + "Unable to identify terms in Excel - it contains neither term identifiers nor labels in primary language.", + "error.vocabulary.import.excel.missingIdentifierOrLabel"); + } + t.setUri(idResolver.generateDerivedIdentifier(targetVocabulary.getUri(), + config.getNamespace().getTerm().getSeparator(), + termLabel)); + } + } + + private List readTermsFromSheet(@NotNull ImportInput data) throws IOException { + List terms = Collections.emptyList(); + for (InputStream input : data.data()) { + final Workbook workbook = new XSSFWorkbook(input); + assert workbook.getNumberOfSheets() > 0; + PrefixMap prefixMap = resolvePrefixMap(workbook); + for (int i = 0; i < workbook.getNumberOfSheets(); i++) { + final Sheet sheet = workbook.getSheetAt(i); + if (ExcelVocabularyExporter.PREFIX_SHEET_NAME.equals(sheet.getSheetName())) { + // Skip already processed prefix sheet + continue; + } + final LocalizedSheetImporter sheetImporter = new LocalizedSheetImporter( + new LocalizedSheetImporter.Services(termService, languageService), + prefixMap, terms); + terms = sheetImporter.resolveTermsFromSheet(sheet); + } + } + return terms; + } + + private void mergeTranslations(Term source, Term target) { + target.setLabel(mergeSingularTranslations(source.getLabel(), target.getLabel())); + target.setDefinition(mergeSingularTranslations(source.getDefinition(), target.getDefinition())); + target.setDescription(mergeSingularTranslations(source.getDescription(), target.getDescription())); + assert target.getAltLabels() != null; + mergePluralTranslations(source.getAltLabels(), target.getAltLabels()); + assert target.getHiddenLabels() != null; + mergePluralTranslations(source.getHiddenLabels(), target.getHiddenLabels()); + assert target.getExamples() != null; + mergePluralTranslations(source.getExamples(), target.getExamples()); + } + + private MultilingualString mergeSingularTranslations(MultilingualString source, MultilingualString target) { + if (target == null) { + return source; + } + if (source == null) { + return target; + } + source.getValue().forEach((lang, value) -> { + if (!target.contains(lang)) { + target.set(lang, value); + } + }); + return target; + } + + private void mergePluralTranslations(Set source, Set target) { + if (Utils.emptyIfNull(source).isEmpty()) { + return; + } + // Remove just the existing language values + target.forEach(t -> t.getLanguages().forEach(lang -> source.forEach(mls -> mls.remove(lang)))); + // Add the remainder + target.addAll(source.stream().filter(mls -> !mls.isEmpty()).toList()); } /** diff --git a/src/main/java/cz/cvut/kbss/termit/service/importer/excel/LocalizedSheetImporter.java b/src/main/java/cz/cvut/kbss/termit/service/importer/excel/LocalizedSheetImporter.java index 67187fc3e..17ba9dc02 100644 --- a/src/main/java/cz/cvut/kbss/termit/service/importer/excel/LocalizedSheetImporter.java +++ b/src/main/java/cz/cvut/kbss/termit/service/importer/excel/LocalizedSheetImporter.java @@ -84,7 +84,7 @@ class LocalizedSheetImporter { * @return Terms resolved from the sheet */ List resolveTermsFromSheet(Sheet sheet) { - LOG.debug("Importing terms from sheet '{}'.", sheet.getSheetName()); + LOG.debug("Reading terms from sheet '{}'.", sheet.getSheetName()); this.rawDataToInsert = new ArrayList<>(); final Optional lang = resolveLanguage(sheet); if (lang.isEmpty()) { diff --git a/src/test/java/cz/cvut/kbss/termit/service/importer/excel/ExcelImporterTest.java b/src/test/java/cz/cvut/kbss/termit/service/importer/excel/ExcelImporterTest.java index 5804ca6e8..eb682d4ae 100644 --- a/src/test/java/cz/cvut/kbss/termit/service/importer/excel/ExcelImporterTest.java +++ b/src/test/java/cz/cvut/kbss/termit/service/importer/excel/ExcelImporterTest.java @@ -38,6 +38,7 @@ import java.io.ByteArrayOutputStream; import java.net.URI; import java.util.Collection; +import java.util.HashSet; import java.util.List; import java.util.Objects; import java.util.Optional; @@ -86,7 +87,7 @@ class ExcelImporterTest { @SuppressWarnings("unused") @Spy - private IdentifierResolver idResolver = new IdentifierResolver(new Configuration()); + private IdentifierResolver idResolver = new IdentifierResolver(config); @InjectMocks private ExcelImporter sut; @@ -97,6 +98,7 @@ class ExcelImporterTest { void setUp() { this.vocabulary = Generator.generateVocabularyWithId(); config.getNamespace().getTerm().setSeparator("/terms"); + config.getPersistence().setLanguage(Environment.LANGUAGE); } @ParameterizedTest @@ -350,9 +352,7 @@ void importFallsBackToEnglishColumnLabelsForUnknownLanguages() { @Test void importSupportsTermIdentifiers() { - vocabulary.setUri(URI.create("http://example.com")); - when(vocabularyDao.exists(vocabulary.getUri())).thenReturn(true); - when(vocabularyDao.find(vocabulary.getUri())).thenReturn(Optional.of(vocabulary)); + initVocabularyResolution(); final Vocabulary result = sut.importVocabulary( new VocabularyImporter.ImportConfiguration(false, vocabulary.getUri(), prePersist), @@ -378,11 +378,15 @@ void importSupportsTermIdentifiers() { building.get().getUri(), vocabulary.getUri())), quadsCaptor.getValue()); } - @Test - void importSupportsPrefixedTermIdentifiers() { + private void initVocabularyResolution() { vocabulary.setUri(URI.create("http://example.com")); when(vocabularyDao.exists(vocabulary.getUri())).thenReturn(true); when(vocabularyDao.find(vocabulary.getUri())).thenReturn(Optional.of(vocabulary)); + } + + @Test + void importSupportsPrefixedTermIdentifiers() { + initVocabularyResolution(); final Vocabulary result = sut.importVocabulary( new VocabularyImporter.ImportConfiguration(false, vocabulary.getUri(), prePersist), @@ -431,9 +435,7 @@ void importAdjustsTermIdentifiersToUseExistingVocabularyIdentifierAndSeparatorAs @Test void importRemovesExistingInstanceWhenImportedTermAlreadyExists() { - vocabulary.setUri(URI.create("http://example.com")); - when(vocabularyDao.exists(vocabulary.getUri())).thenReturn(true); - when(vocabularyDao.find(vocabulary.getUri())).thenReturn(Optional.of(vocabulary)); + initVocabularyResolution(); final Term existingBuilding = Generator.generateTermWithId(); existingBuilding.setUri(URI.create("http://example.com/terms/building")); final Term existingConstruction = Generator.generateTermWithId(); @@ -457,9 +459,7 @@ void importRemovesExistingInstanceWhenImportedTermAlreadyExists() { @Test void importSupportsReferencesToOtherVocabulariesViaTermIdentifiersWhenReferencedTermsExist() { - vocabulary.setUri(URI.create("http://example.com")); - when(vocabularyDao.exists(vocabulary.getUri())).thenReturn(true); - when(vocabularyDao.find(vocabulary.getUri())).thenReturn(Optional.of(vocabulary)); + initVocabularyResolution(); when(termService.exists(any())).thenReturn(false); when(termService.exists(URI.create("http://example.com/another-vocabulary/terms/relatedMatch"))).thenReturn( true); @@ -568,9 +568,7 @@ void importThrowsVocabularyImportExceptionWhenSheetContainsDuplicateLabels() thr @Test void importThrowsVocabularyImportExceptionWhenSheetContainsDuplicateIdentifiers() throws Exception { - vocabulary.setUri(URI.create("http://example.com")); - when(vocabularyDao.exists(vocabulary.getUri())).thenReturn(true); - when(vocabularyDao.find(vocabulary.getUri())).thenReturn(Optional.of(vocabulary)); + initVocabularyResolution(); final Workbook input = new XSSFWorkbook(Environment.loadFile("template/termit-import.xlsx")); final Sheet sheet = input.getSheet("English"); sheet.shiftColumns(0, 12, 1); @@ -597,9 +595,7 @@ void importThrowsVocabularyImportExceptionWhenSheetContainsDuplicateIdentifiers( @Test void importSupportsSpecifyingStateAndTypeOnlyInOneSheet() throws Exception { - vocabulary.setUri(URI.create("http://example.com")); - when(vocabularyDao.exists(vocabulary.getUri())).thenReturn(true); - when(vocabularyDao.find(vocabulary.getUri())).thenReturn(Optional.of(vocabulary)); + initVocabularyResolution(); final Workbook input = new XSSFWorkbook(Environment.loadFile("template/termit-import.xlsx")); final Sheet englishSheet = input.getSheet("English"); englishSheet.getRow(1).createCell(0).setCellValue("Construction"); @@ -651,9 +647,7 @@ void importThrowsVocabularyImportExceptionWhenVocabularyAlreadyContainsTermWithS @Test void importSupportsMultipleTypesDeclaredForTerm() throws Exception { - vocabulary.setUri(URI.create("http://example.com")); - when(vocabularyDao.exists(vocabulary.getUri())).thenReturn(true); - when(vocabularyDao.find(vocabulary.getUri())).thenReturn(Optional.of(vocabulary)); + initVocabularyResolution(); final Workbook input = new XSSFWorkbook(Environment.loadFile("template/termit-import.xlsx")); final Sheet englishSheet = input.getSheet("English"); englishSheet.getRow(1).createCell(0).setCellValue("Construction"); @@ -678,4 +672,100 @@ void importSupportsMultipleTypesDeclaredForTerm() throws Exception { assertThat(captor.getValue().getTypes(), hasItems(objectType.getUri().toString(), eventType.getUri().toString())); } + + @Test + void importTermTranslationsFromExcelWithIdentifiersUpdatesExistingTerms() { + vocabulary.setUri(URI.create("http://example.com")); + when(vocabularyDao.find(vocabulary.getUri())).thenReturn(Optional.of(vocabulary)); + final Term building = initTermBuilding(); + final Term construction = initTermConstruction(); + + final Vocabulary result = sut.importTermTranslations(vocabulary.getUri(), new VocabularyImporter.ImportInput( + Constants.MediaType.EXCEL, + Environment.loadFile("data/import-with-identifiers-en-cs.xlsx"))); + assertEquals(vocabulary, result); + assertEquals("Budova", building.getLabel().get("cs")); + List.of("Barák", "Dům").forEach(t -> assertTrue( + building.getAltLabels().stream().anyMatch(mls -> mls.contains("cs") && mls.get("cs").equals(t)))); + assertEquals("Definice pojmu budova", building.getDefinition().get("cs")); + assertEquals("Doplňující poznámka pojmu budova", building.getDescription().get("cs")); + assertEquals("Stavba", construction.getLabel().get("cs")); + assertEquals("Proces výstavby budovy", construction.getDefinition().get("cs")); + assertTrue(construction.getAltLabels().stream() + .anyMatch(mls -> mls.contains("cs") && mls.get("cs").equals("Staveniště"))); + verify(termService).update(building); + verify(termService).update(construction); + } + + private Term initTermBuilding() { + final Term building = new Term(URI.create("http://example.com/terms/budova")); + building.setLabel(MultilingualString.create("Building", "en")); + building.setAltLabels(new HashSet<>(Set.of(MultilingualString.create("Complex", "en")))); + building.setDefinition(MultilingualString.create("Definition of term Building", "en")); + building.setDescription(MultilingualString.create("Building scope note", "en")); + building.setHiddenLabels(new HashSet<>()); + building.setExamples(new HashSet<>()); + building.setVocabulary(vocabulary.getUri()); + when(termService.find(building.getUri())).thenReturn(Optional.of(building)); + return building; + } + + private Term initTermConstruction() { + final Term construction = new Term(URI.create("http://example.com/terms/stavba")); + construction.setLabel(MultilingualString.create("Construction", "en")); + construction.setAltLabels(new HashSet<>(Set.of(MultilingualString.create("Construction site", "en")))); + construction.setDefinition(MultilingualString.create("The process of building a building", "en")); + construction.setHiddenLabels(new HashSet<>()); + construction.setExamples(new HashSet<>()); + construction.setVocabulary(vocabulary.getUri()); + when(termService.find(construction.getUri())).thenReturn(Optional.of(construction)); + return construction; + } + + @Test + void importTermTranslationsPreservesExistingValues() { + vocabulary.setUri(URI.create("http://example.com")); + when(vocabularyDao.find(vocabulary.getUri())).thenReturn(Optional.of(vocabulary)); + final Term building = initTermBuilding(); + + final Vocabulary result = sut.importTermTranslations(vocabulary.getUri(), new VocabularyImporter.ImportInput( + Constants.MediaType.EXCEL, + Environment.loadFile("data/import-with-identifiers-en-cs.xlsx"))); + assertEquals(vocabulary, result); + assertEquals("Building", building.getLabel().get("en")); + assertEquals("Definition of term Building", building.getDefinition().get("en")); + assertTrue(building.getAltLabels().stream() + .anyMatch(mls -> mls.contains("en") && mls.get("en").equals("Complex"))); + } + + @Test + void importTermTranslationsUsesTermLabelToResolveIdentifierWhenExcelDoesNotContainIdentifiers() { + vocabulary.setUri(URI.create("http://example.com")); + when(vocabularyDao.find(vocabulary.getUri())).thenReturn(Optional.of(vocabulary)); + config.getPersistence().setLanguage("cs"); + final Term building = initTermBuilding(); + + sut.importTermTranslations(vocabulary.getUri(), new VocabularyImporter.ImportInput( + Constants.MediaType.EXCEL, + Environment.loadFile("data/import-simple-en-cs.xlsx"))); + verify(termService).find(building.getUri()); + assertEquals("Budova", building.getLabel().get("cs")); + verify(termService).update(any(Term.class)); + } + + @Test + void importTermTranslationsThrowsVocabularyImportExceptionWhenExcelDoesNotContainIdentifierAndSheetWithLabelsInPrimaryLanguage() { + vocabulary.setUri(URI.create("http://example.com")); + when(vocabularyDao.find(vocabulary.getUri())).thenReturn(Optional.of(vocabulary)); + + VocabularyImportException ex = assertThrows(VocabularyImportException.class, + () -> sut.importTermTranslations(vocabulary.getUri(), + new VocabularyImporter.ImportInput( + Constants.MediaType.EXCEL, + Environment.loadFile( + "data/import-simple-de.xlsx")) + )); + assertEquals("error.vocabulary.import.excel.missingIdentifierOrLabel", ex.getMessageId()); + verify(termService, never()).update(any()); + } }