diff --git a/src/main/java/de/ulb/digital/derivans/model/ITextElement.java b/src/main/java/de/ulb/digital/derivans/model/ITextElement.java index 2fe394b..3580fd5 100644 --- a/src/main/java/de/ulb/digital/derivans/model/ITextElement.java +++ b/src/main/java/de/ulb/digital/derivans/model/ITextElement.java @@ -9,12 +9,23 @@ public interface ITextElement { /** - * Get textual UTF-8 representation + * Get textual representation as-in memory * * @return */ String getText(); + + /** + * + * Get textual representation used for printing PDF which + * might differ if text orientation switches between + * left-to-right (LTR, western EU) right-to-left (RTL, arabic) + * + * @return + */ + String forPrint(); + /** * * Distinguish between western european left (LTR) @@ -31,7 +42,7 @@ public interface ITextElement { */ default boolean isRTL() { if (!this.getText().isEmpty()) { - var code = getText().codePointAt(0); + var code = this.getText().codePointAt(0); // arabic basic if (code >= 0x600 && code <= 0x6ff) { return true; diff --git a/src/main/java/de/ulb/digital/derivans/model/pdf/PDFTextElement.java b/src/main/java/de/ulb/digital/derivans/model/pdf/PDFTextElement.java index aef366a..d9be373 100644 --- a/src/main/java/de/ulb/digital/derivans/model/pdf/PDFTextElement.java +++ b/src/main/java/de/ulb/digital/derivans/model/pdf/PDFTextElement.java @@ -28,6 +28,11 @@ public class PDFTextElement implements ITextElement, IVisualElement { // within the spanning rectangular area public static final float DESCENT_RATIO = .25f; + // Important to mark end of actual token for heavy ligated + // script fonts like arabic or farsi + // cf. https://en.wikipedia.org/wiki/Zero-width_space + public static final char ZERO_WIDTH = '\u200b'; + private PDFTextElementType type = PDFTextElementType.TOKEN; private Rectangle2D box; @@ -42,6 +47,14 @@ public class PDFTextElement implements ITextElement, IVisualElement { private boolean isPrinted; + public PDFTextElement(PDFTextElementType type) { + this.type = type; + } + + public PDFTextElement(List children) { + children.forEach(this::add); + } + public PDFTextElement(String actualText) { this(actualText, new Rectangle2D.Float()); } @@ -60,6 +73,11 @@ public PDFTextElement(String actualText, Rectangle2D box, String type) { } } + public PDFTextElement(String actualText, Rectangle2D box, PDFTextElementType type) { + this(actualText, box); + this.type = type; + } + private float descent() { return (float)this.box.getHeight() * DESCENT_RATIO; } @@ -123,12 +141,32 @@ public void setPrinted(boolean state) { @Override public String getText() { + if (!this.getChildren().isEmpty()) { + var builder = new StringBuilder(); + for (var kid : this.children) { + builder.append(kid.getText()).append(" "); + } + return builder.toString().trim(); + } return this.text; } + @Override public String forPrint() { + if (!this.children.isEmpty()) { + var builder = new StringBuilder(); + for (var kid : this.children) { + var txtPrint = kid.forPrint(); + builder.append(txtPrint); + if (kid.isRTL()) { + builder.append(ZERO_WIDTH); + } + builder.append(" "); + } + return builder.toString().trim(); + } if (this.isRTL()) { - return new StringBuffer(this.text).reverse().toString(); + return new StringBuilder(this.text).reverse().toString(); } return this.text; } diff --git a/src/main/java/de/ulb/digital/derivans/model/text/Textline.java b/src/main/java/de/ulb/digital/derivans/model/text/Textline.java index 2e8f2d9..badfb5f 100644 --- a/src/main/java/de/ulb/digital/derivans/model/text/Textline.java +++ b/src/main/java/de/ulb/digital/derivans/model/text/Textline.java @@ -7,7 +7,6 @@ import java.util.Objects; import java.util.stream.Collectors; -import de.ulb.digital.derivans.model.ITextElement; import de.ulb.digital.derivans.model.IVisualElement; /** @@ -17,7 +16,7 @@ * @author u.hartwig * */ -public class Textline implements IVisualElement, ITextElement { +public class Textline implements IVisualElement { private List textTokens = new ArrayList<>(); @@ -37,7 +36,7 @@ public Textline(List texts) { this.textTokens.addAll(texts); this.calculateArea(); this.actualText = String.join(" ", - texts.stream().map(ITextElement::getText).filter(Objects::nonNull).collect(Collectors.toList())); + texts.stream().map(Word::getText).filter(Objects::nonNull).collect(Collectors.toList())); } public void calculateArea() { diff --git a/src/main/java/de/ulb/digital/derivans/model/text/Word.java b/src/main/java/de/ulb/digital/derivans/model/text/Word.java index e812b27..8a2d3cc 100644 --- a/src/main/java/de/ulb/digital/derivans/model/text/Word.java +++ b/src/main/java/de/ulb/digital/derivans/model/text/Word.java @@ -4,7 +4,6 @@ import java.util.List; import de.ulb.digital.derivans.model.IVisualElement; -import de.ulb.digital.derivans.model.ITextElement; import de.ulb.digital.derivans.model.pdf.PDFTextElementType; /** @@ -15,13 +14,13 @@ * @author u.hartwig * */ -public class Word implements ITextElement, IVisualElement { +public class Word implements IVisualElement { protected PDFTextElementType type = PDFTextElementType.TOKEN; - protected ITextElement parent; + protected IVisualElement parent; - protected List children; + protected List children; protected Rectangle2D rect; diff --git a/src/test/java/de/ulb/digital/derivans/TestDerivansArguments.java b/src/test/java/de/ulb/digital/derivans/TestDerivansArguments.java index e755c49..85f93a7 100644 --- a/src/test/java/de/ulb/digital/derivans/TestDerivansArguments.java +++ b/src/test/java/de/ulb/digital/derivans/TestDerivansArguments.java @@ -1,7 +1,6 @@ package de.ulb.digital.derivans; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; diff --git a/src/test/java/de/ulb/digital/derivans/derivate/pdf/TestPDFSyntheticArabicOCR.java b/src/test/java/de/ulb/digital/derivans/derivate/pdf/TestPDFSyntheticArabicOCR.java index 063de02..f64eae5 100644 --- a/src/test/java/de/ulb/digital/derivans/derivate/pdf/TestPDFSyntheticArabicOCR.java +++ b/src/test/java/de/ulb/digital/derivans/derivate/pdf/TestPDFSyntheticArabicOCR.java @@ -194,21 +194,16 @@ void wordLevelFirstlineBaselineWord01() { static OCRData arabicOCR() { var w1 = new Word("٨", new Rectangle(400, textMarginTop, 15, 30)); // ٨ assertEquals(1, w1.getText().length()); - assertTrue(w1.isRTL()); var w2 = new Word("ديبا", new Rectangle(200, textMarginTop, 60, 30)); assertEquals(4, w2.getText().length()); - assertTrue(w2.isRTL()); var w3 = new Word("جه", new Rectangle(160, textMarginTop, 30, 30)); assertEquals(2, w3.getText().length()); - assertTrue(w3.isRTL()); // var w4 = new Word("\u0627\u0644\u0633\u0639\u0631", new Rectangle(400, textMarginTop + 40, 75, 30)); // السعر - assertTrue(w4.isRTL()); assertEquals(5, w4.getText().length()); var w5 = new Word("\u0627\u0644\u0627\u062c\u0645\u0627\u0644\u064a", new Rectangle(250, textMarginTop + 40, 120, 30)); // الاجمالي - assertTrue(w5.isRTL()); assertEquals(8, w5.getText().length()); List lines = List.of( new Textline(List.of(w1, w2, w3)), diff --git a/src/test/java/de/ulb/digital/derivans/model/ocr/TestOCRData.java b/src/test/java/de/ulb/digital/derivans/model/ocr/TestOCRData.java deleted file mode 100644 index a97cd86..0000000 --- a/src/test/java/de/ulb/digital/derivans/model/ocr/TestOCRData.java +++ /dev/null @@ -1,57 +0,0 @@ -package de.ulb.digital.derivans.model.ocr; - -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.awt.Rectangle; -import java.util.Arrays; -import java.util.List; - -import org.junit.jupiter.api.Test; - -import de.ulb.digital.derivans.model.text.Textline; -import de.ulb.digital.derivans.model.text.Word; - -/** - * @author u.hartwig - */ -class TestOCRData { - - /** - * Minimum LTR example - * Textline with single ASCII word is - * considered to be left-to-right - */ - @Test - void testTextOrientationTextLine() { - // arrange - var w1 = new Word("hello", new Rectangle(0, 0, 100, 20)); - List words = Arrays.asList(w1); - Textline line = new Textline(words); - - // act - assertFalse(line.isRTL()); - } - - @Test - void testTextOrientationForPersianWord() { - // arrange - var w1 = new Word("چه", new Rectangle(0, 0, 100, 20)); - List words = Arrays.asList(w1); - Textline line = new Textline(words); - - // act - assertTrue(line.isRTL()); - } - - @Test - void testTextOrientationForHebrewWord() { - // arrange - var w1 = new Word("א", new Rectangle(0, 0, 100, 20)); - List words = Arrays.asList(w1); - Textline line = new Textline(words); - - // act - assertTrue(line.isRTL()); - } -} \ No newline at end of file diff --git a/src/test/java/de/ulb/digital/derivans/model/pdf/TestTextElement.java b/src/test/java/de/ulb/digital/derivans/model/pdf/TestTextElement.java index 22abb5c..824ae04 100644 --- a/src/test/java/de/ulb/digital/derivans/model/pdf/TestTextElement.java +++ b/src/test/java/de/ulb/digital/derivans/model/pdf/TestTextElement.java @@ -3,6 +3,10 @@ import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; +import java.awt.geom.Rectangle2D; +import java.util.Arrays; +import java.util.List; + import org.junit.jupiter.api.Test; /** @@ -20,4 +24,64 @@ void testPersianTextElement() { assertTrue(new PDFTextElement("ديباجه").isRTL()); } + /** + * + * Ensure Changes in text orientation reflected on line-level, too. + * + */ + @Test + void testOrientationWithBiscriptualChildren() { + var w1 = new PDFTextElement("مخصوصی", new Rectangle2D.Float(523f, 1521f, 135f, 36f)); + var w2 = new PDFTextElement("Pascal", new Rectangle2D.Float(384f, 1513f, 107f, 44f)); + var w3 = new PDFTextElement("کرد", new Rectangle2D.Float(312f, 1514f, 60f, 43f)); + + var l1 = new PDFTextElement(PDFTextElementType.LINE); + l1.add(w1); + l1.add(w2); + l1.add(w3); + + var printText = l1.forPrint(); + assertTrue(printText.contains("Pascal")); + assertTrue(printText.contains("یصوصخم")); + assertTrue(printText.contains("درک")); + } + + /** + * Minimum LTR example + * Textline with single ASCII word is + * considered to be left-to-right + */ + @Test + void testTextOrientationTextLine() { + // arrange + var w1 = new PDFTextElement("hello", new Rectangle2D.Double(0, 0, 100, 20), + PDFTextElementType.TOKEN); + List words = Arrays.asList(w1); + PDFTextElement line = new PDFTextElement(words); + + // act + assertFalse(line.isRTL()); + } + + @Test + void testTextOrientationForPersianWord() { + // arrange + var w1 = new PDFTextElement("چه", new Rectangle2D.Double(0, 0, 100, 20)); + List words = Arrays.asList(w1); + var line = new PDFTextElement(words); + + // act + assertTrue(line.isRTL()); + } + + @Test + void testTextOrientationForHebrewWord() { + // arrange + var w1 = new PDFTextElement("א", new Rectangle2D.Double(0, 0, 100, 20)); + List words = Arrays.asList(w1); + var line = new PDFTextElement(words); + + // act + assertTrue(line.isRTL()); + } }