Skip to content

Commit

Permalink
Merge pull request #7 from bertsky/fix-ocrd
Browse files Browse the repository at this point in the history
Fix OCR-D

Detailed inspection reveals character issues caused by HTML-like entities, which are being fixed, turning our "Su&#x364;nden&#x2E17;Fall" into a "Suͤnden⸗Fall" – what it should be!
That's highly appreciated! Thanks a lot, @bertsky!
  • Loading branch information
M3ssman authored Oct 23, 2024
2 parents 734e401 + d5457c3 commit 2cc2dca
Show file tree
Hide file tree
Showing 1,027 changed files with 43,930 additions and 27,679 deletions.
22 changes: 18 additions & 4 deletions data/ger/GT-PAGE/urn+nbn+de+gbv+3+1-112032-p0026-5_ger.gt.xml
Original file line number Diff line number Diff line change
@@ -1,12 +1,26 @@
<?xml version='1.0' encoding='utf-8'?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" pcGtsId="PAGE_01" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="OCR-D-GT-FULLTEXT-REPAIR-1">
<Metadata>
<Creator>OCR-D DFG Phase III</Creator>
<Created>2024-01-19T13:07:02Z</Created>
<LastChange>2024-01-19T13:07:02Z</LastChange>
<Comments>ODEM: OCR-D extension for mass digitization (2021-2024)</Comments>
<MetadataItem type="processingStep" name="layout/segmentation/region" value="ocrd-segment-repair">
<Labels externalModel="ocrd-tool" externalId="parameters">
<Label value="False" type="sanitize"/>
<Label value="5" type="sanitize_padding"/>
<Label value="0" type="simplify"/>
<Label value="False" type="plausibilize"/>
<Label value="0.9" type="plausibilize_merge_min_overlap"/>
<Label value="0" type="spread"/>
<Label value="region" type="spread_level"/>
</Labels>
<Labels externalModel="ocrd-tool" externalId="version">
<Label value="0.1.24" type="ocrd-segment-repair"/>
<Label value="2.65.0" type="ocrd/core"/>
</Labels>
</MetadataItem>
</Metadata>
<Page imageFilename="urn+nbn+de+gbv+3+1-112032-p0026-5_ger.jpg" imageWidth="1269" imageHeight="1946" type="content" primaryLanguage="German">
<Page imageFilename="https://opendata.uni-halle.de/retrieve/66cd3e7c-1932-43ce-9e6c-b79ca72a5fe9/00000026.jpg" imageWidth="1269" imageHeight="1946" type="content" primaryLanguage="German">
<Border>
<Coords points="237,164 1117,164 1117,1666 237,1666"/>
</Border>
Expand All @@ -15,7 +29,7 @@
<RegionRefIndexed index="3" regionRef="region0002"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion orientation="-0.22018771" readingDirection="left-to-right" id="region0002" custom="readingOrder {index:3;}">
<TextRegion id="region0002" custom="readingOrder {index:3;}" orientation="-0.22018771" readingDirection="left-to-right">
<Coords points="1117,266 240,257 237,537 237,1657 1020,1666 1109,1666 1117,920"/>
<TextLine id="region0002_line0001" custom="readingOrder {index:0;}">
<Coords points="397,260 365,262 347,270 306,270 300,264 290,264 282,271 273,271 256,278 240,278 240,314 276,316 312,312 482,313 513,316 521,324 529,324 537,317 576,315 726,315 803,317 810,324 819,324 829,316 848,316 860,307 877,307 884,308 892,317 907,318 913,324 923,324 929,318 951,318 958,324 973,318 986,317 1117,320 1117,277 1072,277 1028,273 1024,270 1010,273 981,266 945,264 938,264 886,268 816,265 800,270 774,264 735,264 713,268 697,267 691,271 682,262 672,263 665,272 611,269 603,265 592,265 582,270 533,273 503,269 497,260 485,260 474,268 436,262"/>
Expand Down
49 changes: 32 additions & 17 deletions data/ger/GT-PAGE/urn+nbn+de+gbv+3+1-113129-p0007-8_ger.gt.xml
Original file line number Diff line number Diff line change
@@ -1,11 +1,26 @@
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" pcGtsId="PAGE_01" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd">
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="OCR-D-GT-FULLTEXT-REPAIR-2">
<Metadata>
<Creator>OCR-D DFG Phase III</Creator>
<Created>2024-01-19T13:07:02Z</Created>
<LastChange>2024-01-19T13:07:02Z</LastChange>
<Comments>ODEM: OCR-D extension for mass digitization (2021-2024)</Comments>
<MetadataItem type="processingStep" name="layout/segmentation/region" value="ocrd-segment-repair">
<Labels externalModel="ocrd-tool" externalId="parameters">
<Label value="False" type="sanitize"/>
<Label value="5" type="sanitize_padding"/>
<Label value="0" type="simplify"/>
<Label value="False" type="plausibilize"/>
<Label value="0.9" type="plausibilize_merge_min_overlap"/>
<Label value="0" type="spread"/>
<Label value="region" type="spread_level"/>
</Labels>
<Labels externalModel="ocrd-tool" externalId="version">
<Label value="0.1.24" type="ocrd-segment-repair"/>
<Label value="2.65.0" type="ocrd/core"/>
</Labels>
</MetadataItem>
</Metadata>
<Page imageFilename="urn+nbn+de+gbv+3+1-113129-p0007-8_ger.jpg" imageWidth="1692" imageHeight="2406" type="content" primaryLanguage="German">
<Page imageFilename="https://opendata.uni-halle.de/retrieve/eeeee05d-c7cd-4e89-9607-5d1ac175afa1/00000007.jpg" imageWidth="1692" imageHeight="2406" type="content" primaryLanguage="German">
<Border>
<Coords points="61,121 71,2338 1414,2344 1396,119"/>
</Border>
Expand All @@ -21,9 +36,6 @@
<RegionRefIndexed index="8" regionRef="TextRegion_1645513190201_73"/>
</OrderedGroup>
</ReadingOrder>
<ImageRegion orientation="0.5962476" id="region0002" custom="readingOrder {index:0;}">
<Coords points="278,241 1299,238 1308,399 287,402"/>
</ImageRegion>
<TextRegion id="region_1639749174166_48" custom="readingOrder {index:1;}">
<Coords points="567,474 567,599 1009,599 1009,474"/>
<TextLine id="line_1639749174315_51" custom="readingOrder {index:0;} structure {type:heading;}">
Expand All @@ -42,7 +54,7 @@
<Unicode>Wunsch.</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion orientation="0.5962476" readingDirection="left-to-right" id="region0003" custom="readingOrder {index:2;}">
<TextRegion id="region0003" custom="readingOrder {index:2;}" orientation="0.5962476" readingDirection="left-to-right">
<Coords points="272,639 1301,636 1303,1326 274,1329"/>
<TextLine id="region0003_line0001" custom="readingOrder {index:0;}">
<Coords points="302,639 298,642 272,642 272,724 338,722 345,728 356,728 367,720 420,719 431,712 440,717 476,713 490,718 499,712 505,712 541,714 549,718 578,715 589,718 646,717 654,720 678,717 684,713 699,711 708,717 718,716 723,719 732,719 736,716 740,719 751,716 766,718 770,715 777,719 801,718 805,715 840,715 858,722 869,722 884,717 897,720 903,716 921,719 935,715 968,716 974,713 991,716 999,713 1003,716 1013,716 1017,713 1028,713 1039,716 1050,711 1052,720 1061,723 1070,722 1077,711 1117,711 1121,714 1300,711 1300,647 1247,645 1243,639 1236,638 1227,638 1217,646 1181,647 1132,644 1129,637 1113,637 1104,643 1053,644 1049,641 1037,642 1022,649 975,648 965,643 934,647 917,644 888,646 875,643 871,646 848,645 792,654 754,657 677,652 664,644 649,645 638,652 619,652 561,649 554,642 539,642 528,649 455,652 452,643 447,639 424,639 417,645 413,642 401,642 392,648 365,644 358,639 337,639 332,643"/>
Expand Down Expand Up @@ -374,7 +386,7 @@ traͤgſt die Suͤnde der Welt, gib
uns deinen Frieden. Amen.</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion orientation="0.5962476" type="heading" readingDirection="left-to-right" id="TextRegion_1645513144715_64" custom="readingOrder {index:3;} structure {type:heading;}">
<TextRegion id="TextRegion_1645513144715_64" custom="readingOrder {index:3;} structure {type:heading;}" orientation="0.5962476" type="heading" readingDirection="left-to-right">
<Coords points="283,1368 1307,1365 1307,1478 283,1478"/>
<TextLine id="region0004_line0001" custom="readingOrder {index:0;}">
<Coords points="614,1368 544,1371 536,1375 535,1424 544,1430 545,1458 548,1462 610,1468 682,1470 772,1465 785,1478 807,1477 823,1464 883,1464 888,1465 893,1473 903,1478 917,1478 930,1472 1025,1472 1031,1466 1030,1446 1038,1445 1043,1440 1043,1393 1038,1388 881,1387 877,1390 813,1391 809,1388 799,1388 795,1391 765,1390 756,1393 747,1391 744,1379 731,1367"/>
Expand All @@ -392,10 +404,10 @@ uns deinen Frieden. Amen.</Unicode>
<Unicode>Eingang.</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion orientation="0.5962476" type="heading" readingDirection="left-to-right" id="TextRegion_1645513306237_126" custom="readingOrder {index:4;} structure {type:heading;}">
<TextRegion id="TextRegion_1645513306237_126" custom="readingOrder {index:4;} structure {type:heading;}" orientation="0.5962476" type="heading" readingDirection="left-to-right">
<Coords points="283,1478 1307,1478 1307,1541 283,1541"/>
<TextLine id="region0004_line0002" custom="readingOrder {index:0;}">
<Coords points="752,1481 742,1488 665,1483 497,1485 492,1490 492,1537 497,1542 529,1542 581,1547 616,1545 632,1551 642,1551 649,1547 657,1551 666,1551 672,1546 748,1542 773,1549 781,1540 787,1544 796,1544 805,1538 819,1536 823,1539 842,1539 849,1544 866,1545 873,1535 874,1544 881,1551 892,1550 897,1542 914,1543 922,1549 937,1549 947,1544 977,1543 989,1546 993,1543 1068,1538 1073,1533 1073,1492 1068,1487 940,1490 772,1487 765,1481"/>
<Coords points="742,1488 665,1483 497,1485 492,1490 492,1537 496,1541 780,1541 781,1540 782,1541 800,1541 805,1538 819,1536 823,1539 842,1539 844,1541 868,1541 873,1535 873,1541 1023,1541 1068,1538 1073,1533 1073,1492 1068,1487 940,1490 772,1487 765,1481 752,1481"/>
<Word id="region0004_line0002_word0000" custom="readingOrder {index:0;}">
<Coords points="799,1541 799,1493 600,1495 600,1545 616,1545 632,1551 642,1551 649,1547 657,1551 666,1551 672,1546 748,1542 773,1549 781,1540 787,1544 796,1544"/>
<TextEquiv conf="0.90820235">
Expand All @@ -422,7 +434,7 @@ uns deinen Frieden. Amen.</Unicode>
<Unicode>Matth. 27, 51.</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion orientation="0.5962476" readingDirection="left-to-right" id="TextRegion_1645513306237_125" custom="readingOrder {index:5;}">
<TextRegion id="TextRegion_1645513306237_125" custom="readingOrder {index:5;}" orientation="0.5962476" readingDirection="left-to-right">
<Coords points="283,1541 1307,1541 1308,1767 284,1770"/>
<TextLine id="region0004_line0003" custom="readingOrder {index:0;}">
<Coords points="408,1546 396,1550 393,1556 379,1556 372,1548 345,1548 339,1553 329,1550 283,1553 283,1595 284,1627 402,1623 407,1632 414,1633 422,1631 429,1622 471,1619 544,1620 546,1624 568,1620 744,1617 763,1620 769,1616 840,1615 851,1625 864,1621 871,1614 890,1610 909,1616 935,1613 960,1631 978,1626 981,1617 1047,1617 1132,1622 1224,1619 1229,1623 1233,1633 1245,1632 1250,1621 1307,1621 1306,1547 1302,1544 1281,1548 1276,1556 1232,1558 1198,1555 1134,1543 1025,1554 1020,1544 1007,1543 1000,1547 999,1555 946,1557 868,1554 866,1546 844,1542 834,1544 831,1553 767,1550 751,1544 725,1544 713,1550 661,1556 625,1555 622,1548 616,1546 599,1546 591,1555 503,1554 494,1545 478,1545 469,1555 425,1557 419,1547"/>
Expand Down Expand Up @@ -544,7 +556,7 @@ zerriß in zwey Stuͤcken, von oben
an bis unten aus.</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion orientation="0.5962476" type="drop-capital" readingDirection="left-to-right" id="TextRegion_1645513195284_84" custom="readingOrder {index:6;} structure {type:drop-capital;}">
<TextRegion id="TextRegion_1645513195284_84" custom="readingOrder {index:6;} structure {type:drop-capital;}" orientation="0.5962476" type="drop-capital" readingDirection="left-to-right">
<Coords points="297,1799 456,1798 456,1989 297,1989"/>
<TextLine id="line_1645513244457_114" custom="readingOrder {index:0;}">
<Coords points="304,1807 453,1807 453,1984 304,1984"/>
Expand All @@ -562,7 +574,7 @@ an bis unten aus.</Unicode>
<Unicode>J</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion orientation="0.5962476" readingDirection="left-to-right" id="TextRegion_1645513195277_83" custom="readingOrder {index:7;}">
<TextRegion id="TextRegion_1645513195277_83" custom="readingOrder {index:7;}" orientation="0.5962476" readingDirection="left-to-right">
<Coords points="456,1798 1311,1797 1311,1989 456,1989"/>
<TextLine id="line_1645513213738_94" custom="readingOrder {index:0;}">
<Coords points="463,1814 513,1814 540,1816 549,1820 557,1815 565,1815 578,1826 586,1826 596,1815 608,1813 618,1814 623,1821 633,1816 652,1814 720,1813 772,1824 791,1819 797,1823 901,1822 907,1813 918,1813 927,1821 936,1820 945,1812 959,1811 967,1820 995,1823 1000,1823 1008,1813 1017,1813 1027,1824 1064,1823 1073,1814 1085,1813 1093,1822 1116,1823 1126,1814 1141,1813 1149,1817 1152,1825 1182,1824 1189,1815 1204,1814 1214,1824 1220,1825 1309,1827 1309,1881 1135,1879 1128,1886 1120,1886 1113,1878 900,1875 737,1880 608,1879 602,1888 591,1887 583,1879 463,1882"/>
Expand All @@ -573,7 +585,7 @@ an bis unten aus.</Unicode>
</TextEquiv>
</Word>
<Word id="region0005_line0001_word0001" custom="readingOrder {index:1;}">
<Coords points="653,1879 653,1816 525,1817 525,1880 583,1879 591,1887 602,1888 608,1879"/>
<Coords points="653,1816 632,1816 623,1821 619,1816 594,1816 586,1826 578,1826 566,1816 554,1816 549,1820 541,1816 525,1817 525,1880 583,1879 591,1887 602,1888 608,1879 653,1879"/>
<TextEquiv conf="0.9255404">
<Unicode>dieſen</Unicode>
</TextEquiv>
Expand Down Expand Up @@ -655,7 +667,7 @@ an bis unten aus.</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="line_1645513195315_90" custom="readingOrder {index:2;}">
<Coords points="456,1943 488,1942 516,1943 539,1934 548,1934 553,1942 562,1943 572,1939 625,1936 634,1931 706,1938 738,1938 752,1931 769,1931 777,1937 791,1930 798,1930 808,1937 815,1937 879,1935 891,1929 902,1929 907,1938 943,1935 955,1938 975,1936 982,1930 997,1929 1001,1931 1003,1938 1010,1940 1019,1937 1041,1938 1045,1935 1059,1941 1086,1936 1139,1937 1150,1932 1159,1932 1167,1936 1194,1938 1311,1938 1311,1993 1217,1994 1215,1997 1202,1999 1182,1992 1157,1990 1105,1990 1091,1984 1069,1990 1059,1986 1047,1989 1014,1989 981,1985 975,1988 966,1988 959,1994 946,1995 933,1991 929,1985 896,1988 884,1983 867,1984 846,1990 798,1991 768,1992 762,1988 753,1991 719,1991 687,1987 654,2000 645,2000 636,1990 622,1995 595,1993 582,2001 574,2001 564,1994 503,1994 501,1997 489,2000 479,1995 456,1994"/>
<Coords points="1194,1938 1167,1936 1159,1932 1150,1932 1139,1937 1086,1936 1059,1941 1045,1935 1041,1938 1019,1937 1010,1940 1003,1938 1001,1931 997,1929 982,1930 975,1936 955,1938 943,1935 907,1938 902,1929 891,1929 879,1935 815,1937 808,1937 798,1930 791,1930 777,1937 769,1931 752,1931 738,1938 706,1938 634,1931 625,1936 572,1939 562,1943 553,1942 548,1934 539,1934 516,1943 488,1942 456,1943 456,1989 681,1989 687,1987 703,1989 759,1989 762,1988 763,1989 849,1989 867,1984 884,1983 896,1988 929,1985 931,1989 964,1989 966,1988 975,1988 981,1985 1014,1989 1047,1989 1059,1986 1066,1989 1072,1989 1091,1984 1102,1989 1311,1989 1311,1938"/>
<Word id="word_1645513190232_77" custom="readingOrder {index:0;}">
<Coords points="484,1947 546,1946 546,1989 484,1989"/>
<TextEquiv conf="0.9298376">
Expand Down Expand Up @@ -702,10 +714,10 @@ Zufall oder Werck, ſo fich im Tem⸗
pel zu Jeruſalem begeben, und zwar</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion orientation="0.5962476" readingDirection="left-to-right" id="TextRegion_1645513190201_73" custom="readingOrder {index:8;}">
<TextRegion id="TextRegion_1645513190201_73" custom="readingOrder {index:8;}" orientation="0.5962476" readingDirection="left-to-right">
<Coords points="297,1989 1311,1989 1311,2062 297,2064"/>
<TextLine id="region0006_line0003" custom="readingOrder {index:0;}">
<Coords points="867,1985 846,1991 794,1994 768,1995 762,1989 753,1992 743,1990 737,1995 708,1995 706,1990 687,1988 654,2001 645,2001 636,1991 619,1998 595,1994 582,2002 566,1998 515,1997 493,2000 479,1998 413,2000 405,1992 395,1992 387,2002 298,2001 298,2059 299,2063 322,2063 331,2058 451,2054 491,2054 497,2060 515,2062 527,2054 580,2053 587,2059 602,2060 610,2052 671,2052 677,2058 686,2057 693,2050 795,2048 866,2057 935,2049 944,2054 956,2054 962,2049 1072,2049 1084,2057 1096,2057 1103,2050 1237,2050 1243,2044 1243,2006 1238,2001 1222,2001 1218,1998 1202,2000 1187,1994 1170,1994 1157,1991 1146,1994 1128,1994 1118,1990 1101,1993 1098,1988 1091,1985 1077,1987 1072,1993 1059,1987 1050,1988 1044,1994 1002,1994 1001,1989 995,1986 981,1986 971,1992 946,1996 933,1992 929,1986 916,1986 910,1990 894,1989 884,1984"/>
<Coords points="846,1991 794,1994 768,1995 762,1989 753,1992 743,1990 737,1995 708,1995 706,1990 696,1989 684,1989 654,2001 645,2001 636,1991 619,1998 595,1994 582,2002 566,1998 515,1997 493,2000 479,1998 413,2000 405,1992 395,1992 387,2002 298,2001 298,2059 299,2063 322,2063 331,2058 451,2054 491,2054 497,2060 515,2062 527,2054 580,2053 587,2059 602,2060 610,2052 671,2052 677,2058 686,2057 693,2050 795,2048 866,2057 935,2049 944,2054 956,2054 962,2049 1072,2049 1084,2057 1096,2057 1103,2050 1237,2050 1243,2044 1243,2006 1238,2001 1222,2001 1218,1998 1202,2000 1187,1994 1170,1994 1157,1991 1146,1994 1128,1994 1118,1990 1101,1993 1098,1989 1075,1989 1072,1993 1063,1989 1049,1989 1044,1994 1002,1994 1001,1989 976,1989 971,1992 946,1996 933,1992 931,1989 911,1989 910,1990 894,1989 853,1989"/>
<Word id="region0006_line0003_word0000" custom="readingOrder {index:0;}">
<Coords points="351,2057 351,2013 302,2013 302,2063 322,2063 331,2058"/>
<TextEquiv conf="0.9273798">
Expand Down Expand Up @@ -750,5 +762,8 @@ pel zu Jeruſalem begeben, und zwar</Unicode>
<Unicode>zu einer gantz beſondern Zeit, berichtet:</Unicode>
</TextEquiv>
</TextRegion>
<ImageRegion id="region0002" custom="readingOrder {index:0;}" orientation="0.5962476">
<Coords points="278,241 1299,238 1308,399 287,402"/>
</ImageRegion>
</Page>
</PcGts>
</PcGts>
Loading

0 comments on commit 2cc2dca

Please sign in to comment.