-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Same as sent to UIBK, with addition of --BB2 option
- Loading branch information
Showing
2 changed files
with
267 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
DU task for segmenting words into Menu-Items using the conjugate graph | ||
Copyright NAVER LABS Europe(C) 2019 Jean-Luc Meunier | ||
Developed for the EU project READ. The READ project has received funding | ||
from the European Union's Horizon 2020 research and innovation programme | ||
under grant agreement No 674943. | ||
""" | ||
|
||
import sys, os | ||
from graph.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText | ||
|
||
try: #to ease the use without proper Python installation | ||
import TranskribusDU_version | ||
except ImportError: | ||
sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) ) | ||
import TranskribusDU_version | ||
TranskribusDU_version | ||
|
||
from common.trace import traceln | ||
|
||
from graph.NodeType_PageXml import defaultBBoxDeltaFun | ||
|
||
from tasks.DU_Task_Factory import DU_Task_Factory | ||
from graph.pkg_GraphBinaryConjugateSegmenter.MultiSinglePageXml import MultiSinglePageXml as ConjugateSegmenterGraph_MultiSinglePageXml | ||
from graph.pkg_GraphBinaryConjugateSegmenter.MultiSinglePageXml_Separator \ | ||
import MultiSinglePageXml_Separator \ | ||
as ConjugateSegmenterGraph_MultiSinglePageXml_Separator | ||
|
||
#from graph.pkg_ReifiedEdge.MultiSinglePageXml_Segmenter_Separator_DOM import Graph_MultiSinglePageXml_Segmenter_Separator_DOM | ||
|
||
|
||
from graph.NodeType_PageXml import NodeType_PageXml_type | ||
from graph.NodeType_jsonOCR import NodeType_jsonOCR | ||
from graph.FeatureDefinition_Generic_noText import FeatureDefinition_Generic_noText | ||
from graph.FeatureDefinition_Generic import FeatureDefinition_Generic | ||
from tasks.DU_Task_Features import Features_June19_Full, Features_June19_Full_Separator | ||
|
||
|
||
# ---------------------------------------------------------------------------- | ||
|
||
class My_ConjugateNodeType(NodeType_PageXml_type):
    """
    Node type used in conjugate (segmentation) mode.

    The label of a graph node is read from the label attribute of the
    *parent* of the node's DOM element, not from the element itself.
    """

    def __init__(self, sNodeTypeName, lsLabel, lsIgnoredLabel=None, bOther=True,
                 BBoxDeltaFun=defaultBBoxDeltaFun, bPreserveWidth=False):
        """
        Forward all arguments, unchanged, to the NodeType_PageXml_type constructor.
        """
        super(My_ConjugateNodeType, self).__init__(
            sNodeTypeName, lsLabel, lsIgnoredLabel, bOther, BBoxDeltaFun,
            bPreserveWidth=bPreserveWidth)

    def parseDocNodeLabel(self, graph_node, defaultCls=None):
        """
        Return the raw label string of the given graph node.

        The label is the value of the attribute named self.sLabelAttr on the
        parent of graph_node.node. When that attribute is absent, the
        placeholder "__none__" is returned instead.
        defaultCls is accepted for signature compatibility but is not used.
        """
        ndParent = graph_node.node.getparent()
        sParentLabel = ndParent.get(self.sLabelAttr)
        if sParentLabel is None:
            return "__none__"
        return sParentLabel

    def setDocNodeLabel(self, graph_node, sLabel):
        """
        Not supported for this node type: always raises an Exception.
        """
        raise Exception("This should not occur in conjugate mode")
|
||
|
||
def getConfiguredGraphClass(doer):
    """
    Select the graph class for this task, register its single node type on
    it, and return the class.

    The configuration is driven by the module-level `options` object (parsed
    in the __main__ section), not by the `doer` argument, which is unused:
      - options.bSeparator selects the separator-aware graph class;
      - options.bBB2 disables the bounding-box resize function and asks the
        node type to preserve the width.
    """
    cGraph = (ConjugateSegmenterGraph_MultiSinglePageXml_Separator
              if options.bSeparator
              else ConjugateSegmenterGraph_MultiSinglePageXml)

    # Only the bounding-box handling differs between the two styles.
    if options.bBB2:
        dBBoxArgs = {"BBoxDeltaFun": None, "bPreserveWidth": True}
    else:
        # we reduce overlap in this way
        dBBoxArgs = {"BBoxDeltaFun": lambda v: max(v * 0.066, min(5, v/3))}

    nt = My_ConjugateNodeType(
        "mi_clstr",   # some short prefix because labels are prefixed with it
        [],           # in conjugate mode all labels are accepted; a missing one becomes "__none__"
        [],
        False,        # unused
        **dBBoxArgs)
    nt.setLabelAttribute("id")

    ## HD added 23/01/2020: needed for output generation
    cGraph.clusterType = 'paragraph'

    # first expression selects the nodes, second one is how to get their text
    tXpathExpr = (".//pc:TextLine", "./pc:TextEquiv")
    nt.setXpathExpr(tXpathExpr)
    cGraph.addNodeType(nt)

    return cGraph
|
||
|
||
if __name__ == "__main__":
    traceln("VERSION: %s" % DU_Task_Factory.getVersion())

    # standard command line options for CRF- ECN- GAT-based methods,
    # extended below with the options specific to this task
    usage, parser = DU_Task_Factory.getStandardOptionsParser(sys.argv[0])
    parser.add_option("--unigram", dest='unigram', action="store", type="int",
                      default=0,
                      help="Textual features as unigram: Max uni")
    parser.add_option("--pxmlfeatures", dest='pxmlfeatures', action="store_true",
                      default=False,
                      help="Use pageXml features (page h,w)")
    parser.add_option("--separator", dest='bSeparator', action="store_true",
                      default=False,
                      help="Use separators")
    parser.add_option("--BB2", dest='bBB2', action="store_true",
                      help="New style BB (same width as baseline, no resize)")
    (options, args) = parser.parse_args()

    # Every variant uses an empty feature configuration; only the feature
    # definition class depends on the command line.
    dFeatureConfig = {}
    if options.bSeparator:
        cFeatureDefinition = Features_June19_Full_Separator
    elif options.pxmlfeatures:
        cFeatureDefinition = Features_June19_Full
    else:
        cFeatureDefinition = FeatureDefinition_Generic

    # exactly two positional arguments are expected
    try:
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        DU_Task_Factory.exit(usage, 1, e)

    doer = DU_Task_Factory.getDoer(
        sModelDir, sModelName,
        options=options,
        fun_getConfiguredGraphClass=getConfiguredGraphClass,
        cFeatureDefinition=cFeatureDefinition,
        dFeatureConfig=dFeatureConfig)

    # setting the learner configuration, in a standard way
    # (from command line options, or from a JSON configuration file);
    # a custom configuration could be passed to the doer instead.
    dLearnerConfig = doer.getStandardLearnerConfig(options)
    doer.setLearnerConfiguration(dLearnerConfig)

    # act as per specified in the command line (--trn , --fold-run, ...)
    doer.standardDo(options)

    del doer
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
""" | ||
Newseye | ||
convert | ||
<TextRegion id="r_11_separator" custom="readingOrder {index:17;} structure {type:separator;}"> | ||
<Coords points="126,574 1282,574 1282,592 126,592"/> | ||
</TextRegion> | ||
into | ||
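<SeparatorRegion id="r_11_separator" > | ||
<Coords points="126,574 1282,574 1282,592 126,592"/> | ||
</SeparatorRegion> | ||
(the Coords points are additionally reduced to a 2-point line along the longer side of the region) | ||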
""" | ||
|
||
from glob import glob | ||
from optparse import OptionParser | ||
import sys,os | ||
from lxml import etree | ||
|
||
from xml_formats.PageXml import PageXml, PageXmlException | ||
from util.Polygon import Polygon | ||
|
||
|
||
def convertTR2Sep(filename):
    """
    Parse a PAGE XML file and convert its separator TextRegions in place.

    Every TextRegion that has a 'custom' attribute containing "separator" is
    renamed to SeparatorRegion, and the points of its Coords element are
    replaced by a 2-point line: the rectangle fitted to the polygon (or its
    bounding box when fitting fails) is collapsed onto the middle of its
    shorter side.

    filename: path of the PAGE XML file to read.
    Returns the parsed lxml tree, with the modifications applied.
    Raises AssertionError if a separator region yields no points.
    """
    print (filename)
    sPageNS = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
    xml = etree.parse(filename)
    ltextsep = xml.getroot().findall(".//pc:TextRegion[@custom]", {"pc": sPageNS})

    for x in ltextsep:
        if "separator" not in x.get('custom'):
            continue

        # Clark notation keeps the renamed element in the PAGE namespace;
        # a bare tag name would detach it from that namespace in the tree.
        x.tag = "{%s}SeparatorRegion" % sPageNS

        # now we need to convert that object to a line
        lXY = PageXml.getPointList(x)  # the polygon
        if not lXY:
            # explicit raise (same exception type as before) so that the
            # check is not stripped when Python runs with -O
            raise AssertionError("Separator without Coord??")

        plg = Polygon(lXY)
        try:
            x1, y1, x2, y2 = plg.fitRectangle()
        except ValueError:
            print("Warning: Coords might be bad, taking bounding box: ", lXY)
            x1, y1, x2, y2 = plg.getBoundingBox()

        if abs(x2-x1) > abs(y2-y1):
            # horizontal: keep the x extent, use the mid-height as y
            y1 = (y1+y2)/2
            y2 = y1
        else:
            # vertical: keep the y extent, use the mid-width as x
            x1 = (x1+x2)/2
            x2 = x1

        ndCoord = x.xpath(".//pc:Coords", namespaces={"pc": PageXml.NS_PAGE_XML})[0]
        PageXml.setPoints(ndCoord, [(x1, y1), (x2, y2)])

    return xml
|
||
|
||
def convertFiles(lfilename, outdir):
    """
    Convert each listed PAGE XML file and write the result into outdir.

    lfilename: iterable of input file paths.
    outdir: folder receiving the converted files; each output file is named
            'a_' followed by the base name of its input file.

    The output path is built with os.path.join, so an empty outdir writes to
    the current directory rather than to the filesystem root, and a trailing
    separator in outdir is not doubled.
    """
    for filename in lfilename:
        doc = convertTR2Sep(filename)
        newfilename = os.path.join(outdir, f'a_{os.path.basename(filename)}')
        with open(newfilename, 'wb') as f:
            doc.write(f, encoding="utf-8", xml_declaration=True, pretty_print=True)
if __name__ == '__main__':
    # Command-line entry point: convert every file of <InputDir> whose name
    # ends with the --ext value and write the results into <OuputDir>.

    version = "v.01"
    sUsage="""
Usage: %s <InputDir> <OuputDir>
""" % (sys.argv[0])

    parser = OptionParser(usage=sUsage)
    parser.add_option("--ext", dest='extension', type='string',action="store"
                      , help="file extension")
    (options, args) = parser.parse_args()

    # Indexing a too-short list raises IndexError (not ValueError): catch it
    # so that missing arguments print the usage instead of a traceback.
    try:
        folderIn = args[0]
        folderOut = args[1]
    except IndexError:
        sys.stderr.write(sUsage)
        sys.exit(1)

    # Without --ext the option value is None, which str.endswith rejects;
    # the empty suffix matches every file name.
    sExtension = options.extension or ""

    lsFile = sorted(s for s in glob(os.path.join(folderIn, '*')) if s.endswith(sExtension))
    convertFiles(lsFile,folderOut)

    print (f'done for {len(lsFile)} files.')