Skip to content

Commit

Permalink
Same as sent to UIBK, with addition of --BB2 option
Browse files Browse the repository at this point in the history
  • Loading branch information
jlmeunier committed Aug 7, 2020
1 parent 323e611 commit 6c42b14
Show file tree
Hide file tree
Showing 2 changed files with 267 additions and 0 deletions.
169 changes: 169 additions & 0 deletions usecases/NewsEye/SgmParagraph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
# -*- coding: utf-8 -*-

"""
DU task for segmenting words into Menu-Items using the conjugate graph
Copyright NAVER LABS Europe(C) 2019 Jean-Luc Meunier
Developed for the EU project READ. The READ project has received funding
from the European Union's Horizon 2020 research and innovation programme
under grant agreement No 674943.
"""

import sys, os
from graph.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText

try: #to ease the use without proper Python installation
import TranskribusDU_version
except ImportError:
sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
import TranskribusDU_version
TranskribusDU_version

from common.trace import traceln

from graph.NodeType_PageXml import defaultBBoxDeltaFun

from tasks.DU_Task_Factory import DU_Task_Factory
from graph.pkg_GraphBinaryConjugateSegmenter.MultiSinglePageXml import MultiSinglePageXml as ConjugateSegmenterGraph_MultiSinglePageXml
from graph.pkg_GraphBinaryConjugateSegmenter.MultiSinglePageXml_Separator \
import MultiSinglePageXml_Separator \
as ConjugateSegmenterGraph_MultiSinglePageXml_Separator

#from graph.pkg_ReifiedEdge.MultiSinglePageXml_Segmenter_Separator_DOM import Graph_MultiSinglePageXml_Segmenter_Separator_DOM


from graph.NodeType_PageXml import NodeType_PageXml_type
from graph.NodeType_jsonOCR import NodeType_jsonOCR
from graph.FeatureDefinition_Generic_noText import FeatureDefinition_Generic_noText
from graph.FeatureDefinition_Generic import FeatureDefinition_Generic
from tasks.DU_Task_Features import Features_June19_Full, Features_June19_Full_Separator


# ----------------------------------------------------------------------------

class My_ConjugateNodeType(NodeType_PageXml_type):
"""
We need this to extract properly the label from the label attribute of the (parent) TableCell element.
"""
def __init__(self, sNodeTypeName, lsLabel, lsIgnoredLabel=None, bOther=True, BBoxDeltaFun=defaultBBoxDeltaFun
, bPreserveWidth=False):
super(My_ConjugateNodeType, self).__init__(sNodeTypeName, lsLabel, lsIgnoredLabel, bOther, BBoxDeltaFun
, bPreserveWidth=bPreserveWidth)

def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
#sLabel = domnode.getparent().get(self.sLabelAttr)
domnode = graph_node.node.getparent()
sLabel = domnode.get(self.sLabelAttr)

return sLabel if not sLabel is None else "__none__"

def setDocNodeLabel(self, graph_node, sLabel):
raise Exception("This should not occur in conjugate mode")


def getConfiguredGraphClass(doer):
"""
In this class method, we must return a configured graph class
"""
# if options.bReified:
# DU_GRAPH = Graph_MultiSinglePageXml_Segmenter_Separator_DOM
if options.bSeparator:
DU_GRAPH = ConjugateSegmenterGraph_MultiSinglePageXml_Separator
else:
DU_GRAPH = ConjugateSegmenterGraph_MultiSinglePageXml
ntClass = My_ConjugateNodeType

if options.bBB2:
nt = ntClass("mi_clstr" #some short prefix because labels below are prefixed with it
, [] # in conjugate, we accept all labels, andNone becomes "none"
, []
, False # unused
, BBoxDeltaFun = None
, bPreserveWidth=True
)
else:
nt = ntClass("mi_clstr" #some short prefix because labels below are prefixed with it
, [] # in conjugate, we accept all labels, andNone becomes "none"
, []
, False # unused
, BBoxDeltaFun =lambda v: max(v * 0.066, min(5, v/3)) #we reduce overlap in this way
)
nt.setLabelAttribute("id")

## HD added 23/01/2020: needed for output generation
DU_GRAPH.clusterType='paragraph'
nt.setXpathExpr(( ".//pc:TextLine"
, "./pc:TextEquiv") #how to get their text
)
DU_GRAPH.addNodeType(nt)

return DU_GRAPH


if __name__ == "__main__":
traceln("VERSION: %s" % DU_Task_Factory.getVersion())

# standard command line options for CRF- ECN- GAT-based methods
usage, parser = DU_Task_Factory.getStandardOptionsParser(sys.argv[0])
# parser.add_option("--spm" , dest='sSPModel' , action="store", type="string"
# , help="Textual features are computed based on the given SentencePiece model. e.g. model/toto.model.")
parser.add_option("--unigram" , dest='unigram' , action="store"
,type="int", default = 0
, help="Textual features as unigram: Max uni")
parser.add_option("--pxmlfeatures" , dest='pxmlfeatures' , action="store_true"
, default=False
, help="Use pageXml features (page h,w)")
parser.add_option("--separator" , dest='bSeparator' , action="store_true"
, default=False
, help="Use separators")
parser.add_option("--BB2", dest='bBB2' , action="store_true"
, help="New style BB (same width as baseline, no resize)")
(options, args) = parser.parse_args()


if options.bSeparator:
cFeatureDefinition = Features_June19_Full_Separator
dFeatureConfig = { }
elif options.pxmlfeatures:
cFeatureDefinition = Features_June19_Full
dFeatureConfig = { }
else:
cFeatureDefinition = FeatureDefinition_Generic
dFeatureConfig = { }

try:
sModelDir, sModelName = args
except Exception as e:
traceln("Specify a model folder and a model name!")
DU_Task_Factory.exit(usage, 1, e)

doer = DU_Task_Factory.getDoer(sModelDir, sModelName
, options = options
, fun_getConfiguredGraphClass= getConfiguredGraphClass
, cFeatureDefinition = cFeatureDefinition
, dFeatureConfig = dFeatureConfig
)

# setting the learner configuration, in a standard way
# (from command line options, or from a JSON configuration file)
dLearnerConfig = doer.getStandardLearnerConfig(options)


# # force a balanced weighting
# print("Forcing balanced weights")
# dLearnerConfig['balanced'] = True

# of course, you can put yours here instead.
doer.setLearnerConfiguration(dLearnerConfig)

# act as per specified in the command line (--trn , --fold-run, ...)
doer.standardDo(options)

del doer

98 changes: 98 additions & 0 deletions usecases/NewsEye/convertToLineSeparator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""
Newseye
convert
<TextRegion id="r_11_separator" custom="readingOrder {index:17;} structure {type:separator;}">
<Coords points="126,574 1282,574 1282,592 126,592"/>
</TextRegion>
into
<Separator id="r_11_separator" >
<Coords points="126,574 1282,574 1282,592 126,592"/>
</Separator>
"""

from glob import glob
from optparse import OptionParser
import sys,os
from lxml import etree

from xml_formats.PageXml import PageXml, PageXmlException
from util.Polygon import Polygon


def convertTR2Sep(filename):
"""
"""
print (filename)
tagname='TextRegion'
xml = etree.parse(filename)
ltextsep = xml.getroot().findall(f".//pc:{tagname}[@custom]", {"pc":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"})

for x in ltextsep:
if "separator" in x.get('custom'):
x.tag = 'SeparatorRegion'

#now we need to convert that object to a line
lXY = PageXml.getPointList(x) #the polygon
assert lXY, "Separator without Coord??"

plg = Polygon(lXY)
try:
x1,y1, x2,y2 = plg.fitRectangle()
except ValueError:
print("Warning: Coords might be bad, taking bounding box: ", lXY)
x1,y1,x2,y2 = plg.getBoundingBox()
# try:
# x1,y1, x2,y2 = plg.fitRectangle()
# except ZeroDivisionError:
# x1,y1,x2,y2 = plg.getBoundingBox()
# except ValueError:
# x1,y1,x2,y2 = plg.getBoundingBox()
if abs(x2-x1) > abs(y2-y1): # horizontal
y1 = (y1+y2)/2
y2 = y1
else:
x1 = (x1+x2)/2
x2=x1

ndCoord = x.xpath(".//pc:Coords", namespaces={"pc":PageXml.NS_PAGE_XML})[0]
PageXml.setPoints(ndCoord, [(x1,y1), (x2,y2)])

return xml


def convertFiles(lfilename,outdir):

for filename in lfilename:

doc = convertTR2Sep(filename)
newfilename = outdir + os.path.sep + f'a_{os.path.basename(filename)}'
with open(f'{newfilename}', 'wb') as f:
doc.write(f, encoding="utf-8", xml_declaration=True, pretty_print=True)
if __name__ == '__main__':

version = "v.01"
sUsage="""
Usage: %s <InputDir> <OuputDir>
""" % (sys.argv[0])

parser = OptionParser(usage=sUsage)
parser.add_option("--ext", dest='extension', type='string',action="store"
, help="file extension")
(options, args) = parser.parse_args()


try:
folderIn = args[0]
folderOut = args[1]
except ValueError:
sys.stderr.write(sUsage)
sys.exit(1)


lsFile = sorted([s for s in glob(os.path.join(folderIn, '*')) if s.endswith(options.extension)])
convertFiles(lsFile,folderOut)

print (f'done for {len(lsFile)} files.')

0 comments on commit 6c42b14

Please sign in to comment.