diff --git a/usecases/NewsEye/SgmParagraph.py b/usecases/NewsEye/SgmParagraph.py
new file mode 100644
index 0000000..2b991fa
--- /dev/null
+++ b/usecases/NewsEye/SgmParagraph.py
@@ -0,0 +1,169 @@
+# -*- coding: utf-8 -*-
+"""
+    DU task for segmenting text lines into paragraphs using the conjugate graph
+
+    Copyright NAVER LABS Europe(C) 2019  Jean-Luc Meunier
+
+    Developed for the EU project READ. The READ project has received funding
+    from the European Union's Horizon 2020 research and innovation programme
+    under grant agreement No 674943.
+"""
+import sys, os
+from graph.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+from common.trace import traceln
+from graph.NodeType_PageXml import defaultBBoxDeltaFun
+from tasks.DU_Task_Factory import DU_Task_Factory
+from graph.pkg_GraphBinaryConjugateSegmenter.MultiSinglePageXml import MultiSinglePageXml as ConjugateSegmenterGraph_MultiSinglePageXml
+from graph.pkg_GraphBinaryConjugateSegmenter.MultiSinglePageXml_Separator \
+ import MultiSinglePageXml_Separator \
+ as ConjugateSegmenterGraph_MultiSinglePageXml_Separator
+#from graph.pkg_ReifiedEdge.MultiSinglePageXml_Segmenter_Separator_DOM import Graph_MultiSinglePageXml_Segmenter_Separator_DOM
+from graph.NodeType_PageXml import NodeType_PageXml_type
+from graph.NodeType_jsonOCR import NodeType_jsonOCR
+from graph.FeatureDefinition_Generic_noText import FeatureDefinition_Generic_noText
+from graph.FeatureDefinition_Generic import FeatureDefinition_Generic
+from tasks.DU_Task_Features import Features_June19_Full, Features_June19_Full_Separator
+# ----------------------------------------------------------------------------
+class My_ConjugateNodeType(NodeType_PageXml_type):
+ """
+ We need this to extract properly the label from the label attribute of the (parent) TableCell element.
+ """
+ def __init__(self, sNodeTypeName, lsLabel, lsIgnoredLabel=None, bOther=True, BBoxDeltaFun=defaultBBoxDeltaFun
+ , bPreserveWidth=False):
+ super(My_ConjugateNodeType, self).__init__(sNodeTypeName, lsLabel, lsIgnoredLabel, bOther, BBoxDeltaFun
+ , bPreserveWidth=bPreserveWidth)
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
+ """
+ Parse and set the graph node label and return its class index
+ raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
+ """
+ #sLabel = domnode.getparent().get(self.sLabelAttr)
+ domnode = graph_node.node.getparent()
+ sLabel = domnode.get(self.sLabelAttr)
+ return sLabel if not sLabel is None else "__none__"
+ def setDocNodeLabel(self, graph_node, sLabel):
+ raise Exception("This should not occur in conjugate mode")
+def getConfiguredGraphClass(doer):
+ """
+ In this class method, we must return a configured graph class
+ """
+# if options.bReified:
+# DU_GRAPH = Graph_MultiSinglePageXml_Segmenter_Separator_DOM
+ if options.bSeparator:
+ DU_GRAPH = ConjugateSegmenterGraph_MultiSinglePageXml_Separator
+ else:
+ DU_GRAPH = ConjugateSegmenterGraph_MultiSinglePageXml
+ ntClass = My_ConjugateNodeType
+ if options.bBB2:
+ nt = ntClass("mi_clstr" #some short prefix because labels below are prefixed with it
+ , [] # in conjugate, we accept all labels, andNone becomes "none"
+ , []
+ , False # unused
+ , BBoxDeltaFun = None
+ , bPreserveWidth=True
+ )
+ else:
+ nt = ntClass("mi_clstr" #some short prefix because labels below are prefixed with it
+ , [] # in conjugate, we accept all labels, andNone becomes "none"
+ , []
+ , False # unused
+ , BBoxDeltaFun =lambda v: max(v * 0.066, min(5, v/3)) #we reduce overlap in this way
+ )
+ nt.setLabelAttribute("id")
+ ## HD added 23/01/2020: needed for output generation
+ DU_GRAPH.clusterType='paragraph'
+ nt.setXpathExpr(( ".//pc:TextLine"
+ , "./pc:TextEquiv") #how to get their text
+ )
+ DU_GRAPH.addNodeType(nt)
+ return DU_GRAPH
+if __name__ == "__main__":
+ traceln("VERSION: %s" % DU_Task_Factory.getVersion())
+ # standard command line options for CRF- ECN- GAT-based methods
+ usage, parser = DU_Task_Factory.getStandardOptionsParser(sys.argv[0])
+# parser.add_option("--spm" , dest='sSPModel' , action="store", type="string"
+# , help="Textual features are computed based on the given SentencePiece model. e.g. model/toto.model.")
+ parser.add_option("--unigram" , dest='unigram' , action="store"
+ ,type="int", default = 0
+ , help="Textual features as unigram: Max uni")
+ parser.add_option("--pxmlfeatures" , dest='pxmlfeatures' , action="store_true"
+ , default=False
+ , help="Use pageXml features (page h,w)")
+ parser.add_option("--separator" , dest='bSeparator' , action="store_true"
+ , default=False
+ , help="Use separators")
+ parser.add_option("--BB2", dest='bBB2' , action="store_true"
+ , help="New style BB (same width as baseline, no resize)")
+ (options, args) = parser.parse_args()
+ if options.bSeparator:
+ cFeatureDefinition = Features_June19_Full_Separator
+ dFeatureConfig = { }
+ elif options.pxmlfeatures:
+ cFeatureDefinition = Features_June19_Full
+ dFeatureConfig = { }
+ else:
+ cFeatureDefinition = FeatureDefinition_Generic
+ dFeatureConfig = { }
+ try:
+ sModelDir, sModelName = args
+ except Exception as e:
+ traceln("Specify a model folder and a model name!")
+ DU_Task_Factory.exit(usage, 1, e)
+ doer = DU_Task_Factory.getDoer(sModelDir, sModelName
+ , options = options
+ , fun_getConfiguredGraphClass= getConfiguredGraphClass
+ , cFeatureDefinition = cFeatureDefinition
+ , dFeatureConfig = dFeatureConfig
+ )
+ # setting the learner configuration, in a standard way
+ # (from command line options, or from a JSON configuration file)
+ dLearnerConfig = doer.getStandardLearnerConfig(options)
+# # force a balanced weighting
+# print("Forcing balanced weights")
+# dLearnerConfig['balanced'] = True
+ # of course, you can put yours here instead.
+ doer.setLearnerConfiguration(dLearnerConfig)
+ # act as per specified in the command line (--trn , --fold-run, ...)
+ doer.standardDo(options)
+ del doer
diff --git a/usecases/NewsEye/convertToLineSeparator.py b/usecases/NewsEye/convertToLineSeparator.py
new file mode 100644
index 0000000..31d11c4
--- /dev/null
+++ b/usecases/NewsEye/convertToLineSeparator.py
@@ -0,0 +1,98 @@
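+"""
+    NewsEye
+
+    Convert the TextRegion elements marked as "separator" in their custom
+    attribute into SeparatorRegion elements whose Coords describe a line
+"""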
+from glob import glob
+from optparse import OptionParser
+import sys,os
+from lxml import etree
+from xml_formats.PageXml import PageXml, PageXmlException
+from util.Polygon import Polygon
+def convertTR2Sep(filename):
+ """
+ """
+ print (filename)
+ tagname='TextRegion'
+ xml = etree.parse(filename)
+ ltextsep = xml.getroot().findall(f".//pc:{tagname}[@custom]", {"pc":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"})
+ for x in ltextsep:
+ if "separator" in x.get('custom'):
+ x.tag = 'SeparatorRegion'
+ #now we need to convert that object to a line
+ lXY = PageXml.getPointList(x) #the polygon
+ assert lXY, "Separator without Coord??"
+ plg = Polygon(lXY)
+ try:
+ x1,y1, x2,y2 = plg.fitRectangle()
+ except ValueError:
+ print("Warning: Coords might be bad, taking bounding box: ", lXY)
+ x1,y1,x2,y2 = plg.getBoundingBox()
+# try:
+# x1,y1, x2,y2 = plg.fitRectangle()
+# except ZeroDivisionError:
+# x1,y1,x2,y2 = plg.getBoundingBox()
+# except ValueError:
+# x1,y1,x2,y2 = plg.getBoundingBox()
+ if abs(x2-x1) > abs(y2-y1): # horizontal
+ y1 = (y1+y2)/2
+ y2 = y1
+ else:
+ x1 = (x1+x2)/2
+ x2=x1
+ ndCoord = x.xpath(".//pc:Coords", namespaces={"pc":PageXml.NS_PAGE_XML})[0]
+ PageXml.setPoints(ndCoord, [(x1,y1), (x2,y2)])
+ return xml
+def convertFiles(lfilename,outdir):
+ for filename in lfilename:
+ doc = convertTR2Sep(filename)
+ newfilename = outdir + os.path.sep + f'a_{os.path.basename(filename)}'
+ with open(f'{newfilename}', 'wb') as f:
+ doc.write(f, encoding="utf-8", xml_declaration=True, pretty_print=True)
+if __name__ == '__main__':
+ version = "v.01"
+ sUsage="""
+ Usage: %s
+ """ % (sys.argv[0])
+ parser = OptionParser(usage=sUsage)
+ parser.add_option("--ext", dest='extension', type='string',action="store"
+ , help="file extension")
+ (options, args) = parser.parse_args()
+ try:
+ folderIn = args[0]
+ folderOut = args[1]
+ except ValueError:
+ sys.stderr.write(sUsage)
+ sys.exit(1)
+ lsFile = sorted([s for s in glob(os.path.join(folderIn, '*')) if s.endswith(options.extension)])
+ convertFiles(lsFile,folderOut)
+ print (f'done for {len(lsFile)} files.')
\ No newline at end of file