5400 17442
+
+ wait
+ 8 python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
+ #convert to ds
+ 9 python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442.mpxml -o trnskrbs_5400/xml/17442.ds_xml --docid=17442
+ 10 python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml --doie --usetemplate
+
+ """
+
+ #create Transkribus client
+ self.myTrKCient = TranskribusClient(sServerUrl=self.server,proxies={},loggingLevel=logging.WARN)
+ #login
+ _ = self.login(self.myTrKCient,trace=trace, traceln=traceln)
+
+# self.downloadCollection(colid,coldir,docid,bNoImg=False,bForce=True)
+
+ ## load dom
+ if dom is None:
+ self.inputFileName = os.path.abspath(os.path.join(coldir,TableProcessing.sCOL,docid+TableProcessing.sMPXMLExtension))
+ mpxml_doc = self.loadDom()
+ nbPages = MultiPageXml.getNBPages(mpxml_doc)
+ else:
+ # load provided mpxml
+ mpxml_doc = dom
+ nbPages = MultiPageXml.getNBPages(mpxml_doc)
+
+# ### table registration: need to compute/select??? the template
+# # perform LA separator, table registration, baseline with normalization
+# #python ../../src/tasks/performCVLLA.py --coldir=trnskrbs_5400/ --docid=17442 -i trnskrbs_5400/col/17442.mpxml --bl --regTL --form
+# tableregtool= LAProcessor()
+# # latool.setParams(dParams)
+# tableregtool.coldir = coldir
+# tableregtool.docid = docid
+# tableregtool.bTemplate, tableregtool.bSeparator , tableregtool.bBaseLine , tableregtool.bRegularTextLine = True,False,False,False
+# # creates xml and a new mpxml
+# mpxml_doc,nbPages = tableregtool.performLA(mpxml_doc)
+#
+#
+
+# self.upLoadDocument(colid, coldir,docid,sNote='NLE workflow;table reg done')
+
+ lJobIDs = self.applyLA_URO(colid, docid, nbPages)
+ return
+
+ bWait=True
+ assert lJobIDs != []
+ jobid=lJobIDs[-1]
+ traceln("waiting for job %s"%jobid)
+ while bWait:
+ dInfo = self.myTrKCient.getJobStatus(jobid)
+ bWait = dInfo['state'] not in [ 'FINISHED', 'FAILED' ]
+
+
+ ## coldir???
+ self.downloadCollection(colid,coldir,docid,bNoImg=True,bForce=True)
+
+ ##STOP HERE FOR DAS next testset:
+ return
+
+ # tag text for BIES cell
+ #python ../../src/tasks/DU_ABPTable_T.py modelMultiType tableRow2 --run=trnskrbs_5400
+ """
+ needed : doer = DU_ABPTable_TypedCRF(sModelName, sModelDir,
+ """
+ doer = DU_ABPTable_TypedCRF(self.sRowModelName, self.sRowModelDir)
+ doer.load()
+ ## needed predict at file level, and do not store dom, but return it
+ rowpath=os.path.join(coldir,"col")
+ BIESFiles = doer.predict([rowpath],docid)
+ BIESDom = self.loadDom(BIESFiles[0])
+# res= BIESDom.saveFormatFileEnc('test.mpxml', "UTF-8",True)
+
+ # MPXML2DS
+ #python ../../src/xml_formats/Page2DS.py --pattern=trnskrbs_5400/col/17442_du.mpxml -o trnskrbs_5400/xml/17442.ds_xml --docid=17442
+ dsconv = primaAnalysis()
+ DSBIESdoc = dsconv.convert2DS(BIESDom,self.docid)
+
+ # create XMLDOC object
+ self.ODoc = XMLDSDocument()
+ self.ODoc.loadFromDom(DSBIESdoc) #,listPages = range(self.firstPage,self.lastPage+1))
+ # create row
+ #python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml
+ rdc = RowDetection()
+ rdc.findRowsInDoc(self.ODoc)
+
+
+ #python ../../src/xml_formats/DS2PageXml.py -i trnskrbs_5400/out/17442.ds_xml --multi
+ # DS2MPXML
+ DS2MPXML = DS2PageXMLConvertor()
+ lPageXml = DS2MPXML.run(self.ODoc.getDom())
+ if lPageXml != []:
+# if DS2MPXML.bMultiPages:
+ newDoc = MultiPageXml.makeMultiPageXmlMemory(map(lambda xy:xy[0],lPageXml))
+ outputFileName = os.path.join(self.coldir, sCOL, self.docid+TableProcessing.sMPXMLExtension)
+ newDoc.write(outputFileName, xml_declaration=True,encoding="UTF-8",pretty_print=True)
+# else:
+# DS2MPXML.storePageXmlSetofFiles(lPageXml)
+
+ return
+
+ #upload
+ # python ../../../TranskribusPyClient/src/TranskribusCommands/TranskribusDU_transcriptUploader.py --nodu trnskrbs_5400 5400 17442
+ self.upLoadDocument(colid, coldir,docid,sNote='NLE workflow;table row done')
+
+
+ ## apply HTR
+ ## how to deal with specific dictionaries?
+
+ ## here need to know the ontology and the template
+
+ nbPages=1
+ jobid = self.applyHTR(colid,docid, nbPages,self.sHTRmodel,self.sDictName)
+ bWait=True
+ traceln("waiting for job %s"%jobid)
+ while bWait:
+ dInfo = self.myTrKCient.getJobStatus(jobid)
+ bWait = dInfo['state'] not in [ 'FINISHED', 'FAILED' ,'CANCELED']
+
+
+ # download where???
+ # python ../../../TranskribusPyClient/src/TranskribusCommands/Transkribus_downloader.py 5400 --force
+ # coldir is not right!! coldir must refer to the parent folder!
+ self.downloadCollection(colid,coldir,docid,bNoImg=True,bForce=True)
+
+ #done!!
+
+ # IE extr
+ ## not here: specific to a use case
+ #python src/IE_test.py -i trnskrbs_5400/xml/17442.ds_xml -o trnskrbs_5400/out/17442.ds_xml --doie --usetemplate
+
+
+ def processCollection(self,coldir):
+ """
+ process all files in a collection
+ need mpxml files
+ """
+ lsDocFilename = sorted(glob.iglob(os.path.join(coldir, "*"+TableProcessing.sMPXMLExtension)))
+ lDocId = []
+ for sDocFilename in lsDocFilename:
+ sDocId = os.path.basename(sDocFilename)[:-len(TableProcessing.sMPXMLExtension)]
+ try:
+ docid = int(sDocId)
+ lDocId.append(docid)
+ except ValueError:
+ traceln("Warning: folder %s : %s invalid docid, IGNORING IT"%(self.coldir, sDocId))
+ continue
+
+ # process each document
+ for docid in lDocId:
+ traceln("Processing %s : %s "%(self.coldir, sDocId))
+ self.processDocument(self.colid, docid)
+ traceln("\tProcessing done for %s "%(self.coldir, sDocId))
+
+
+ def processParameters(self):
+ """
+ what to do with the parameters provided by the command line
+ """
+ if self.colid is None:
+ print('collection id missing!')
+ sys.exit(1)
+
+ self.bFullCol = self.docid != None
+
+ if self.bRegenerateMPXML and self.docid is not None:
+ l = glob.glob(os.path.join(self.coldir,sCOL,self.docid, "*.pxml"))
+ doc = MultiPageXml.makeMultiPageXml(l)
+ outputFileName = os.path.join(self.coldir, sCOL, self.docid+TableProcessing.sMPXMLExtension)
+ doc.write(outputFileName, xml_declaration=True,encoding="UTF-8",pretty_print=True)
+ return doc
+ return None
+
+ def run(self):
+ """
+ process at collection level or document level
+ """
+ newMPXML = self.processParameters()
+ if self.bFullCol is None:
+ self.processCollection(self.colid)
+ else:
+ self.processDocument(self.coldir,self.colid, self.docid,newMPXML)
+
+if __name__ == "__main__":
+
+
+ ## parser for cloud connection
+ parser = OptionParser()
+
+
+ tableprocessing = TableProcessing()
+ tableprocessing.createCommandLineParser()
+
+ tableprocessing.parser.add_option("-s", "--server" , dest='server', action="store", type="string", default="https://transkribus.eu/TrpServer", help="Transkribus server URL")
+
+ tableprocessing.parser.add_option("-l", "--login" , dest='login' , action="store", type="string", help="Transkribus login (consider storing your credentials in 'transkribus_credentials.py')")
+ tableprocessing.parser.add_option("-p", "--pwd" , dest='pwd' , action="store", type="string", help="Transkribus password")
+
+ tableprocessing.parser.add_option("--persist" , dest='persist', action="store_true", help="Try using an existing persistent session, or log-in and persists the session.")
+
+ tableprocessing.parser.add_option("--https_proxy" , dest='https_proxy' , action="store", type="string", help="proxy, e.g. http://cornillon:8000")
+
+ tableprocessing.parser.add_option("--pxml", dest="regMPXML", action="store_true", help="recreate MPXML frol PXML")
+
+ tableprocessing.parser.add_option("--coldir", dest="coldir", action="store", type="string", help="collection folder")
+ tableprocessing.parser.add_option("--colid", dest="colid", action="store", type="string", help="collection id")
+
+ tableprocessing.parser.add_option("--docid", dest="docid", action="store", type="string", help="document id")
+ tableprocessing.parser.add_option("--useExt", dest="useExt", action="store", type="string", help="generate mpxml using page file .ext")
+
+ ## ROW
+ tableprocessing.parser.add_option("--rowmodel", dest="rowmodelname", action="store", type="string", help="row model name")
+ tableprocessing.parser.add_option("--rowmodeldir", dest="rowmodeldir", action="store", type="string", help="row model directory")
+ ## HTR
+ tableprocessing.parser.add_option("--htrid", dest="htrmodel", action="store", type="string", help="HTR mode")
+ tableprocessing.parser.add_option("--dictname", dest="dictname", action="store", type="string", help="dictionary for HTR")
+
+# tableprocessing.add_option('-f',"--first", dest="first", action="store", type="int", help="first page to be processed")
+# tableprocessing.add_option('-l',"--last", dest="last", action="store", type="int", help="last page to be processed")
+
+ #parse the command line
+ dParams, args = tableprocessing.parseCommandLine()
+ #Now we are back to the normal programmatic mode, we set the component parameters
+ tableprocessing.setParams(dParams)
+
+ tableprocessing.run()
+
+
\ No newline at end of file
diff --git a/TranskribusDU/tasks/DU_Task.py b/TranskribusDU/tasks/DU_Task.py
index 2f60c9e..285182c 100644
--- a/TranskribusDU/tasks/DU_Task.py
+++ b/TranskribusDU/tasks/DU_Task.py
@@ -5,18 +5,7 @@
Copyright Xerox(C) 2016, 2017 JL. Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -27,7 +16,9 @@
import sys, os, glob, datetime
import json
from importlib import import_module
-import random
+from io import StringIO
+import traceback
+import lxml.etree as etree
import numpy as np
@@ -40,15 +31,18 @@
except ImportError:
sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
import TranskribusDU_version
+TranskribusDU_version
from common.trace import trace, traceln
-from common.chrono import chronoOn, chronoOff
+from common.chrono import chronoOn, chronoOff, pretty_time_delta
from common.TestReport import TestReportConfusion
from xml_formats.PageXml import MultiPageXml
-from graph.GraphModel import GraphModel, GraphModelException
+from graph.GraphModel import GraphModel, GraphModelException, GraphModelNoEdgeException
+from graph.Graph_JsonOCR import Graph_JsonOCR
+from graph.Graph_DOM import Graph_DOM
import graph.FeatureDefinition
-from tasks import _checkFindColDir, _exit
+from tasks import _checkFindColDir
class DU_Task:
@@ -89,7 +83,7 @@ class DU_Task:
cFeatureDefinition = None # FeatureDefinition_PageXml_StandardOnes #I keep this for backward compa
- sMetadata_Creator = "NLE Document Understanding"
+ sMetadata_Creator = "NLE Document Understanding: DU_Task"
sMetadata_Comments = ""
#dGridSearch_LR_conf = {'C':[0.1, 0.5, 1.0, 2.0] } #Grid search parameters for LR baseline method training
@@ -101,6 +95,8 @@ class DU_Task:
iNbNodeType = 1 # as of today, only CRF can do multitype
+ bConjugate = False
+
def configureGraphClass(self, configuredClass=None):
"""
class method to set the graph class ONCE (subsequent calls are ignored)
@@ -114,9 +110,11 @@ class method to set the graph class ONCE (subsequent calls are ignored)
assert configuredClass is not None, "getConfiguredGraphClass returned None"
self.cGraphClass = configuredClass
+ self.bConjugate = configuredClass.bConjugate
assert self.cGraphClass is not None
traceln("SETUP: Graph class is %s (graph mode %d)" % (self.cGraphClass, self.cGraphClass.getGraphMode()))
+ traceln("SETUP: Input format is '%s'" % (self.cGraphClass.getDocInputFormat()))
return self.cGraphClass
@@ -144,10 +142,6 @@ def __init__(self, sModelName, sModelDir
self._mdl = None
- # for the conjugate mode
- self.bConjugate = False
- self.nbEdgeClass = None
-
self._lBaselineModel = []
self.bVerbose = True
@@ -173,10 +167,16 @@ def getVersion(cls):
def standardDo(self, options):
"""
- do whatever is reuested by an option from the parsed command line
+ do whatever is requested by an option from the parsed command line
return None
"""
+ if bool(options.iServer):
+ self.load()
+ # run in server mode!
+ self.serve_forever(options.iServer, options.bServerDebug, options=options)
+ return
+
if options.rm:
self.rm()
return
@@ -252,7 +252,7 @@ def standardDo(self, options):
# lsOutputFilename = self.runForExternalMLMethod(lRun, options.storeX, options.applyY, options.bRevertEdges)
# else:
self.load()
- lsOutputFilename = self.predict(lRun, bGraph=options.bGraph)
+ lsOutputFilename = self.predict(lRun, bGraph=options.bGraph,bOutXML=options.bOutXML)
traceln("Done, see in:\n %s"%lsOutputFilename)
else:
@@ -269,7 +269,113 @@ def __del__(self):
del self.cFeatureDefinition
del self.cModelClass
+ #--- SERVER MODE ---------------------------------------------------------
+ def serve_forever(self, iPort, bDebug=False, options={}):
+ self.sTime_start = datetime.datetime.now().isoformat()
+ self.sTime_load = self.sTime_start
+
+ import socket
+ sURI = "http://%s:%d" % (socket.gethostbyaddr(socket.gethostname())[0], iPort)
+ sDescr = """
+- home page for humans: %s
+- POST or GET on %s/predict with argument xml=...
+""" % ( sURI, sURI)
+ traceln("SERVER MODE")
+ traceln(sDescr)
+
+ from flask import Flask
+ from flask import request, abort
+ from flask import render_template_string #, render_template
+ from flask import redirect, url_for #, send_from_directory, send_file
+
+
+ # Create Flask app load app.config
+ app = Flask(self.__class__.__name__)
+
+ @app.route('/')
+ def home_page():
+ # String-based templates
+ return render_template_string("""
+DU_Task server
+
+
+reload the model
+
Provide some {{ input_format }} data and get PageXml output:
+
+
+This server runs with those options: {{ sOptions }}
+"""
+ , model_type=self.__class__.__name__
+ , model_spec=os.path.abspath(self.getModel().getModelFilename())
+ , input_format=self.getGraphClass().getDocInputFormat()
+ , start_time=self.sTime_start
+ , load_time=self.sTime_load
+ , sOptions=str(options))
+ traceln("SERVER ENDING. BYE")
+
+ @app.route('/predict', methods = ['POST'])
+ def predict():
+ try:
+ sData = request.form['data']
+ if sData.startswith("")+2:]
+
+ doc, lg = self._predict_file(self.getGraphClass(), [], StringIO(sData), bGraph=options.bGraph)
+
+ # if nothing to do, the method returns None...
+ if doc is None:
+ # doc = etree.parse(StringIO(sXml))
+ return sData
+ else:
+ if not(isinstance(doc, etree._ElementTree)):
+ traceln(" converting to PageXml...")
+ doc = Graph_DOM.exportToDom(lg)
+ return etree.tostring(doc.getroot(), encoding='UTF-8', xml_declaration=False)
+
+ except Exception as e:
+ traceln("----- predict exception -------------------------")
+ traceln(traceback.format_exc())
+ traceln("--------------------------------------------------")
+ abort(418, repr(e))
+
+ @app.route('/reload')
+ def reload():
+ """
+ Force to reload the model
+ """
+ self.load(bForce=True)
+ self.sTime_load = datetime.datetime.now().isoformat()
+ return redirect(url_for('home_page'))
+
+ # RUN THE SERVER !!
+ # CAUTION: TensorFlow incompatible with debug=True (double load => GPU issue)
+ app.run(host='0.0.0.0', port=iPort, debug=bDebug)
+
+ @app.route('/stop')
+ def stop():
+ """
+ Force to exit
+ """
+ traceln("Exiting")
+ sys.exit(0)
+ # RUN THE SERVER !!
+ # CAUTION: TensorFlow incompatible with debug=True (double load => GPU issue)
+ app.run(host='0.0.0.0', port=iPort, debug=bDebug)
+
+ return
+
+
#--- CONFIGURATION setters --------------------------------------------------------------------
def getGraphClass(self):
return self.cGraphClass
@@ -318,40 +424,15 @@ def getNbClass(self): #OK
"""
return self.nbClass
- def setConjugateMode(self
- , lEdgeLabel = None # list of labels (list of strings, or of int)
- , funEdgeLabel_get = None # to compute the edge labels
- , funEdgeLabel_set = None # to use the predicted edge labels
- ):
+ def setXmlFilenamePattern(self, sExt):
"""
- to learn and predict on the conjugate graph instead of the usual graph.
- 1 - The usual graph is created as always
- 2 - the function is called on each edge to compute the edge label
- 3 - the conjugate is created and used for learning or predicting
- 4 - the function is called on each edge to exploit the edge predicted label
-
-
- The prototype of the functions are:
- funEdgeLabel_get(primal_graph, primal_X, primal_Y)
- -> dual_Y
- funEdgeLabel_set(primal_graph, nd_node, edge_matrix, dual_Y)
- -> None
-
- the dual_Y has as many rows as the primal Edge array, and in the same order
- this order also corresponds to the lEdge attribute of the graph object
-
- In case the graph has some pre-established settings, you can omit the parameters.
- """
- self.bConjugate = True
- self.cModelClass.setConjugateMode()
- self.cGraphClass.setConjugateMode(lEdgeLabel
- , funEdgeLabel_get
- , funEdgeLabel_set)
- self.nbEdgeLabel = len(self.cGraphClass.getEdgeLabelNameList())
-
- return self.bConjugate
+ Set the expected file extension of the input data
+ """
+ assert sExt, "Empty extension not allowed"
+ if not sExt.startswith("."): sExt = "." + sExt
+ self.sXmlFilenamePattern = "*" + sExt
+
-
#----------------------------------------------------------------------------------------------------------
def setBaselineList(self, lMdl):
"""
@@ -432,7 +513,7 @@ def train_save_test(self, lsTrnColDir, lsTstColDir, lsVldColDir, bWarm=False, bP
return a test report object
"""
self.traceln("-"*50)
- self.traceln("Model files of '%s' in folder '%s'"%(self.sModelName, self.sModelDir))
+ self.traceln("Model files of '%s' in folder '%s'"%(self.sModelName, os.path.abspath(self.sModelDir)))
self.traceln("Training with collection(s):", lsTrnColDir)
self.traceln("Testing with collection(s):", lsTstColDir)
if lsVldColDir: self.traceln("Validating with collection(s):", lsVldColDir)
@@ -473,33 +554,23 @@ def test(self, lsTstColDir):
if lPageConstraint:
for dat in lPageConstraint: self.traceln("\t\t%s"%str(dat))
- if True:
- oReport = self._mdl.testFiles(lFilename_tst, lambda fn: DU_GraphClass.loadGraphs(self.cGraphClass, [fn], bDetach=True, bLabelled=True, iVerbose=1)
- , self.getBaselineList() != [])
- else:
- self.traceln("- loading test graphs")
- lGraph_tst = DU_GraphClass.loadGraphs(self.cGraphClass, lFilename_tst, bDetach=True, bLabelled=True, iVerbose=1)
- if self.bConjugate:
- for _g in lGraph_tst: _g.computeEdgeLabels()
-
- self.traceln(" %d graphs loaded"%len(lGraph_tst))
- oReport = self._mdl.test(lGraph_tst)
+ oReport = self._mdl.testFiles(lFilename_tst, lambda fn: DU_GraphClass.loadGraphs(self.cGraphClass, [fn], bDetach=True, bLabelled=True, iVerbose=1)
+ , self.getBaselineList() != [])
return oReport
- def predict(self, lsColDir, docid=None, bGraph=False):
+ def predict(self, lsColDir, docid=None, bGraph=False, bOutXML=True):
"""
Return the list of produced files
"""
- self.traceln("-"*50)
- self.traceln("Predicting for collection(s):", lsColDir)
- self.traceln("-"*50)
-
if not self._mdl: raise Exception("The model must be loaded beforehand!")
#list files
if docid is None:
+ self.traceln("-"*50)
+ self.traceln("Predicting for collection(s):", lsColDir, " (%s)" % self.sXmlFilenamePattern)
+ self.traceln("-"*50)
_ , lFilename = self.listMaxTimestampFile(lsColDir, self.sXmlFilenamePattern)
# predict for this file only
else:
@@ -517,123 +588,84 @@ def predict(self, lsColDir, docid=None, bGraph=False):
chronoOn("predict")
self.traceln("- loading collection as graphs, and processing each in turn. (%d files)"%len(lFilename))
- du_postfix = "_du"+MultiPageXml.sEXT
lsOutputFilename = []
for sFilename in lFilename:
- if sFilename.endswith(du_postfix): continue #:)
- chronoOn("predict_1")
- lg = DU_GraphClass.loadGraphs(self.cGraphClass, [sFilename], bDetach=False, bLabelled=False, iVerbose=1)
- #normally, we get one graph per file, but in case we load one graph per page, for instance, we have a list
- if lg:
- for i, g in enumerate(lg):
- doc = g.doc
- if lPageConstraint:
- self.traceln("\t- prediction with logical constraints: %s"%sFilename)
+ if DU_GraphClass.isOutputFilename(sFilename):
+ traceln(" - ignoring '%s' because of its extension" % sFilename)
+ continue
+
+ doc, lg = self._predict_file(DU_GraphClass, lPageConstraint, sFilename, bGraph=bGraph)
+
+ if doc is None:
+ self.traceln("\t- no prediction to do for: %s"%sFilename)
+ else:
+ sCreator = self.sMetadata_Creator + " " + self.getVersion()
+ sComment = self.sMetadata_Comments \
+ if bool(self.sMetadata_Comments) \
+ else "Model: %s %s (%s)" % (
+ self.sModelName
+ , self._mdl.__class__.__name__
+ , os.path.abspath(self.sModelDir))
+ # which output format
+ if bOutXML:
+ if DU_GraphClass == Graph_DOM:
+ traceln(" ignoring export-to-DOM (already DOM output)")
+ pass
else:
- self.traceln("\t- prediction : %s"%sFilename)
+ doc = Graph_DOM.exportToDom(lg)
+ sDUFilename = Graph_DOM.saveDoc(sFilename, doc, lg, sCreator, sComment)
+ traceln(" - exported as XML to ", sDUFilename)
+ else:
+ sDUFilename = DU_GraphClass.saveDoc(sFilename, doc, lg
+ , sCreator=sCreator
+ , sComment=sComment)
- if self.bConjugate:
- Y = self._mdl.predict(g, bProba=True)
- g.exploitEdgeLabels(Y)
- else:
- Y = self._mdl.predict(g)
- g.setDomLabels(Y)
- if bGraph: g.addEdgeToDOM(Y)
- del Y
-
- MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator, self.sMetadata_Comments)
- sDUFilename = sFilename[:-len(MultiPageXml.sEXT)] +du_postfix
- doc.write(sDUFilename,
- xml_declaration=True,
- encoding="utf-8",
- pretty_print=True
- #compression=0, #0 to 9
- )
del doc
del lg
-
- lsOutputFilename.append(sDUFilename)
- else:
- self.traceln("\t- no prediction to do for: %s"%sFilename)
-
- self.traceln("\t done [%.2fs]"%chronoOff("predict_1"))
+ lsOutputFilename.append(sDUFilename)
self.traceln(" done [%.2fs]"%chronoOff("predict"))
-
return lsOutputFilename
- def runForExternalMLMethod(self, lsColDir, storeX, applyY, bRevertEdges=False):
+ def _predict_file(self, DU_GraphClass, lPageConstraint, sFilename, bGraph=False):
"""
- HACK: to test new ML methods, not yet integrated in our SW: storeX=None, storeXY=None, applyY=None
- Return the list of produced files
+ Return the doc (a DOM?, a JSON?, another ?), the list of graphs
+ Note: the doc can be None is no graph
"""
+ chronoOn("predict_1")
+ doc = None
+ lg = DU_GraphClass.loadGraphs(self.cGraphClass, [sFilename], bDetach=False, bLabelled=False, iVerbose=1)
- self.traceln("-"*50)
- if storeX: traceln("Loading data and storing [X] (1 X per graph)")
- if applyY: traceln("Loading data, loading Y, labelling data, storing annotated data")
- self.traceln("-"*50)
+ #normally, we get one graph per file, but in case we load one graph per page, for instance, we have a list
+ for i, g in enumerate(lg):
+ if not g.lNode: continue # no node...
+ doc = g.doc
+ if lPageConstraint:
+ #self.traceln("\t- prediction with logical constraints: %s"%sFilename)
+ self.traceln("\t- page constraints IGNORED!!")
+ self.traceln("\t- prediction : %s"%sFilename)
- if storeX and applyY:
- raise ValueError("Either store X or applyY, not both")
+ self._predict_graph(g, lPageConstraint=lPageConstraint, bGraph=bGraph)
+ self.traceln("\t done [%.2fs]"%chronoOff("predict_1"))
+ return doc, lg
- if not self._mdl: raise Exception("The model must be loaded beforehand!")
-
- #list files
- _ , lFilename = self.listMaxTimestampFile(lsColDir, self.sXmlFilenamePattern)
-
- DU_GraphClass = self.getGraphClass()
-
- lPageConstraint = DU_GraphClass.getPageConstraint()
- if lPageConstraint:
- for dat in lPageConstraint: self.traceln("\t\t%s"%str(dat))
-
- if applyY:
- self.traceln("LOADING [Y] from %s"%applyY)
- lY = self._mdl.gzip_cPickle_load(applyY)
- if storeX: lX = []
-
- chronoOn("predict")
- self.traceln("- loading collection as graphs, and processing each in turn. (%d files)"%len(lFilename))
- du_postfix = "_du"+MultiPageXml.sEXT
- lsOutputFilename = []
- for sFilename in lFilename:
- if sFilename.endswith(du_postfix): continue #:)
- chronoOn("predict_1")
- lg = DU_GraphClass.loadGraphs(self.cGraphClass, [sFilename], bDetach=False, bLabelled=False, iVerbose=1)
- #normally, we get one graph per file, but in case we load one graph per page, for instance, we have a list
- if lg:
- for g in lg:
- if self.bConjugate: g.computeEdgeLabels()
- doc = g.doc
- if bRevertEdges: g.revertEdges() #revert the directions of the edges
- if lPageConstraint:
- self.traceln("\t- prediction with logical constraints: %s"%sFilename)
- else:
- self.traceln("\t- prediction : %s"%sFilename)
- if storeX:
- [X] = self._mdl.get_lX([g])
- lX.append(X)
- else:
- Y = lY.pop(0)
- g.setDomLabels(Y)
- del lg
-
- if applyY:
- MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator, self.sMetadata_Comments)
- sDUFilename = sFilename[:-len(MultiPageXml.sEXT)]+du_postfix
- doc.saveFormatFileEnc(sDUFilename, "utf-8", True) #True to indent the XML
- doc.freeDoc()
- lsOutputFilename.append(sDUFilename)
- else:
- self.traceln("\t- no prediction to do for: %s"%sFilename)
-
- self.traceln("\t done [%.2fs]"%chronoOff("predict_1"))
- self.traceln(" done [%.2fs]"%chronoOff("predict"))
-
- if storeX:
- self.traceln("STORING [X] in %s"%storeX)
- self._mdl.gzip_cPickle_dump(storeX, lX)
-
- return lsOutputFilename
+ def _predict_graph(self, g, lPageConstraint=None, bGraph=False):
+ """
+ predict for a graph
+ side effect on the graph g
+ return the graph
+ """
+ try:
+ Y = self._mdl.predict(g, bProba=g.bConjugate)
+ g.setDocLabels(Y)
+ if bGraph and not Y is None:
+ if g.bConjugate:
+ g.addEdgeToDoc(Y)
+ else:
+ g.addEdgeToDoc()
+ del Y
+ except GraphModelNoEdgeException:
+ traceln("*** ERROR *** cannot predict due to absence of edge in graph")
+ return g
def checkLabelCoverage(self, lY):
#check that all classes are represented in the dataset
@@ -882,7 +914,7 @@ def _train_save_test(self, sModelName, bWarm, lFilename_trn, ts_trn, lFilename_t
#for this check, we load the Y once...
if self.bConjugate:
- mdl.setNbClass(self.nbEdgeLabel)
+ mdl.setNbClass(len(self.cGraphClass.getEdgeLabelNameList()))
for _g in lGraph_trn: _g.computeEdgeLabels()
for _g in lGraph_vld: _g.computeEdgeLabels()
else:
@@ -921,7 +953,8 @@ def _train_save_test(self, sModelName, bWarm, lFilename_trn, ts_trn, lFilename_t
chronoOn("MdlTrn")
mdl.train(lGraph_trn, lGraph_vld, True, ts_trn, verbose=1 if self.bVerbose else 0)
mdl.save()
- self.traceln(" done [%.1fs]"%chronoOff("MdlTrn"))
+ tTrn = chronoOff("MdlTrn")
+ self.traceln(" training done [%.1f s] (%s)" % (tTrn, pretty_time_delta(tTrn)))
# OK!!
self._mdl = mdl
@@ -966,9 +999,7 @@ def listMaxTimestampFile(cls, lsDir, sPattern, bIgnoreDUFiles=True):
listMaxTimestampFile = classmethod(listMaxTimestampFile)
-# ------------------------------------------------------------------------------------------------------------------------------
-
-
+# -----------------------------------------------------------------------------------------------------------------------------
if __name__ == "__main__":
usage, parser = DU_Task.getStandardOptionsParser(sys.argv[0])
diff --git a/TranskribusDU/tasks/DU_Task_Factory.py b/TranskribusDU/tasks/DU_Task_Factory.py
index c34491e..02fbed7 100644
--- a/TranskribusDU/tasks/DU_Task_Factory.py
+++ b/TranskribusDU/tasks/DU_Task_Factory.py
@@ -5,18 +5,7 @@
Copyright NAVER(C) 2019 JL. Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -38,15 +27,16 @@
from common.trace import traceln
from graph.Graph import Graph
from tasks.DU_CRF_Task import DU_CRF_Task
-from tasks.DU_ECN_Task import DU_ECN_Task
+from tasks.DU_ECN_Task import DU_ECN_Task, DU_Ensemble_ECN_Task
from tasks.DU_GAT_Task import DU_GAT_Task
+
class DU_Task_Factory:
VERSION = "Factory_19"
version = None # dynamically computed
- l_CHILDREN_CLASS = [DU_CRF_Task, DU_ECN_Task, DU_GAT_Task]
+ l_CHILDREN_CLASS = [DU_CRF_Task, DU_ECN_Task, DU_Ensemble_ECN_Task, DU_GAT_Task]
# faster load for debug... l_CHILDREN_CLASS = [DU_CRF_Task]
@classmethod
@@ -55,6 +45,7 @@ def getStandardOptionsParser(cls, sys_argv0=None):
or for a cross-validation [--fold-init ] [--fold-run [-w]] [--fold-finish] [--fold ]+
[--pkl]
[--g1|--g2]
+[--server ]
For the named MODEL using the given FOLDER for storage:
--rm : remove all model data from the folder
@@ -77,6 +68,9 @@ def getStandardOptionsParser(cls, sys_argv0=None):
--graph : store the graph in the output XML
--g1 : default mode (historical): edges created only to closest overlapping block (downward and rightward)
--g2 : implements the line-of-sight edges (when in line of sight, then link by an edge)
+ --server port : run in server mode, offering a predict method
+ --server-debug: run the server in debug
+ --outxml port : output PageXML files, whatever th einput format is.
"""%sys_argv0
#prepare for the parsing of the command line
@@ -116,6 +110,14 @@ def getStandardOptionsParser(cls, sys_argv0=None):
, help="default mode (historical): edges created only to closest overlapping block (downward and rightward)")
parser.add_option("--g2", dest='bG2', action="store_true"
, help="implements the line-of-sight edges (when in line of sight, then link the nodes by an edge)")
+ parser.add_option("--ext", dest='sExt', action="store", type="string"
+ , help="Expected extension of the data files, e.g. '.pxml'")
+ parser.add_option("--server", dest='iServer', action="store", type="int"
+ , help="run in server mode, offering a predict method, for the given model")
+ parser.add_option("--server_debug", dest='bServerDebug', action="store_true"
+ , help="run the server in debug mode (incompatible with TensorFLow)")
+ parser.add_option("--outxml", dest='bOutXML', action="store_true"
+ , help="output XML files, whatever the input format is.")
# consolidate...
@@ -139,13 +141,10 @@ def getVersion(cls):
@classmethod
def getDoer(cls, sModelDir, sModelName
, options = None
- , bCRF = None
- , bECN = None
- , bGAT = None
, fun_getConfiguredGraphClass = None
, sComment = None
, cFeatureDefinition = None
- , dFeatureConfig = {}
+ , dFeatureConfig = {}
):
"""
Create the requested doer object
@@ -160,20 +159,25 @@ def getDoer(cls, sModelDir, sModelName
if options.bG2: iGraphMode = 2
Graph.setGraphMode(iGraphMode)
- bCRF = bCRF or (not(options is None) and options.bCRF)
- bECN = bECN or (not(options is None) and options.bECN)
- bGAT = bGAT or (not(options is None) and options.bGAT)
-
- assert (bCRF or bECN or bGAT) , "You must specify one learning method."
- assert [bCRF, bECN, bGAT].count(True) == 1 , "You must specify only one learning method."
+# bCRF = bCRF or (not(options is None) and options.bCRF)
+# bECN = bECN or (not(options is None) and options.bECN)
+# bECNEnsemble = bECNEnsemble or (not(options is None) and options.bECN)
+# bGAT = bGAT or (not(options is None) and options.bGAT)
+
+ assert (options.bCRF
+ or options.bECN or options.bECNEnsemble
+ or options.bGAT) , "You must specify one learning method."
+ assert [options.bCRF, options.bECN, options.bECNEnsemble, options.bGAT].count(True) == 1 , "You must specify only one learning method."
- if bECN:
+ if options.bECN:
c = DU_ECN_Task
- elif bCRF:
+ elif options.bECNEnsemble:
+ c = DU_Ensemble_ECN_Task
+ elif options.bCRF:
c = DU_CRF_Task
- elif bGAT:
+ elif options.bGAT:
c = DU_GAT_Task
-
+
c.getConfiguredGraphClass = fun_getConfiguredGraphClass
doer = c(sModelName, sModelDir
@@ -181,6 +185,9 @@ def getDoer(cls, sModelDir, sModelName
, cFeatureDefinition = cFeatureDefinition
, dFeatureConfig = dFeatureConfig)
+ if options.sExt:
+ doer.setXmlFilenamePattern(options.sExt)
+
if options.seed is None:
random.seed()
traceln("SETUP: Randomizer initialized automatically")
diff --git a/TranskribusDU/tasks/DU_Task_Features.py b/TranskribusDU/tasks/DU_Task_Features.py
index 1ad3ef0..9f65699 100644
--- a/TranskribusDU/tasks/DU_Task_Features.py
+++ b/TranskribusDU/tasks/DU_Task_Features.py
@@ -5,18 +5,7 @@
Copyright NAVER(C) 2019 JL. Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -24,6 +13,7 @@
under grant agreement No 674943.
"""
+from sklearn.preprocessing.data import QuantileTransformer
from graph.Edge import HorizontalEdge, VerticalEdge
@@ -39,6 +29,14 @@
from graph.FeatureDefinition_Standard import EdgeClassShifter
from graph.Transformer import Pipeline, FeatureUnion
+from graph.pkg_GraphBinaryConjugateSegmenter.PageXmlSeparatorRegion import Separator_boolean, Separator_num
+
+
+# EDGES
+# which types of edge can we get??
+# It depends on the type of graph!!
+lEdgeClass = [HorizontalEdge, VerticalEdge]
+
class Features_June19_Simple(FeatureDefinition):
"""
@@ -50,24 +48,29 @@ class Features_June19_Simple(FeatureDefinition):
n_QUANTILES = 16
bShiftEdgeByClass = False
-
+ bSeparator = False
+
def __init__(self):
FeatureDefinition.__init__(self)
# NODES
- node_transformer = FeatureUnion([ \
+ self.lNodeFeature = [
("geometry" , Node_Geometry()) # one can set nQuantile=...
- ])
+ ]
+ node_transformer = FeatureUnion(self.lNodeFeature)
# EDGES
- # which types of edge can we get??
- # It depends on the type of graph!!
- lEdgeClass = [HorizontalEdge, VerticalEdge]
# standard set of features, including a constant 1 for CRF
- edge_transformer = FeatureUnion([ \
+ self.lEdgeFeature = [
('1hot' , Edge_Type_1Hot(lEdgeClass=lEdgeClass)) # Edge class 1 hot encoded (PUT IT FIRST)
, ('geom' , Edge_Geometry()) # one can set nQuantile=...
- ])
+ ]
+ if self.bSeparator:
+ self.lEdgeFeature = self.lEdgeFeature + [
+ ('sprtr_bool', Separator_boolean())
+ , ('sprtr_num' , Separator_num())
+ ]
+ edge_transformer = FeatureUnion(self.lEdgeFeature)
# OPTIONNALLY, you can have one range of features per type of edge.
# the 1-hot encoding must be the first part of the union and it will determine
@@ -101,12 +104,13 @@ class Features_June19_Full(FeatureDefinition):
n_QUANTILES = 16
bShiftEdgeByClass = False
+ bSeparator = False
def __init__(self):
FeatureDefinition.__init__(self)
# NODES
- node_transformer = FeatureUnion([ \
+ self.lNodeFeature = [ \
("geometry" , Node_Geometry()) # one can set nQuantile=...
, ("neighbor_count" , Node_Neighbour_Count()) # one can set nQuantile=...
, ("text" , Node_Text_NGram( 'char' # character n-grams
@@ -114,14 +118,15 @@ def __init__(self):
, (2,3) # N
, False # lowercase?))
))
- ])
-
+ ]
+ node_transformer = FeatureUnion(self.lNodeFeature)
+
# EDGES
# which types of edge can we get??
# It depends on the type of graph!!
lEdgeClass = [HorizontalEdge, VerticalEdge]
# standard set of features, including a constant 1 for CRF
- fu = FeatureUnion([ \
+ self.lEdgeFeature = [ \
('1hot' , Edge_Type_1Hot(lEdgeClass=lEdgeClass)) # Edge class 1 hot encoded (PUT IT FIRST)
, ('1' , Edge_1()) # optional constant 1 for CRF
, ('geom' , Edge_Geometry()) # one can set nQuantile=...
@@ -135,7 +140,13 @@ def __init__(self):
, (2,3) # N
, False # lowercase?))
))
- ])
+ ]
+ if self.bSeparator:
+ self.lEdgeFeature = self.lEdgeFeature + [
+ ('sprtr_bool', Separator_boolean())
+ , ('sprtr_num' , Separator_num())
+ ]
+ fu = FeatureUnion(self.lEdgeFeature)
# you can use directly this union of features!
edge_transformer = fu
@@ -161,3 +172,27 @@ class Features_June19_Full_Shift(Features_June19_Full):
"""
bShiftEdgeByClass = True
+# --- Separator ------------------------------------------------------
+class Features_June19_Simple_Separator(Features_June19_Simple):
+ """
+ Same as Features_June19_Simple, with additional features on edges
+ """
+ bSeparator = True
+
+
+class Features_June19_Full_Separator(Features_June19_Full):
+ """
+ Same as Features_June19_Full, with additional features on edges
+ """
+ bSeparator = True
+
+
+# --- Separator Shifted ------------------------------------------------------
+class Features_June19_Simple_Separator_Shift(Features_June19_Simple_Separator
+ , Features_June19_Simple_Shift):
+ pass
+
+
+class Features_June19_Full_Separator_Shift(Features_June19_Full_Separator
+ , Features_June19_Full_Shift):
+ pass
diff --git a/TranskribusDU/tasks/DU_analyze_collection.py b/TranskribusDU/tasks/DU_analyze_collection.py
new file mode 100644
index 0000000..d9ba20b
--- /dev/null
+++ b/TranskribusDU/tasks/DU_analyze_collection.py
@@ -0,0 +1,405 @@
+# -*- coding: utf-8 -*-
+
+"""
+ Utility to compute statistics regarding a PageXml collection.
+
+ How many document? pages? objects? labels?
+
+ The raw result is stored as a pickle file or in a CSV file. (in a future version!!!)
+ The statistics are reported on stdout.
+
+ Copyright Xerox(C) 2017 JL. Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+
+
+
+import sys, os, collections, pickle, glob
+from lxml import etree
+import re
+import gc
+from optparse import OptionParser
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from xml_formats.PageXml import PageXml
+
+# ===============================================================================================================
+#DEFINING THE CLASS OF GRAPH WE USE
+
+# ===============================================================================================================
+
+class DoubleHistogram:
+ """
+ Double keyed histogram
+ """
+ def __init__(self, name):
+ self.name = name
+ self.dCnt = collections.defaultdict(lambda : collections.defaultdict(int) )
+
+ def seenK1K2(self, k1, k2):
+ self.dCnt[k1][k2] += 1
+
+ #--- First Key
+ def addFirstKeys(self, lk1):
+ """
+ Make sure those keys are present in the histogram, possibly with a count of zero
+ """
+ for k1 in lk1: self.dCnt[k1]
+
+ def getFirstKeyList(self):
+ """
+ return the sorted list of first key
+ """
+ l = list(self.dCnt.keys()); l.sort()
+ return l
+
+ #--- Second Key
+ def getAllSecondKeys(self):
+ setK = set()
+ for k in self.getFirstKeyList():
+ setK = setK.union( self.getSecondKeyList(k) )
+ return list(setK)
+
+ def getSecondKeyList(self, k):
+ """
+ return the sorted list of observed labels for this tag
+ """
+ l = list(self.dCnt[k].keys()); l.sort()
+ return l
+
+ def getSecondKeyCountList(self, k):
+ """
+ return the count of observed second keys, in same order as the second key list, for that first key
+ """
+ return [self.dCnt[k][v] for v in self.getSecondKeyList(k)]
+
+ def getCount(self, k1, k2): return self.dCnt[k1][k2]
+
+ #--- Sum
+ def getSumByFirstKey(self, k1):
+ """
+ return the sum of counts of observed second keys, for that first key
+ """
+ return sum( self.dCnt[k1][v] for v in self.getSecondKeyList(k1) )
+
+ def getSumBySecondKey(self, k2):
+ """
+ return the sum of counts of observed first keys, for that second key
+ """
+ cnt = 0
+ for k1 in self.getFirstKeyList():
+ if k2 in self.getSecondKeyList(k1): cnt += self.getCount(k1, k2)
+ return cnt
+
+class CollectionAnalyzer:
+ def __init__(self, lTag):
+ self.start()
+ self.lTag = lTag #all tag names
+
+ def start(self):
+ """
+ reset any accumulated data
+ """
+ self.hPageCountPerDoc = DoubleHistogram("Page count stat")
+ self.hTagCountPerDoc = DoubleHistogram("Tag stat per document")
+ self.hLblCountPerTag = DoubleHistogram("Label stat per tag")
+
+ self.lDoc = None #all doc names
+ self.lNbPage = None
+
+ def runPageXml(self, sDir):
+ """
+ process one folder per document
+ """
+ assert False, "Method must be specialized"
+
+ def runMultiPageXml(self, sDir):
+ """
+ process one PXML per document
+ """
+ assert False, "Method must be specialized"
+
+ def end(self):
+ """
+ Consolidate the gathered data
+ """
+ self.lDoc = self.hPageCountPerDoc.getFirstKeyList() #all doc are listed here
+ self.hTagCountPerDoc.addFirstKeys(self.lDoc) #to make sure we have all of them listed, even those without tags of interest
+ self.lObservedTag = self.hTagCountPerDoc.getAllSecondKeys() #all tag of interest observed in dataset
+
+ self.lNbPage = list()
+ for doc in self.lDoc:
+ lNb = self.hPageCountPerDoc.getSecondKeyList(doc)
+ assert len(lNb) == 1
+ self.lNbPage.append(lNb[0])
+ #label list per tag: self.hLblCountPerTag.getSecondKeyList(tag)
+
+ def save(self, filename):
+ t = (self.hPageCountPerDoc, self.hTagCountPerDoc, self.hLblCountPerTag)
+ with open(filename, "wb") as fd: pickle.dump(t, fd)
+
+ def load(self, filename):
+ with open(filename, "rb")as fd:
+ self.hPageCountPerDoc, self.hTagCountPerDoc, self.hLblCountPerTag = pickle.load(fd)
+
+ def prcnt(self, num, totnum):
+ if totnum==0:
+ return "n/a"
+ else:
+ f = num*100.0/totnum
+ if 0.0 < f and f < 2.0:
+ return "%.1f%%" % f
+ else:
+ return "%.0f%%" % f
+
+ def report(self):
+ """
+ report on accumulated data so far
+ """
+ print( "-"*60)
+
+ print( " ----- %d documents, %d pages" %(len(self.lDoc), sum(self.lNbPage)))
+ for doc, nb in zip(self.lDoc, self.lNbPage):
+ print( "\t---- %40s %6d pages"%(doc, nb))
+
+ print()
+ print( " ----- %d objects of interest (%d observed): %s"%(len(self.lTag), len(self.lObservedTag), self.lTag))
+ for doc in self.lDoc:
+ print( "\t---- %s %6d occurences"%(doc, self.hTagCountPerDoc.getSumByFirstKey(doc)))
+ for tag in self.lObservedTag:
+ print( "\t\t--%20s %6d occurences" %(tag, self.hTagCountPerDoc.getCount(doc, tag)))
+ print()
+ for tag in self.lObservedTag:
+ print( "\t-- %s %6d occurences" %(tag, self.hTagCountPerDoc.getSumBySecondKey(tag)))
+ for doc in self.lDoc:
+ print( "\t\t---- %40s %6d occurences"%(doc, self.hTagCountPerDoc.getCount(doc, tag)))
+
+ print()
+ print( " ----- Label frequency for ALL %d objects of interest: %s"%(len(self.lTag), self.lTag))
+ for tag in self.lTag:
+ totnb = self.hTagCountPerDoc.getSumBySecondKey(tag)
+ totnblabeled = self.hLblCountPerTag.getSumByFirstKey(tag)
+ print( "\t-- %s %6d occurences %d labelled" %(tag, totnb, totnblabeled))
+ for lbl in self.hLblCountPerTag.getSecondKeyList(tag):
+ nb = self.hLblCountPerTag.getCount(tag, lbl)
+ print( "\t\t- %20s %6d occurences\t(%5s) (%5s)"%(lbl,
+ nb,
+ self.prcnt(nb, totnb),
+ self.prcnt(nb, totnblabeled)))
+ nb = totnb - totnblabeled
+ lbl=""
+ print( "\t\t- %20s %6d occurences\t(%5s)"%(lbl, nb, self.prcnt(nb, totnb)))
+
+ print( "-"*60)
+ return ""
+
+ def seenDocPageCount(self, doc, pagecnt):
+ self.hPageCountPerDoc.seenK1K2(doc, pagecnt) #strange way to indicate the page count of a doc....
+ def seenDocTag(self, doc, tag):
+ self.hTagCountPerDoc.seenK1K2(doc, tag)
+ def seenTagLabel(self, tag, lbl):
+ self.hLblCountPerTag.seenK1K2(tag, lbl)
+
+class PageXmlCollectionAnalyzer(CollectionAnalyzer):
+ """
+ Analyse a collection of PageXml documents
+ """
+
+ dNS = {"pg":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
+
+ def __init__(self, sDocPattern, sPagePattern, lTag, sCustom=None):
+ """
+ sRootDir is the root directory of the collection
+ sDocPattern is the pattern followed by folders, assuming one folder contains one document
+ sPagePattern is the pattern followed by each PageXml file , assuming one file contains one PageXml XML
+ lTag is the list of tag names of interest
+ """
+ CollectionAnalyzer.__init__(self, lTag)
+ self.sDocPattern = sDocPattern
+ self.sPagePattern = sPagePattern
+ self.lTag = lTag
+ self.sCustom = sCustom
+ self.ltCRES = [] #list of tuple (cre, replacement-string)
+
+ def setLabelPattern(self, sRE, sRepl):
+ """
+ replace any occurence of the pattern by the replacement string in a label
+ """
+ self.ltCRES.append( (re.compile(sRE), sRepl) )
+
+ def runPageXml(self, sRootDir):
+ lFolder = [os.path.basename(folder) for folder in glob.iglob(os.path.join(sRootDir, self.sDocPattern))
+ if os.path.isdir(folder)]
+ lFolder.sort()
+ print( "Documents: ", lFolder)
+
+ for docdir in lFolder:
+ print( "Document ", docdir)
+ lPageFile = [os.path.basename(name) for name in glob.iglob(os.path.join(sRootDir, docdir, self.sPagePattern))
+ if os.path.isfile(os.path.join(sRootDir, docdir, name))]
+ lPageFile.sort()
+ self.seenDocPageCount(docdir, len(lPageFile))
+ for sPageFile in lPageFile:
+ print( ".",)
+ doc = etree.parse(os.path.join(sRootDir, docdir, sPageFile))
+ self.parsePage(doc, doc.getroot(), docdir)
+ doc = None
+ gc.collect()
+ print()
+ sys.stdout.flush()
+
+ def runMultiPageXml(self, sRootDir):
+ print( os.path.join(sRootDir, self.sDocPattern))
+ print( glob.glob(os.path.join(sRootDir, self.sDocPattern)))
+ lDocFile = [os.path.basename(filename) for filename in glob.iglob(os.path.join(sRootDir, self.sDocPattern))
+ if os.path.isfile(filename)]
+ lDocFile.sort()
+ print( "Documents: ", lDocFile)
+
+ for docFile in lDocFile:
+ print( "Document ", docFile)
+ doc = etree.parse(os.path.join(sRootDir, docFile))
+ lNdPage = doc.getroot().xpath("//pg:Page",
+ namespaces=self.dNS)
+ self.seenDocPageCount(docFile, len(lNdPage))
+ for ndPage in lNdPage:
+ print( ".",)
+ self.parsePage(doc, ndPage, docFile)
+ print()
+ sys.stdout.flush()
+
+ def parsePage(self, doc, ctxtNd, name):
+ for tag in self.lTag:
+ lNdTag = ctxtNd.xpath(".//pg:%s"%tag, namespaces=self.dNS)
+ for nd in lNdTag:
+ self.seenDocTag(name, tag)
+ if self.sCustom != None:
+ if self.sCustom == "":
+ try:
+ lbl = PageXml.getCustomAttr(nd, "structure", "type")
+ except:
+ lbl = ''
+ else:
+ lbl = nd.get(self.sCustom)
+ else:
+ lbl = nd.get("type")
+
+ if lbl:
+ for cre, sRepl in self.ltCRES: lbl = cre.sub(sRepl, lbl) #pattern processing
+ self.seenTagLabel(tag, lbl)
+
+
+def test_simple():
+ sTESTS_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+ "tests")
+
+ sDATA_DIR = os.path.join(sTESTS_DIR, "data")
+
+ doer = PageXmlCollectionAnalyzer("*.mpxml",
+ None,
+ ["Page", "TextRegion", "TextLine"],
+ #["type"],
+ sCustom="")
+ doer.start()
+ doer.runMultiPageXml(os.path.join(sDATA_DIR, "abp_TABLE_9142_mpxml", "col"))
+ doer.end()
+ sReport = doer.report()
+ print( sReport)
+
+if __name__ == "__main__":
+
+ if False:
+ test_simple()
+
+ sUsage="""Usage: %s [sPagePattern])
+For Multi-PageXml, only root directory and document pattern (2 arguments, e.g. 9142_GTRC/col '*.mpxml' )
+For PageXml, give also the Xml page pattern (3 arguments, e.g. 9142_GTRC/col '[0-9]+' *.mpxml')
+"""%sys.argv[0]
+
+ #prepare for the parsing of the command line
+ parser = OptionParser(usage=sUsage)
+
+# parser.add_option("--dir", dest='lTrn', action="store", type="string"
+# , help="Train or continue previous training session using the given annotated collection.")
+# parser.add_option("--tst", dest='lTst', action="store", type="string"
+# , help="Test a model using the given annotated collection.")
+# parser.add_option("--run", dest='lRun', action="store", type="string"
+# , help="Run a model on the given non-annotated collection.")
+# parser.add_option("-w", "--warm", dest='warm', action="store_true"
+# , help="Attempt to warm-start the training")
+ parser.add_option("-c", "--custom", dest='custom', action="store", type="string"
+ , help="With --custom= , it reads @custom Xml attribute instead of @type, or if you specify --custom=toto, it will read the @toto attribute.")
+ parser.add_option("--pattern", dest='pattern', action="store"
+ , help="Replace the given pattern in the label by # (specific for BAR so far...)")
+
+ # ---
+# bMODEUN = True
+
+ #parse the command line
+ (options, args) = parser.parse_args()
+ # ---
+ try:
+ try:
+ sRootDir, sDocPattern, sPagePattern = args[0:3]
+ bMultiPageXml = False
+ except:
+ sRootDir, sDocPattern = args[0:2]
+ bMultiPageXml = True
+ sPagePattern = None
+ except:
+ print(sUsage)
+ exit(1)
+
+ #all tag supporting the attribute type in PageXml 2003
+ lTag = ["Page", "TextRegion", "GraphicRegion", "CharRegion", "RelationType"]
+ #Pragmatism: don't think we will have annotatetd page
+ lTag = ["TextRegion", "GraphicRegion", "CharRegion", "RelationType"]
+ #Pragmatism: we may also have tagged TextLine ...
+ lTag.append("TextLine")
+
+ print( sRootDir, sDocPattern, sPagePattern, lTag)
+
+# if bMODEUN:
+# #all tag supporting the attribute type in PageXml 2003
+# ltTagAttr = [ (name, "type") for name in ["Page", "TextRegion", "GraphicRegion", "CharRegion", "RelationType"]]
+# else:
+# ls = args[3:]
+# ltTagAttr = zip(ls[slice(0, len(ls), 2)], ls[slice(1, len(ls), 2)])
+# print( sRootDir, sDocPattern, sPagePattern, ltTagAttr)
+# except:
+# # if bMODEUN:
+# # print( "Usage: %s sRootDir sDocPattern [sPagePattern]"%(sys.argv[0] ))
+# # else:
+# # print( "Usage: %s sRootDir sDocPattern [sPagePattern] [Tag Attr]+"%(sys.argv[0] ))
+# exit(1)
+
+ doer = PageXmlCollectionAnalyzer(sDocPattern, sPagePattern, lTag, sCustom=options.custom)
+ if options.pattern != None:
+ doer.setLabelPattern(options.pattern, "#")
+
+ doer.start()
+ if bMultiPageXml:
+ print( "--- MultiPageXml ---")
+ doer.runMultiPageXml(sRootDir)
+ else:
+ print( "--- PageXml ---")
+ doer.runPageXml(sRootDir)
+
+ doer.end()
+ sReport = doer.report()
+
+ print( sReport)
+
diff --git a/TranskribusDU/tasks/DU_split_collection.py b/TranskribusDU/tasks/DU_split_collection.py
index b9add87..2d781ae 100644
--- a/TranskribusDU/tasks/DU_split_collection.py
+++ b/TranskribusDU/tasks/DU_split_collection.py
@@ -3,20 +3,9 @@
"""
DU task: split a collection in N equal parts, at random
- Copyright NAVER(C) 2019 Jean-Luc Meunier
+ Copyright Xerox(C) 2019 Jean-Luc Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
from the European Union's Horizon 2020 research and innovation programme
@@ -26,6 +15,8 @@
import sys, os, random
from shutil import copyfile
+from optparse import OptionParser
+import math
try: #to ease the use without proper Python installation
import TranskribusDU_version
@@ -41,15 +32,27 @@
if __name__ == "__main__":
# import better_exceptions
# better_exceptions.MAX_LENGTH = None
+ sUsage = """
+USAGE: %s DIR ( N | p1,p2(,p)+ )
+Split in N folders
+ or
+Split in folders following the proportions p1, ... pN
+
+The folders are named after the DIR folder by adding suffix_part_<1 to N>
+
+(Expecting to find a 'col' subfolder in DIR)""" % sys.argv[0]
+
+ parser = OptionParser(usage=sUsage)
+ (options, args) = parser.parse_args()
try:
- sDir = sys.argv[1]
- n = int(sys.argv[2])
+ sDir, sN = args
except:
- print("USAGE: %s DIR N"%sys.argv[0])
+ print(sUsage)
exit(1)
sColDir= os.path.join(sDir, "col")
+ assert os.path.isdir(sColDir), "%s is not a folder"%sColDir
print("- looking at ", sColDir)
lsFile = []
@@ -60,16 +63,36 @@
if not(_fnl.endswith(".mpxml") or _fnl.endswith(".pxml")):
continue
lsFile.append(_fn)
- traceln(" %d files to split in %d parts" % (len(lsFile), n))
+
+ nbFile = len(lsFile)
+ try:
+ lP = [int(_s) for _s in sN.split(',')]
+ if len(lP) < 2: raise ValueError("want to run the except code")
+ lP = [p / sum(lP) for p in lP]
+ traceln(" %d files to split in %d parts with proportions %s" % (
+ nbFile
+ , len(lP)
+ , ",".join("%.2f"%_p for _p in lP)))
+ lP.sort()
+ ld = []
+ for i, p in enumerate(lP):
+ ld += [i] * math.ceil(p * nbFile)
+ ld = ld[:nbFile]
+ while len(ld) < nbFile: ld.append(len(lP)-1)
+ random.shuffle(ld)
+ except ValueError:
+ # Split in N parts
+ traceln(" %d files to split in %d parts" % (nbFile, int(sN)))
+ n = int(sN)
- N = len(lsFile)
- ld = getSplitIndexList(N, n, traceln)
- assert len(ld) == N
+ ld = getSplitIndexList(nbFile, n, traceln)
+ assert len(ld) == nbFile
- # *** SHUFFLING!! ***
- random.shuffle(ld)
+ # *** SHUFFLING!! ***
+ random.shuffle(ld)
+ # ld [I] gives the folder index where to put the Ith file
def get_sToColDir(sDir, d, bExistIsOk=False):
"""
@@ -92,10 +115,11 @@ def get_sToColDir(sDir, d, bExistIsOk=False):
raise Exception("First remove the destination folders: ", (sToDir, sToColDir))
return sToColDir
+ assert len(ld) == len(lsFile)
# make sure the folder are not already containing some stuff (from previous runs...)
- for _d in range(1, n+1):
- get_sToColDir(sDir, _d, bExistIsOk=False)
+ for _d in set(ld):
+ get_sToColDir(sDir, _d+1, bExistIsOk=False)
ld = [1+d for d in ld] # convenience
for d, sFilename in zip(ld, lsFile):
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableAnnotation.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableAnnotation.py
index 2d37159..e115a5f 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableAnnotation.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableAnnotation.py
@@ -5,18 +5,7 @@
Copyright Xerox(C) 2017 H. Déjean
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -223,16 +212,16 @@ def annotateDocument(self,lsTrnColDir):
## TEXT
for tl in lTextLine:
try:
- sLabel = tl.type.parseDomNodeLabel(tl.node)
+ sLabel = tl.type.parseDocNodeLabel(tl)
# cls = DU_GRAPH._dClsByLabel[sLabel] #Here, if a node is not labelled, and no default label is set, then KeyError!!!
# except KeyError:
except ValueError:
tl.node.setProp(tl.type.sLabelAttr,lLabels[4])
## SEP
for sep in lSeparator:
-# sLabel = sep.type.parseDomNodeLabel(sep.node)
+# sLabel = sep.type.parseDocNodeLabel(sep)
try:
- sLabel = sep.type.parseDomNodeLabel(sep.node)
+ sLabel = sep.type.parseDocNodeLabel(sep)
# cls = DU_GRAPH._dClsByLabel[sLabel] #Here, if a node is not labelled, and no default label is set, then KeyError!!!
except ValueError:
sep.node.setProp(sep.type.sLabelAttr,lLabels[6])
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableCutPredictor.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableCutPredictor.py
index f1b45e4..82bf3a9 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableCutPredictor.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableCutPredictor.py
@@ -70,18 +70,7 @@
Copyright Naver Labs Europe 2018
JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableGrid.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableGrid.py
index faac880..4aa8ea5 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableGrid.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableGrid.py
@@ -6,18 +6,7 @@
Copyright Naver Labs Europe 2018
JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableH.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableH.py
index b89f1b1..c309212 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableH.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableH.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2018 H. Déjean, JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableR.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableR.py
index 3c511d0..a8593ec 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableR.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableR.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2018 H. Déjean, JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRC.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRC.py
index eca280d..28d521f 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRC.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRC.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2018 H. Déjean
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCAnnotation_checker.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCAnnotation_checker.py
index 5791ed1..eef79ab 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCAnnotation_checker.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCAnnotation_checker.py
@@ -6,18 +6,7 @@
Copyright Naver Labs Europe 2018
JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut.py
index 583c0f7..ca22626 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut.py
@@ -9,18 +9,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -85,7 +74,7 @@ def setClassicNodeTypeList(cls, lNodeType):
"""
cls._lClassicNodeType = lNodeType
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -119,7 +108,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
- for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut1.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut1.py
index edbf959..c55f925 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut1.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut1.py
@@ -9,18 +9,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -87,7 +76,7 @@ def setClassicNodeTypeList(cls, lNodeType):
"""
cls._lClassicNodeType = lNodeType
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -121,7 +110,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
- for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
@@ -648,7 +637,7 @@ def evalClusterByRow(self, sFilename):
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
#load the block nodes per page
- for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut1SIO.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut1SIO.py
index 908bb32..bd55b3b 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut1SIO.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut1SIO.py
@@ -11,18 +11,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -89,7 +78,7 @@ def setClassicNodeTypeList(cls, lNodeType):
"""
cls._lClassicNodeType = lNodeType
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -125,7 +114,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
- for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
@@ -652,7 +641,7 @@ def evalClusterByRow(self, sFilename):
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
#load the block nodes per page
- for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut2.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut2.py
index aeafda4..19d910f 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut2.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRCut2.py
@@ -9,18 +9,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -85,7 +74,7 @@ def setClassicNodeTypeList(cls, lNodeType):
"""
cls._lClassicNodeType = lNodeType
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -119,7 +108,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
- for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG.py
index 0dae259..6a96008 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -83,7 +72,7 @@ def setClassicNodeTypeList(cls, lNodeType):
"""
cls._lClassicNodeType = lNodeType
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -108,7 +97,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
- for pnum, page, domNdPage in self._iter_Page_DomNode(self.doc):
+ for pnum, page, domNdPage in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG2.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG2.py
index 946c731..c3644ea 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG2.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG2.py
@@ -7,18 +7,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG3.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG3.py
index d011a0b..34ce0aa 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG3.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG3.py
@@ -7,18 +7,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -85,7 +74,7 @@ def setClassicNodeTypeList(cls, lNodeType):
"""
cls._lClassicNodeType = lNodeType
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -110,7 +99,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
- for pnum, page, domNdPage in self._iter_Page_DomNode(self.doc):
+ for pnum, page, domNdPage in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG4.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG4.py
index d1fd2ea..e3ace94 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG4.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG4.py
@@ -16,18 +16,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -95,7 +84,7 @@ def setClassicNodeTypeList(cls, lNodeType):
"""
cls._lClassicNodeType = lNodeType
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -128,7 +117,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
- for pnum, page, domNdPage in self._iter_Page_DomNode(self.doc):
+ for pnum, page, domNdPage in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG41.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG41.py
index 70b80cf..d6e93e4 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG41.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG41.py
@@ -16,18 +16,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -95,7 +84,7 @@ def setClassicNodeTypeList(cls, lNodeType):
"""
cls._lClassicNodeType = lNodeType
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -128,7 +117,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
- for pnum, page, domNdPage in self._iter_Page_DomNode(self.doc):
+ for pnum, page, domNdPage in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG42.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG42.py
index bf8e500..fe38508 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG42.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRG42.py
@@ -16,18 +16,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -96,7 +85,7 @@ def setClassicNodeTypeList(cls, lNodeType):
"""
cls._lClassicNodeType = lNodeType
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -129,7 +118,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
- for pnum, page, domNdPage in self._iter_Page_DomNode(self.doc):
+ for pnum, page, domNdPage in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRGw.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRGw.py
index d95be48..fce8476 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRGw.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRGw.py
@@ -18,18 +18,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -96,7 +85,7 @@ def setClassicNodeTypeList(cls, lNodeType):
"""
cls._lClassicNodeType = lNodeType
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -121,7 +110,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
- for pnum, page, domNdPage in self._iter_Page_DomNode(self.doc):
+ for pnum, page, domNdPage in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRH.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRH.py
index 1fb1c32..3ad9f73 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRH.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRH.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2018 H. Déjean, JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRHCut1SIO.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRHCut1SIO.py
index bf32679..640a441 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRHCut1SIO.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRHCut1SIO.py
@@ -11,18 +11,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -112,7 +101,7 @@ def setFactoredClassicalType(cls, ntClassic, ntFactored):
cls._dFactorialType[ntClassic] = ntFactored
cls._lfactoredType.append(ntFactored)
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -154,7 +143,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
assert len(lClassicType) == 1
assert len(lSpecialType) == 1
- for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
@@ -202,7 +191,7 @@ def computeSpecialEdges(cls, lClassicPageNode, lSpecialPageNode):
# ------------------------------------
- def parseDomLabels(self):
+ def parseDocLabels(self):
"""
Parse the label of the graph from the dataset, and set the node label
return the set of observed class (set of integers in N+)
@@ -213,13 +202,13 @@ def parseDomLabels(self):
== ad-hoc graph ==
We also load the class of the factored classical nodes
"""
- setSeensLabels = Graph_MultiPageXml.parseDomLabels(self)
+ setSeensLabels = Graph_MultiPageXml.parseDocLabels(self)
# and we go thru the classical node types to also load the factored label
for nd in self.lNodeBlock:
factoredType = self._dFactorialType[nd.type]
try:
- sFactoredLabel = factoredType.parseDomNodeLabel(nd.node)
+ sFactoredLabel = factoredType.parseDocNodeLabel(nd)
except KeyError:
raise ValueError("Page %d, unknown label in %s (Known labels are %s)"%(nd.pnum, str(nd.node), self._dClsByLabel))
factoredLabel = self._dClsByLabel[sFactoredLabel]
@@ -228,7 +217,7 @@ def parseDomLabels(self):
setSeensLabels.add(factoredLabel)
return setSeensLabels
- def setDomLabels(self, Y):
+ def setDocLabels(self, Y):
"""
Set the labels of the graph nodes from the Y matrix
return the DOM
@@ -245,18 +234,18 @@ def setDomLabels(self, Y):
# Blocks
for i, nd in enumerate(self.lNodeBlock):
sLabel = self._dLabelByCls[ Y[i] ]
- ntBlock.setDomNodeLabel(nd.node, sLabel)
+ ntBlock.setDocNodeLabel(nd, sLabel)
# factored Blocks
for i, nd in enumerate(self.lNodeBlock):
sLabel = self._dLabelByCls[ Y[i+NB] ]
- ntFactored.setDomNodeLabel(nd.node, sLabel)
+ ntFactored.setDocNodeLabel(nd, sLabel)
# cut nodes
Z = NB + NB
for i, nd in enumerate(self.lNodeCutLine):
sLabel = self._dLabelByCls[ Y[i+Z] ]
- ntCut.setDomNodeLabel(nd.node, sLabel)
+ ntCut.setDocNodeLabel(nd, sLabel)
return self.doc
@@ -873,7 +862,7 @@ def evalClusterByRow(self, sFilename):
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
#load the block nodes per page
- for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO.py
index 665d0bd..8762bb3 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2018 H. Déjean, JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -48,20 +37,21 @@
from tasks.DU_CRF_Task import DU_CRF_Task
#from crf.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText
-from crf.FeatureDefinition_PageXml_std_noText_v4 import FeatureDefinition_PageXml_StandardOnes_noText_v4
+from crf.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText
class NodeType_BIESO_to_SIO(NodeType_PageXml_type_woText):
"""
Convert BIESO labeling to SIO
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
-
+ domnode = graph_node.node
+
sXmlLabel = domnode.get(self.sLabelAttr)
sXmlLabel = {'B':'S',
@@ -155,7 +145,7 @@ def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs
}
, sComment=sComment
#,cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText
- ,cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText_v4
+ ,cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText
)
#self.setNbClass(3) #so that we check if all classes are represented in the training set
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIOH.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIOH.py
index 764c0ce..bbffab6 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIOH.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIOH.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2018 H. Déjean, JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -56,13 +45,13 @@ class NodeType_BIESO_to_SIOH(NodeType_PageXml_type_woText):
Convert BIESO labeling to SIO
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
-
+ domnode = graph_node.node
sXmlLabel = domnode.get("DU_header")
if sXmlLabel != 'CH':
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_Cut1SIO.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_Cut1SIO.py
index e90b5c1..11d50f8 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_Cut1SIO.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_Cut1SIO.py
@@ -11,18 +11,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -89,7 +78,7 @@ def setClassicNodeTypeList(cls, lNodeType):
"""
cls._lClassicNodeType = lNodeType
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -125,7 +114,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
- for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
@@ -537,13 +526,13 @@ class NodeType_BIESO_to_SIO(NodeType_PageXml_type_woText):
Convert BIESO labeling to SIO
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
-
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
sXmlLabel = {'B':'S',
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_H.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_H.py
index 4075e21..6c12a9c 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_H.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_H.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2018 H. Déjean, JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -58,12 +47,13 @@ class NodeType_BIESO_to_SIO_and_CHDO(NodeType_PageXml_type_woText):
Convert BIESO labeling to SIO
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_HCut1SIO.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_HCut1SIO.py
index 5d16923..888f88a 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_HCut1SIO.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_HCut1SIO.py
@@ -11,18 +11,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -112,7 +101,7 @@ def setFactoredClassicalType(cls, ntClassic, ntFactored):
cls._dFactorialType[ntClassic] = ntFactored
cls._lfactoredType.append(ntFactored)
- def parseXmlFile(self, sFilename, iVerbose=0):
+ def parseDocFile(self, sFilename, iVerbose=0):
"""
Load that document as a CRF Graph.
Also set the self.doc variable!
@@ -154,7 +143,7 @@ def parseXmlFile(self, sFilename, iVerbose=0):
assert len(lClassicType) == 1
assert len(lSpecialType) == 1
- for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
@@ -202,7 +191,7 @@ def computeSpecialEdges(cls, lClassicPageNode, lSpecialPageNode):
# ------------------------------------
- def parseDomLabels(self):
+ def parseDocLabels(self):
"""
Parse the label of the graph from the dataset, and set the node label
return the set of observed class (set of integers in N+)
@@ -213,13 +202,13 @@ def parseDomLabels(self):
== ad-hoc graph ==
We also load the class of the factored classical nodes
"""
- setSeensLabels = Graph_MultiPageXml.parseDomLabels(self)
+ setSeensLabels = Graph_MultiPageXml.parseDocLabels(self)
# and we go thru the classical node types to also load the factored label
for nd in self.lNodeBlock:
factoredType = self._dFactorialType[nd.type]
try:
- sFactoredLabel = factoredType.parseDomNodeLabel(nd.node)
+ sFactoredLabel = factoredType.parseDocNodeLabel(nd)
except KeyError:
raise ValueError("Page %d, unknown label in %s (Known labels are %s)"%(nd.pnum, str(nd.node), self._dClsByLabel))
factoredLabel = self._dClsByLabel[sFactoredLabel]
@@ -228,7 +217,7 @@ def parseDomLabels(self):
setSeensLabels.add(factoredLabel)
return setSeensLabels
- def setDomLabels(self, Y):
+ def setDocLabels(self, Y):
"""
Set the labels of the graph nodes from the Y matrix
return the DOM
@@ -245,18 +234,18 @@ def setDomLabels(self, Y):
# Blocks
for i, nd in enumerate(self.lNodeBlock):
sLabel = self._dLabelByCls[ Y[i] ]
- ntBlock.setDomNodeLabel(nd.node, sLabel)
+ ntBlock.setDocNodeLabel(nd, sLabel)
# factored Blocks
for i, nd in enumerate(self.lNodeBlock):
sLabel = self._dLabelByCls[ Y[i+NB] ]
- ntFactored.setDomNodeLabel(nd.node, sLabel)
+ ntFactored.setDocNodeLabel(nd, sLabel)
# cut nodes
Z = NB + NB
for i, nd in enumerate(self.lNodeCutLine):
sLabel = self._dLabelByCls[ Y[i+Z] ]
- ntCut.setDomNodeLabel(nd.node, sLabel)
+ ntCut.setDocNodeLabel(nd, sLabel)
return self.doc
@@ -738,12 +727,13 @@ class NodeType_BIESO_to_SIO_and_CHDO(NodeType_PageXml_type_woText):
Convert BIESO labeling to SIO
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
@@ -911,7 +901,7 @@ def evalClusterByRow(self, sFilename):
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
#load the block nodes per page
- for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
#now that we have the page, let's create the node for each type!
lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_H_v2.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_H_v2.py
index b68fee2..44b97e8 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_H_v2.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_H_v2.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2018 H. Déjean, JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -139,12 +128,13 @@ class NodeType_BIESO_to_SIO_and_CHDO(NodeType_PageXml_type_woText):
Convert BIESO labeling to SIO
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_v2.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_v2.py
index 2387de4..3306f60 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_v2.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableRSIO_v2.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2018 H. Déjean, JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -140,13 +129,13 @@ class NodeType_BIESO_to_SIO(NodeType_PageXml_type_woText):
Convert BIESO labeling to SIO
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
-
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
sXmlLabel = {'B':'S',
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed.py
new file mode 100644
index 0000000..421c369
--- /dev/null
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed.py
@@ -0,0 +1,1051 @@
+# -*- coding: utf-8 -*-
+
+"""
+ DU task for ABP Table:
+ doing jointly row EIO and near horizontal cuts SIO
+
+ block2line edges do not cross another block.
+
+    The cuts are based on baselines of text blocks, with some positive or negative inclination.
+
+ - the labels of cuts are SIO
+
+ Copyright Naver Labs Europe(C) 2018 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+
+
+
+import sys, os
+import math
+from lxml import etree
+from collections import Counter
+from ast import literal_eval
+
+import numpy as np
+import shapely.geometry as geom
+import shapely.ops
+
+from sklearn.pipeline import Pipeline, FeatureUnion
+from sklearn.feature_extraction.text import CountVectorizer
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+from tasks import _checkFindColDir, _exit
+from tasks.DU_CRF_Task import DU_CRF_Task
+from tasks.DU_Table.DU_ABPTableSkewed_CutAnnotator import SkewedCutAnnotator
+
+from xml_formats.PageXml import MultiPageXml, PageXml
+
+import graph.GraphModel
+from graph.Block import Block
+from graph.Edge import Edge, SamePageEdge, HorizontalEdge, VerticalEdge
+from graph.Graph_MultiPageXml import Graph_MultiPageXml
+from graph.NodeType_PageXml import NodeType_PageXml_type
+
+#from graph.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText
+from graph.FeatureDefinition import FeatureDefinition
+from graph.Transformer import Transformer, TransformerListByType, SparseToDense
+from graph.Transformer import EmptySafe_QuantileTransformer as QuantileTransformer
+from graph.Transformer_PageXml import NodeTransformerXYWH_v2, NodeTransformerNeighbors, Node1HotFeatures_noText,\
+ NodeTransformerText, NodeTransformerTextLen, EdgeNumericalSelector_v2
+from graph.Transformer_PageXml import Edge1HotFeatures_noText, EdgeBooleanFeatures_v2, EdgeNumericalSelector_noText
+from graph.PageNumberSimpleSequenciality import PageNumberSimpleSequenciality
+
+from util.Shape import ShapeLoader
+
+class GraphSkewedCut(Graph_MultiPageXml):
+ """
+ We specialize the class of graph because the computation of edges is quite specific
+
+ Here we consider horizontal and near-horizontal cuts
+ """
+ bCutAbove = False # the cut line is above the "support" text
+ lRadAngle = None
+
+ #Cut stuff
+ #iModulo = 1 # map the coordinate to this modulo
+ fMinPageCoverage = 0.5 # minimal coverage to consider a GT table separator
+ # fCutHeight = 25 # height of a cutting ribbon
+ # For NAF to get 91% GT recall with same recall on ABP 98% (moving from 105 to 108% cuts)
+ fCutHeight = 10 # height of a cutting ribbon
+
+ # BAAAAD iLineVisibility = 5 * 11 # a cut line sees other cut line up to N pixels downward
+ iLineVisibility = 3700 // 7 # (528) a cut line sees other cut line up to N pixels downward
+ iBlockVisibility = 3*7*13 # (273) a block sees neighbouring cut lines at N pixels
+
+ _lClassicNodeType = None
+
+ # when loading a text, we create a shapely shape using the function below.
+ shaper_fun = ShapeLoader.node_to_Point
+
+ @classmethod
+ def setClassicNodeTypeList(cls, lNodeType):
+ """
+ determine which type of node goes thru the classical way for determining
+ the edges (vertical or horizontal overlap, with occlusion, etc.)
+ """
+ cls._lClassicNodeType = lNodeType
+
+ def parseDocFile(self, sFilename, iVerbose=0):
+ """
+ Load that document as a CRF Graph.
+ Also set the self.doc variable!
+
+ CAUTION: DOES NOT WORK WITH MULTI-PAGE DOCUMENTS...
+
+ Return a CRF Graph object
+ """
+ traceln(" ----- FILE %s ------" % sFilename)
+ self.doc = etree.parse(sFilename)
+ self.lNode, self.lEdge = list(), list()
+ self.lNodeBlock = [] # text node
+ self.lNodeCutLine = [] # cut line node
+
+ doer = SkewedCutAnnotator(self.bCutAbove, lAngle=self.lRadAngle)
+ domid = 0
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
+ traceln(" --- page %s - constructing separator candidates" % pnum)
+ #load the page objects and the GT partition (defined by the table) if any
+ loBaseline, dsetTableByRow = doer.loadPage(domNdPage, shaper_fun=self.shaper_fun)
+ traceln(" - found %d objects on page" % (len(loBaseline)))
+ if loBaseline: traceln("\t - shaped as %s" % type(loBaseline[0]))
+
+ # find almost-horizontal cuts and tag them if GT is available
+ loHCut = doer.findHCut(domNdPage, loBaseline, dsetTableByRow, self.fCutHeight, iVerbose)
+
+ #create DOM node reflecting the cuts
+ #first clean (just in case!)
+ n = doer.remove_cuts_from_dom(domNdPage)
+ if n > 0:
+ traceln(" - removed %d pre-existing cut lines" % n)
+
+ # if GT, then we have labelled cut lines in DOM
+ domid = doer.add_Hcut_to_Page(domNdPage, loHCut, domid)
+
+ lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
+ lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]
+
+ for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
+ traceln(" --- page %s - constructing the graph" % pnum)
+ #now that we have the page, let's create the node for each type!
+ lClassicPageNode = [nd for nodeType in lClassicType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
+ lSpecialPageNode = [nd for nodeType in lSpecialType for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page) ]
+
+ self.lNode.extend(lClassicPageNode) # e.g. the TextLine objects
+ self.lNodeBlock.extend(lClassicPageNode)
+
+ self.lNode.extend(lSpecialPageNode) # e.g. the cut lines!
+ self.lNodeCutLine.extend(lSpecialPageNode)
+
+ #no previous page to consider (for cross-page links...) => None
+ lClassicPageEdge = Edge.computeEdges(None, lClassicPageNode, self.iGraphMode)
+ self.lEdge.extend(lClassicPageEdge)
+
+ # Now, compute edges between special and classic objects...
+ lSpecialPageEdge = self.computeSpecialEdges(lClassicPageNode,
+ lSpecialPageNode)
+ self.lEdge.extend(lSpecialPageEdge)
+
+ #if iVerbose>=2: traceln("\tPage %5d %6d nodes %7d edges"%(pnum, len(lPageNode), len(lPageEdge)))
+ if iVerbose>=2:
+ traceln("\tPage %5d"%(pnum))
+ traceln("\t block: %6d nodes %7d edges (to block)" %(len(lClassicPageNode), len(lClassicPageEdge)))
+ traceln("\t line: %6d nodes %7d edges (from block or line)"%(len(lSpecialPageNode), len(lSpecialPageEdge)))
+ c = Counter(type(o).__name__ for o in lSpecialPageEdge)
+ l = list(c.items())
+ l.sort()
+ traceln("\t\t", l)
+ if iVerbose: traceln("\t\t (%d nodes, %d edges)"%(len(self.lNode), len(self.lEdge)) )
+
+ return self
+
+ def addParsedLabelToDom(self):
+ """
+ while parsing the pages, we may have updated the standard BIESO labels
+ we store the possibly new label in the DOM
+ """
+ for nd in self.lNode:
+ nd.type.setDocNodeLabel(nd, self._dLabelByCls[ nd.cls ])
+
+ def addEdgeToDoc(self, Y=None):
+ """
+        To display the graph conveniently we add new Edge elements
+ """
+ import random
+ (pnum, page, ndPage) = next(self._iter_Page_DocNode(self.doc))
+ w = int(ndPage.get("imageWidth"))
+
+ nn = 1 + len([e for e in self.lEdge if type(e) not in [HorizontalEdge, VerticalEdge, Edge_BL]])
+ ii = 0
+ for edge in self.lEdge:
+ if type(edge) in [HorizontalEdge, VerticalEdge]:
+ A, B = edge.A.shape.centroid, edge.B.shape.centroid
+ elif type(edge) in [Edge_BL]:
+ A = edge.A.shape.centroid
+ # not readable _pt, B = shapely.ops.nearest_points(A, edge.B.shape)
+ _pt, B = shapely.ops.nearest_points(edge.A.shape, edge.B.shape)
+ else:
+ ii += 1
+ x = 1 + ii * (w/nn)
+ pt = geom.Point(x, 0)
+ A, _ = shapely.ops.nearest_points(edge.A.shape, pt)
+ B, _ = shapely.ops.nearest_points(edge.B.shape, pt)
+ ndSep = MultiPageXml.createPageXmlNode("Edge")
+ ndSep.set("DU_type", type(edge).__name__)
+ ndPage.append(ndSep)
+ MultiPageXml.setPoints(ndSep, [(A.x, A.y), (B.x, B.y)])
+ return
+
+ @classmethod
+ def computeSpecialEdges(cls, lClassicPageNode, lSpecialPageNode):
+ """
+ return a list of edges
+ """
+ raise Exception("Specialize this method")
+
+
+
+class Edge_BL(Edge):
+ """Edge block-to-Line"""
+ pass
+
+class Edge_LL(Edge):
+ """Edge line-to-Line"""
+ pass
+
+class GraphSkewedCut_H(GraphSkewedCut):
+ """
+ Only horizontal cut lines
+ """
+
+ def __init__(self):
+ self.showClassParam()
+
+ @classmethod
+ def showClassParam(cls):
+ """
+ show the class parameters
+ return whether or not they were shown
+ """
+ try:
+ cls.bParamShownOnce
+ return False
+ except:
+ #traceln(" - iModulo : " , cls.iModulo)
+ traceln(" - block_see_line : " , cls.iBlockVisibility)
+ traceln(" - line_see_line : " , cls.iLineVisibility)
+ traceln(" - cut height : " , cls.fCutHeight)
+ traceln(" - cut above : " , cls.bCutAbove)
+ traceln(" - angles : " , [math.degrees(v) for v in cls.lRadAngle])
+ traceln(" - fMinPageCoverage : " , cls.fMinPageCoverage)
+ traceln(" - Textual features : " , cls.bTxt)
+ cls.bParamShownOnce = True
+ return True
+
+ def getNodeListByType(self, iTyp):
+ if iTyp == 0:
+ return self.lNodeBlock
+ else:
+ return self.lNodeCutLine
+
+ def getEdgeListByType(self, typA, typB):
+ if typA == 0:
+ if typB == 0:
+ return (e for e in self.lEdge if isinstance(e, SamePageEdge))
+ else:
+ return (e for e in self.lEdge if isinstance(e, Edge_BL))
+ else:
+ if typB == 0:
+ return []
+ else:
+ return (e for e in self.lEdge if isinstance(e, Edge_LL))
+
+
+ @classmethod
+ def computeSpecialEdges(self, lClassicPageNode, lSpecialPageNode):
+ """
+ Compute:
+ - edges between each block and the cut line above/across/below the block
+ - edges between cut lines
+ return a list of edges
+ """
+ #augment the block with the coordinate of its baseline central point
+ for blk in lClassicPageNode:
+ try:
+ pt = blk.shape.centroid
+ blk.x_bslne = pt.x
+ blk.y_bslne = pt.y
+ except IndexError:
+ traceln("** WARNING: no Baseline in ", blk.domid)
+ traceln("** Using BB instead... :-/")
+ blk.x_bslne = (blk.x1+blk.x2) / 2
+ blk.y_bslne = (blk.y1+blk.y2) / 2
+ blk._in_edge_up = 0 # count of incoming edge from upper lines
+ blk._in_edge_down = 0 # count of incoming edge from downward lines
+
+ #block to cut line edges
+ # no _type=0 because they are valid cut (never crossing any block)
+ lEdge = []
+ for cutBlk in lSpecialPageNode:
+ #equation of the line
+ # y = A x + B
+ A = (cutBlk.y2 - cutBlk.y1) / (cutBlk.x2 - cutBlk.x1)
+ B = cutBlk.y1 - A * cutBlk.x1
+ oCut = cutBlk.shape
+ for blk in lClassicPageNode:
+ dist = oCut.distance(blk.shape)
+ if dist <= self.iBlockVisibility:
+ edge = Edge_BL(blk, cutBlk) # Block _to_ Cut !!
+ # experiments show that abs helps
+ # edge.len = (blk.y_bslne - cutBlk.y1) / self.iBlockVisibility
+ edge.len = dist / self.iBlockVisibility
+ y = A * blk.x_bslne + B # y of the point on cut line
+ # edge._type = -1 if blk.y_bslne > y else (+1 if blk.y_bslne < y else 0)
+ # shapely can give as distance a very small number while y == 0
+ edge._type = -1 if blk.y_bslne >= y else +1
+ assert edge._type != 0, (str(oCut), list(blk.shape.coords), oCut.distance(blk.shape.centroid), str(blk.shape.centroid))
+ lEdge.append(edge)
+
+ #now filter those edges
+ n0 = len(lEdge)
+ #lEdge = self._filterBadEdge(lEdge, lClassicPageNode, lSpecialPageNode)
+ lEdge = self._filterBadEdge(lEdge, lSpecialPageNode)
+
+ traceln(" - filtering: removed %d edges due to obstruction." % (n0-len(lEdge)))
+
+ # add a counter of incoming edge to nodes, for features eng.
+ for edge in lEdge:
+ if edge._type > 0:
+ edge.A._in_edge_up += 1
+ else:
+ edge.A._in_edge_down += 1
+
+ # Cut line to Cut line edges
+ n0 = len(lEdge)
+ if self.iLineVisibility > 0:
+ for i, A in enumerate(lSpecialPageNode):
+ for B in lSpecialPageNode[i+1:]:
+ dist = A.shape.distance(B.shape)
+ if dist <= self.iLineVisibility:
+ edge = Edge_LL(A, B)
+ edge.len = dist / self.iLineVisibility
+ lEdge.append(edge)
+ traceln(" - edge_LL: added %d edges." % (len(lEdge)-n0))
+
+ return lEdge
+
+
+ @classmethod
+ def _filterBadEdge(cls, lEdge, lCutLine, fRatio=0.25):
+ """
+ We get
+ - a list of block2Line edges
+ - a sorted list of cut line
+ But some block should not be connected to a line due to obstruction by
+ another blocks.
+ We filter out those edges...
+ return a sub-list of lEdge
+ """
+ lKeepEdge = []
+
+
+ def isTargetLineVisible_X(edge, lEdge, fThreshold=0.9):
+ """
+ can the source node of the edge see the target node line?
+ we say no if some other block obstructs half or more of the view
+ """
+ a1, a2 = edge.A.x1, edge.A.x2
+ w = a2 - a1
+ minVisibility = w * fThreshold
+ for _edge in lEdge:
+ # we want a visibility window of at least 1/4 of the object A
+ b1, b2 = _edge.A.x1, _edge.A.x2
+ vis = min(w, max(0, b1 - a1) + max(0, a2 - b2))
+ if vis <= minVisibility: return False
+ return True
+
+ #there are two ways for dealing with lines crossed by a block
+ # - either it prevents another block to link to the line (assuming an x-overlap)
+ # - or not (historical way)
+ # THIS IS THE "MODERN" way!!
+
+ #take each line in turn
+ for ndLine in lCutLine:
+ #--- process downward edges
+ #TODO: index!
+ lDownwardAndXingEdge = [edge for edge in lEdge \
+ if edge._type > 0 and edge.B == ndLine]
+ if lDownwardAndXingEdge:
+ #sort edge by source block from closest to line block to farthest
+ #lDownwardAndXingEdge.sort(key=lambda o: ndLine.y1 - o.A.y_bslne)
+ lDownwardAndXingEdge.sort(key=lambda o: ndLine.shape.distance(o.A.shape))
+
+ lKeepDownwardEdge = [lDownwardAndXingEdge.pop(0)]
+
+ #now keep all edges whose source does not overlap vertically with
+ # the source of an edge that is kept
+ for edge in lDownwardAndXingEdge:
+ if isTargetLineVisible_X(edge, lKeepDownwardEdge):
+ lKeepDownwardEdge.append(edge)
+ lKeepEdge.extend(lKeepDownwardEdge)
+
+ #--- process upward edges
+ #TODO: index!
+ lUpwarAndXingdEdge = [edge for edge in lEdge \
+ if edge._type < 0 and edge.B == ndLine]
+ if lUpwarAndXingdEdge:
+                #sort edge by source block from closest to line block to farthest
+ #lUpwarAndXingdEdge.sort(key=lambda o: o.A.y_bslne - ndLine.y2)
+ lUpwarAndXingdEdge.sort(key=lambda o: ndLine.shape.distance(o.A.shape))
+ lKeepUpwardEdge = [lUpwarAndXingdEdge.pop(0)]
+
+ #now keep all edges whose source does not overlap vertically with
+ # the source of an edge that is kept
+ for edge in lUpwarAndXingdEdge:
+ if isTargetLineVisible_X(edge, lKeepUpwardEdge):
+ lKeepUpwardEdge.append(edge)
+
+ # now we keep only the edges, excluding the crossing ones
+ # (already included!!)
+ lKeepEdge.extend(edge for edge in lKeepUpwardEdge)
+
+        #--- and include the crossing ones (that are discarded)
+ return lKeepEdge
+
+
+#------------------------------------------------------------------------------------------------------
+class SupportBlock_NodeTransformer(Transformer):
+ """
+ aspects related to the "support" notion of a block versus a cut line
+ """
+ def transform(self, lNode):
+# a = np.empty( ( len(lNode), 5 ) , dtype=np.float64)
+# for i, blk in enumerate(lNode): a[i, :] = [blk.x1, blk.y2, blk.x2-blk.x1, blk.y2-blk.y1, blk.fontsize] #--- 2 3 4 5 6
+ a = np.empty( ( len(lNode), 2 ) , dtype=np.float64)
+ for i, blk in enumerate(lNode):
+ a[i, :] = (blk._in_edge_up, blk._in_edge_down)
+ return a
+
+#------------------------------------------------------------------------------------------------------
+class CutLine_NodeTransformer_v3(Transformer):
+ """
+ features of a Cut line:
+ - horizontal or vertical.
+ """
+ def transform(self, lNode):
+ #We allocate TWO more columns to store in it the tfidf and idf computed at document level.
+ #a = np.zeros( ( len(lNode), 10 ) , dtype=np.float64) # 4 possible orientations: 0, 1, 2, 3
+ N = 6
+ a = np.zeros( ( len(lNode), N ) , dtype=np.float64) # 4 possible orientations: 0, 1, 2, 3
+
+ for i, blk in enumerate(lNode):
+ page = blk.page
+ assert abs(blk.x2 - blk.x1) > abs(blk.y1 - blk.y2)
+ #horizontal
+ v = (blk.y1+blk.y2)/float(page.h) - 1 # to range -1, +1
+ a[i,:] = (1.0, v, v*v
+ , blk.angle, blk.angle_freq, blk.angle_cumul_freq)
+# else:
+# #vertical
+# v = 2*blk.x1/float(page.w) - 1 # to range -1, +1
+# a[i, N:] = (1.0, v, v*v
+# ,blk.angle, blk.angle_freq, blk.angle_cumul_freq)
+ # traceln("CutLine_NodeTransformer_v3", a[:min(100, len(lNode)),])
+ return a
+
+class CutLine_NodeTransformer_qty(Transformer):
+ """
+ features of a Cut line:
+ - horizontal or vertical.
+ """
+ def transform(self, lNode):
+ #We allocate TWO more columns to store in it the tfidf and idf computed at document level.
+ #a = np.zeros( ( len(lNode), 10 ) , dtype=np.float64) # 4 possible orientations: 0, 1, 2, 3
+ N = 1
+ a = np.zeros( ( len(lNode), 2*N ) , dtype=np.float64) # 4 possible orientations: 0, 1, 2, 3
+
+ for i, blk in enumerate(lNode):
+ assert abs(blk.x2 - blk.x1) > abs(blk.y1 - blk.y2)
+ #horizontal
+ a[i,:] = (len(blk.set_support))
+ return a
+
+
+#------------------------------------------------------------------------------------------------------
+class Block2CutLine_EdgeTransformer(Transformer):
+ """
+ features of a block to Cut line edge:
+ - below, crossing, above
+ """
+ def transform(self, lEdge):
+ N = 8
+ a = np.zeros( ( len(lEdge), 2 * N) , dtype=np.float64)
+ for i, edge in enumerate(lEdge):
+ z = 0 if edge._type < 0 else N # _type is -1 or 1
+ blk = edge.A
+ page = blk.page
+ w = float(page.w) # h = float(page.h)
+ x = (blk.x1 + blk.x2) / w - 1 # [-1, +1]
+ a[i, z:z+N] = (1.0
+ , edge.len
+ , edge.len*edge.len
+ , edge.B.angle_freq
+ , edge.B.angle_cumul_freq
+ , 1.0 if edge.A.du_index in edge.B.set_support else 0.0
+ , x, x * x
+ )
+# print(a[i,:].tolist())
+ # traceln("Block2CutLine_EdgeTransformer", a[:min(100, len(lEdge)),])
+ return a
+
+class Block2CutLine_EdgeTransformer_qtty(Transformer):
+ def transform(self, lEdge):
+ N = 3
+ a = np.zeros( ( len(lEdge), 2 * N) , dtype=np.float64)
+ for i, edge in enumerate(lEdge):
+ z = 0 if edge._type < 0 else N # _type is -1 or 1
+ a[i, z:z+N] = (len(edge.B.set_support)
+ , edge.A._in_edge_up
+ , edge.A._in_edge_down
+ )
+# print(a[i,:].tolist())
+ # traceln("Block2CutLine_EdgeTransformer", a[:min(100, len(lEdge)),])
+ return a
+
+class Block2CutLine_FakeEdgeTransformer(Transformer):
+ """
+ a fake transformer that return as many features as the union of real ones above
+ """
+ def transform(self, lEdge):
+ assert not(lEdge)
+ return np.zeros( ( len(lEdge), 2*8 + 2*3) , dtype=np.float64)
+
+
+class CutLine2CutLine_EdgeTransformer(Transformer): # ***** USELESS *****
+ """
+ features of a block to Cut line edge:
+ - below, crossing, above
+ """
+# BEST SO FAR
+# def transform(self, lEdge):
+# a = np.zeros( ( len(lEdge), 4 ) , dtype=np.float64)
+# for i, edge in enumerate(lEdge):
+# a[i,:] = (1, edge.len, edge.len * edge.len, int(edge.len==0))
+# # traceln("CutLine2CutLine_EdgeTransformer", a[:min(100, len(lEdge)),])
+# return a
+
+# WORSE
+# def transform(self, lEdge):
+# a = np.zeros( ( len(lEdge), 12) , dtype=np.float64)
+# for i, edge in enumerate(lEdge):
+# dAngle = (edge.A.angle - edge.B.angle) / 5 # we won't go beyond +-5 degrees.
+# iSameSupport = int(len(edge.B.set_support.intersection(edge.A.set_support)) > 0)
+# iCrosses = int(edge.A.shape.crosses(edge.B.shape))
+# a[i,:] = (1
+# , edge.len, edge.len * edge.len, int(edge.len==0), int(edge.len < 5)
+# , dAngle, dAngle * dAngle, int(abs(dAngle) < 0.1), int(abs(dAngle) < 0.1)
+# , iSameSupport
+# , iCrosses
+# , (1-iSameSupport) * iCrosses # not same support but crossing each other
+# )
+# return a
+
+ def transform(self, lEdge):
+ a = np.zeros( ( len(lEdge), 7 ) , dtype=np.float64)
+ for i, edge in enumerate(lEdge):
+ dAngle = (edge.A.angle - edge.B.angle) / 5 # we won't go beyond +-5 degrees.
+ iSameSupport = int(len(edge.B.set_support.intersection(edge.A.set_support)) > 0)
+ iCrosses = int(edge.A.shape.crosses(edge.B.shape))
+ a[i,:] = (1, edge.len, edge.len * edge.len
+ , dAngle, dAngle * dAngle
+ , iSameSupport
+ , iCrosses
+ )
+ # traceln("CutLine2CutLine_EdgeTransformer", a[:min(100, len(lEdge)),])
+ return a
+
+
+
+class My_FeatureDefinition_v3_base(FeatureDefinition):
+ n_QUANTILES = 16
+ n_QUANTILES_sml = 8
+
+ def __init__(self, **kwargs):
+ """
+ set _node_transformer, _edge_transformer, tdifNodeTextVectorizer
+ """
+ FeatureDefinition.__init__(self)
+ self._node_transformer = None
+ self._edge_transformer = None
+ self._node_text_vectorizer = None #tdifNodeTextVectorizer
+
+ def fitTranformers(self, lGraph,lY=None):
+ """
+ Fit the transformers using the graphs, but TYPE BY TYPE !!!
+ return True
+ """
+ self._node_transformer[0].fit([nd for g in lGraph for nd in g.getNodeListByType(0)])
+ self._node_transformer[1].fit([nd for g in lGraph for nd in g.getNodeListByType(1)])
+
+ self._edge_transformer[0].fit([e for g in lGraph for e in g.getEdgeListByType(0, 0)])
+ self._edge_transformer[1].fit([e for g in lGraph for e in g.getEdgeListByType(0, 1)])
+ self._edge_transformer[2].fit([e for g in lGraph for e in g.getEdgeListByType(1, 0)])
+ self._edge_transformer[3].fit([e for g in lGraph for e in g.getEdgeListByType(1, 1)])
+
+ return True
+
+class My_FeatureDefinition_v3(My_FeatureDefinition_v3_base):
+    """
+    Multitype version:
+    so the node_transformer actually is a list of node_transformer of length n_class
+    the edge_transformer actually is a list of node_transformer of length n_class^2
+
+    We also inherit from FeatureDefinition_T !!!
+    """
+
+    def __init__(self, **kwargs):
+        """
+        Build _node_transformer (one entry per node type) and _edge_transformer
+        (one entry per ordered pair of node types). No textual features here.
+        """
+        My_FeatureDefinition_v3_base.__init__(self)
+
+        nbTypes = self._getTypeNumber(kwargs)
+
+        # type-0 nodes (text blocks): geometry + support-edge count
+        # + neighbour features + 1-hot features
+        block_transformer = FeatureUnion([  # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
+            ("xywh", Pipeline([
+                ('selector', NodeTransformerXYWH_v2()),
+                ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
+            ])),
+            ("edge_cnt", Pipeline([
+                ('selector', SupportBlock_NodeTransformer()),
+                ('edge_cnt', QuantileTransformer(n_quantiles=self.n_QUANTILES_sml, copy=False))  # use in-place scaling
+            ])),
+            ("neighbors", Pipeline([
+                ('selector', NodeTransformerNeighbors()),
+                ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
+            ])),
+            ("1hot", Pipeline([
+                ('1hot', Node1HotFeatures_noText())  # does the 1-hot encoding directly
+            ])),
+        ])
+
+        # type-1 nodes (skewed cut lines)
+        Cut_line_transformer = FeatureUnion([
+            ("std", CutLine_NodeTransformer_v3()),
+            ("qty", Pipeline([
+                ('selector', CutLine_NodeTransformer_qty()),
+                ('quantile', QuantileTransformer(n_quantiles=self.n_QUANTILES_sml, copy=False))  # use in-place scaling
+            ])),
+        ])
+
+        # indexed by node type: 0 = text block, 1 = cut line
+        self._node_transformer = TransformerListByType([block_transformer, Cut_line_transformer])
+
+        # block-to-block edges
+        edge_BB_transformer = FeatureUnion([  # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
+            ("1hot", Pipeline([
+                ('1hot', Edge1HotFeatures_noText(PageNumberSimpleSequenciality()))
+            ])),
+            ("boolean", Pipeline([
+                ('boolean', EdgeBooleanFeatures_v2())
+            ])),
+            ("numerical", Pipeline([
+                ('selector', EdgeNumericalSelector_noText()),
+                ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
+            ])),
+        ])
+        # block-to-cut-line edges
+        edge_BL_transformer = FeatureUnion([
+            ("std", Block2CutLine_EdgeTransformer()),
+            ("qty", Pipeline([
+                ('selector', Block2CutLine_EdgeTransformer_qtty()),
+                ('quantile', QuantileTransformer(n_quantiles=self.n_QUANTILES_sml, copy=False))  # use in-place scaling
+            ])),
+        ])
+
+        edge_LL_transformer = CutLine2CutLine_EdgeTransformer()
+        # indexed by (source type, target type): (0,0), (0,1), (1,0), (1,1)
+        self._edge_transformer = TransformerListByType([
+            edge_BB_transformer,
+            edge_BL_transformer,
+            # the (1,0) line->block slot is required by the API but unused:
+            # fit is called with [], which would make a real Pipeline explode
+            Block2CutLine_FakeEdgeTransformer(),
+            edge_LL_transformer,
+        ])
+
+
+
+# translation table mapping every ASCII digit to 'N'
+gTBL = str.maketrans("0123456789", "NNNNNNNNNN")
+
+
+def My_FeatureDefinition_v3_txt_preprocess(s):
+    """
+    Normalization of the text before extracting ngrams:
+    lowercase it and replace each digit by 'N'.
+
+    Kept as a module-level function (not a lambda) so that a fitted
+    CountVectorizer referencing it remains picklable.
+    """
+    return s.lower().translate(gTBL)
+
+
+class My_FeatureDefinition_v3_txt(My_FeatureDefinition_v3_base):
+    """
+    Multitype version:
+    so the node_transformer actually is a list of node_transformer of length n_class
+    the edge_transformer actually is a list of node_transformer of length n_class^2
+
+    Same as My_FeatureDefinition_v3, plus textual features (char ngrams)
+    on the text-block nodes.
+    """
+    t_ngrams_range = (2, 4)  # char ngram lengths extracted from node text
+    n_ngrams = 1000          # vocabulary size of the ngram vectorizer
+
+    def __init__(self, **kwargs):
+        """
+        set _node_transformer, _edge_transformer, and the node text vectorizer
+        """
+        My_FeatureDefinition_v3_base.__init__(self)
+
+        nbTypes = self._getTypeNumber(kwargs)
+
+        # since we have a preprocessor, lowercase and strip_accents options are disabled;
+        # the preprocessor is a module-level function because a local lambda
+        # cannot be pickled with the fitted vectorizer
+        self._node_text_vectorizer = CountVectorizer(analyzer='char'
+                                                     , preprocessor=My_FeatureDefinition_v3_txt_preprocess
+                                                     , max_features=self.n_ngrams
+                                                     , ngram_range=self.t_ngrams_range  # (2,6)
+                                                     , dtype=np.float64)
+
+        # type-0 nodes (text blocks): text ngrams + text length + geometry
+        # + support-edge count + neighbour features + 1-hot features
+        block_transformer = FeatureUnion([  # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
+            ("text", Pipeline([
+                ('selector', NodeTransformerText())
+                , ('vecto', self._node_text_vectorizer)  # we can use it separately from the pipeline once fitted
+                , ('todense', SparseToDense())  # pystruct needs an array, not a sparse matrix
+            ])),
+            ("textlen", Pipeline([
+                ('selector', NodeTransformerTextLen()),
+                ('textlen', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
+            ])),
+            ("xywh", Pipeline([
+                ('selector', NodeTransformerXYWH_v2()),
+                ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
+            ])),
+            ("edge_cnt", Pipeline([
+                ('selector', SupportBlock_NodeTransformer()),
+                ('edge_cnt', QuantileTransformer(n_quantiles=self.n_QUANTILES_sml, copy=False))  # use in-place scaling
+            ])),
+            ("neighbors", Pipeline([
+                ('selector', NodeTransformerNeighbors()),
+                ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
+            ])),
+            ("1hot", Pipeline([
+                ('1hot', Node1HotFeatures_noText())  # does the 1-hot encoding directly
+            ])),
+        ])
+
+        # type-1 nodes (skewed cut lines)
+        Cut_line_transformer = FeatureUnion([
+            ("std", CutLine_NodeTransformer_v3()),
+            ("qty", Pipeline([
+                ('selector', CutLine_NodeTransformer_qty()),
+                ('quantile', QuantileTransformer(n_quantiles=self.n_QUANTILES_sml, copy=False))  # use in-place scaling
+            ])),
+        ])
+
+        # indexed by node type: 0 = text block, 1 = cut line
+        self._node_transformer = TransformerListByType([block_transformer, Cut_line_transformer])
+
+        # block-to-block edges (EdgeNumericalSelector_v2, unlike the no-text variant)
+        edge_BB_transformer = FeatureUnion([  # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
+            ("1hot", Pipeline([
+                ('1hot', Edge1HotFeatures_noText(PageNumberSimpleSequenciality()))
+            ])),
+            ("boolean", Pipeline([
+                ('boolean', EdgeBooleanFeatures_v2())
+            ])),
+            ("numerical", Pipeline([
+                ('selector', EdgeNumericalSelector_v2()),
+                ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
+            ])),
+        ])
+        # block-to-cut-line edges
+        edge_BL_transformer = FeatureUnion([
+            ("std", Block2CutLine_EdgeTransformer()),
+            ("qty", Pipeline([
+                ('selector', Block2CutLine_EdgeTransformer_qtty()),
+                ('quantile', QuantileTransformer(n_quantiles=self.n_QUANTILES_sml, copy=False))  # use in-place scaling
+            ])),
+        ])
+
+        edge_LL_transformer = CutLine2CutLine_EdgeTransformer()
+        # indexed by (source type, target type): (0,0), (0,1), (1,0), (1,1)
+        self._edge_transformer = TransformerListByType([
+            edge_BB_transformer,
+            edge_BL_transformer,
+            # the (1,0) line->block slot is required by the API but unused:
+            # fit is called with [], which would make a real Pipeline explode
+            Block2CutLine_FakeEdgeTransformer(),
+            edge_LL_transformer,
+        ])
+
+    def cleanTransformers(self):
+        """
+        The fitted text vectorizer keeps its stop_words_ attribute => huge pickled file!!!
+        Drop it before pickling.
+
+        Here the fix is a bit rough. There are better ways....
+        JL
+        """
+        # the "text" pipeline is 1st in the node FeatureUnion; its step [1] is 'vecto'
+        self._node_transformer[0].transformer_list[0][1].steps[1][1].stop_words_ = None
+        return self._node_transformer, self._edge_transformer
+
+
+def test_preprocess(capsys):
+    """
+    pytest test of the lowercase + digit-masking recipe
+    (same as My_FeatureDefinition_v3_txt_preprocess).
+    """
+    with capsys.disabled():
+        print("toto")
+        tbl = str.maketrans("0123456789", "NNNNNNNNNN")
+        fun = lambda x: x.lower().translate(tbl)
+        assert "abc" == fun("abc")
+        assert "abc" == fun("ABC")
+        assert "abcdé" == fun("ABCdé")
+        assert "tüv" == fun("tÜv")
+        assert "tüv NN " == fun("tÜv 12 ")
+        assert "" == fun("")
+        assert "N" == fun("1")
+        assert "NN" == fun("23")
+        assert "j't'aime moi non plus. dites NN!!" == fun("J't'aime MOI NON PlUs. Dites 33!!")
+        # NOTE(review): the three duplicated empty-string asserts below look like
+        # test strings lost in transit (possibly non-ASCII content) - TODO restore
+        assert "" == fun("")
+        assert "" == fun("")
+        assert "" == fun("")
+
+
+class NodeType_PageXml_Cut_Shape(NodeType_PageXml_type):
+    """
+    we specialize it because our cuts are near horizontal
+    """
+    def _iter_GraphNode(self, doc, domNdPage, page):
+        """
+        Get the DOM, the DOM page node, the page object
+
+        iterator on the DOM, that returns nodes (of class Block)
+        """
+        #--- XPATH contexts
+        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
+        lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS)  # all relevant nodes of the page
+
+        for ndBlock in lNdBlock:
+            domid = ndBlock.get("id")
+            sText = ""  # cut separators carry no text
+
+            # now we need to infer the bounding box of that object
+            # NOTE(review): assumes the polygon has exactly two points (a segment);
+            # the unpacking raises ValueError otherwise - TODO confirm
+            (x1, y1), (x2, y2) = PageXml.getPointList(ndBlock)  # the polygon
+
+            orientation = 0
+            classIndex = 0  # is computed later on
+
+            # and create a Block
+            # we pass the coordinates, not x1,y1,w,h !!
+            cutBlk = Block(page, ((x1, y1), (x2, y2)), sText, orientation, classIndex, self, ndBlock, domid=domid)
+
+            # Create the shapely shape
+            cutBlk.shape = geom.LineString([(x1, y1), (x2, y2)])
+            # NOTE(review): the DU_angle* attributes are expected on the element;
+            # float(None) raises TypeError if one is missing - TODO confirm always set
+            cutBlk.angle = float(ndBlock.get("DU_angle"))
+            cutBlk.angle_freq = float(ndBlock.get("DU_angle_freq"))
+            cutBlk.angle_cumul_freq = float(ndBlock.get("DU_angle_cumul_freq"))
+            cutBlk.set_support = literal_eval(ndBlock.get("DU_set_support"))
+
+            yield cutBlk
+
+        return
+
+
+# ----------------------------------------------------------------------------
+
+def main(TableSkewedRowCut_CLASS, sModelDir, sModelName, options):
+ """
+ TableSkewedRowCut_CLASS must be a class inheriting from DU_Graph_CRF
+ """
+ lDegAngle = [float(s) for s in options.lsAngle.split(",")]
+ lRadAngle = [math.radians(v) for v in lDegAngle]
+
+ doer = TableSkewedRowCut_CLASS(sModelName, sModelDir,
+ iBlockVisibility = options.iBlockVisibility,
+ iLineVisibility = options.iLineVisibility,
+ fCutHeight = options.fCutHeight,
+ bCutAbove = options.bCutAbove,
+ lRadAngle = lRadAngle,
+ bTxt = options.bTxt,
+ C = options.crf_C,
+ tol = options.crf_tol,
+ njobs = options.crf_njobs,
+ max_iter = options.max_iter,
+ inference_cache = options.crf_inference_cache)
+
+ if options.rm:
+ doer.rm()
+ return
+
+ lTrn, lTst, lRun, lFold = [_checkFindColDir(lsDir, bAbsolute=False) for lsDir in [options.lTrn, options.lTst, options.lRun, options.lFold]]
+# if options.bAnnotate:
+# doer.annotateDocument(lTrn)
+# traceln('annotation done')
+# sys.exit(0)
+
+
+ traceln("- classes: ", doer.getGraphClass().getLabelNameList())
+
+ ## use. a_mpxml files
+ #doer.sXmlFilenamePattern = doer.sLabeledXmlFilenamePattern
+
+
+ if options.iFoldInitNum or options.iFoldRunNum or options.bFoldFinish:
+ if options.iFoldInitNum:
+ """
+ initialization of a cross-validation
+ """
+ splitter, ts_trn, lFilename_trn = doer._nfold_Init(lFold, options.iFoldInitNum, bStoreOnDisk=True)
+ elif options.iFoldRunNum:
+ """
+ Run one fold
+ """
+ oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum, options.warm, options.pkl)
+ traceln(oReport)
+ elif options.bFoldFinish:
+ tstReport = doer._nfold_Finish()
+ traceln(tstReport)
+ else:
+ assert False, "Internal error"
+ #no more processing!!
+ exit(0)
+ #-------------------
+
+ if lFold:
+ loTstRpt = doer.nfold_Eval(lFold, 3, .25, None, options.pkl)
+ sReportPickleFilename = os.path.join(sModelDir, sModelName + "__report.txt")
+ traceln("Results are in %s"%sReportPickleFilename)
+ graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, loTstRpt)
+ elif lTrn:
+ doer.train_save_test(lTrn, lTst, options.warm, options.pkl)
+ try: traceln("Baseline best estimator: %s"%doer.bsln_mdl.best_params_) #for CutSearch
+ except: pass
+ traceln(" --- CRF Model ---")
+ traceln(doer.getModel().getModelInfo())
+ elif lTst:
+ doer.load()
+ tstReport = doer.test(lTst)
+ traceln(tstReport)
+ if options.bDetailedReport:
+ traceln(tstReport.getDetailledReport())
+ sReportPickleFilename = os.path.join(sModelDir, sModelName + "__detailled_report.txt")
+ graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, tstReport)
+
+ if lRun:
+ if options.storeX or options.applyY:
+ try: doer.load()
+ except: pass #we only need the transformer
+ lsOutputFilename = doer.runForExternalMLMethod(lRun, options.storeX, options.applyY, options.bRevertEdges)
+ else:
+ doer.load()
+ lsOutputFilename = doer.predict(lRun)
+
+ traceln("Done, see in:\n %s"%lsOutputFilename)
+
+
+def main_command_line(TableSkewedRowCut_CLASS):
+ version = "v.01"
+ usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(sys.argv[0], version)
+# parser.add_option("--annotate", dest='bAnnotate', action="store_true",default=False, help="Annotate the textlines with BIES labels")
+
+ #FOR GCN
+ parser.add_option("--revertEdges", dest='bRevertEdges', action="store_true", help="Revert the direction of the edges")
+ parser.add_option("--detail", dest='bDetailedReport', action="store_true", default=False,help="Display detailed reporting (score per document)")
+ parser.add_option("--baseline", dest='bBaseline', action="store_true", default=False, help="report baseline method")
+ parser.add_option("--line_see_line", dest='iLineVisibility', action="store",
+ type=int, default=GraphSkewedCut.iLineVisibility,
+ help="seeline2line: how far in pixel can a line see another cut line?")
+ parser.add_option("--block_see_line", dest='iBlockVisibility', action="store",
+ type=int, default=GraphSkewedCut.iBlockVisibility,
+ help="seeblock2line: how far in pixel can a block see a cut line?")
+ parser.add_option("--height", dest="fCutHeight", default=GraphSkewedCut.fCutHeight
+ , action="store", type=float, help="Minimal height of a cut")
+ parser.add_option("--cut-above", dest='bCutAbove', action="store_true", default=False
+ ,help="Each object defines one or several cuts above it (instead of below as by default)")
+ parser.add_option("--angle", dest='lsAngle'
+ , action="store", type="string", default="-1,0,+1"
+ ,help="Allowed cutting angles, in degree, comma-separated")
+
+ parser.add_option("--graph", dest='bGraph', action="store_true", help="Store the graph in the XML for displaying it")
+
+ # ---
+ #parse the command line
+ (options, args) = parser.parse_args()
+
+ if options.bGraph:
+ import os.path
+ # hack
+ TableSkewedRowCut_CLASS.bCutAbove = options.bCutAbove
+ traceln("\t%s.bCutAbove=" % TableSkewedRowCut_CLASS.__name__, TableSkewedRowCut_CLASS.bCutAbove)
+ TableSkewedRowCut_CLASS.lRadAngle = [math.radians(v) for v in [float(s) for s in options.lsAngle.split(",")]]
+ traceln("\t%s.lRadAngle=" % TableSkewedRowCut_CLASS.__name__, TableSkewedRowCut_CLASS.lRadAngle)
+ for sInputFilename in args:
+ sp, sf = os.path.split(sInputFilename)
+ sOutFilename = os.path.join(sp, "graph-" + sf)
+ doer = TableSkewedRowCut_CLASS("debug", "."
+ , iBlockVisibility=options.iBlockVisibility
+ , iLineVisibility=options.iLineVisibility
+ , fCutHeight=options.fCutHeight
+ , bCutAbove=options.bCutAbove
+ , lRadAngle=[math.radians(float(s)) for s in options.lsAngle.split(",")])
+ o = doer.cGraphClass()
+ o.parseDocFile(sInputFilename, 9)
+ o.parseDocLabels()
+ o.addParsedLabelToDom()
+ o.addEdgaddEdgeToDoc print('Graph edges added to %s'%sOutFilename)
+ o.doc.write(sOutFilename, encoding='utf-8',pretty_print=True,xml_declaration=True)
+ SkewedCutAnnotator.gtStatReport()
+ exit(0)
+
+ # ---
+ try:
+ sModelDir, sModelName = args
+ except Exception as e:
+ traceln("Specify a model folder and a model name!")
+ _exit(usage, 1, e)
+
+ main(TableSkewedRowCut_CLASS, sModelDir, sModelName, options)
+
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+    # default concrete task when this module is executed directly
+    from tasks.DU_ABPTableSkewed_txtBIO_sepSIO import DU_ABPTableSkewedRowCut
+    main_command_line(DU_ABPTableSkewedRowCut)
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIESO_sepSIO.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIESO_sepSIO.py
index 213ec3d..13aa064 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIESO_sepSIO.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIESO_sepSIO.py
@@ -12,18 +12,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -59,13 +48,13 @@ class NodeType_BIESO_Shape(NodeType_PageXml_type_woText):
"""
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
-
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
sXmlLabel = {'B':'B',
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIESO_sepSIO_line_hack.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIESO_sepSIO_line_hack.py
index 8a388d5..03d7567 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIESO_sepSIO_line_hack.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIESO_sepSIO_line_hack.py
@@ -14,18 +14,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -87,13 +76,13 @@ class NodeType_BIESO_Shape(NodeType_PageXml_type_woText):
"""
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
-
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
sXmlLabel = {'B':'B',
@@ -142,12 +131,12 @@ class GraphSkewedCut_H_lines(GraphSkewedCut_H):
shaper_fun = ShapeLoader.node_to_SingleLine
- def addEdgeToDOM(self):
+ def addEdgeToDoc(self):
"""
To display the grpah conveniently we add new Edge elements
Since we change the BAseline representation, we show the new one
"""
- super().addEdgeToDOM()
+ super().addEdgeToDoc()
for blk in self.lNode:
assert blk.type.name in ["row", "sepH"], blk.type.name
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIOH_sepSIO_line.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIOH_sepSIO_line.py
new file mode 100644
index 0000000..68db140
--- /dev/null
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIOH_sepSIO_line.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+
+"""
+ *** Same as DU_ABPTableSkewed_txtBIO_sepSIO_line, except that text have BIOH as labels
+
+ DU task for ABP Table:
+ doing jointly row BIOH and near horizontal cuts SIO
+
+ block2line edges do not cross another block.
+
+ The cut are based on baselines of text blocks, with some positive or negative inclination.
+
+ - the labels of cuts are SIO
+
+ Copyright Naver Labs Europe(C) 2018 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+
+
+
+import sys, os
+
+from lxml import etree
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+from tasks.DU_CRF_Task import DU_CRF_Task
+from tasks.DU_ABPTableSkewed import My_FeatureDefinition_v3, NodeType_PageXml_Cut_Shape, main_command_line
+from tasks.DU_ABPTableSkewed_txtBIO_sepSIO import NodeType_BIESO_to_BIO_Shape_txt
+from tasks.DU_ABPTableSkewed_txtBIO_sepSIO_line import GraphSkewedCut_H_lines, DU_ABPTableSkewedRowCutLine
+
+
+class NodeType_BIESO_to_BIOH_Shape_txt(NodeType_BIESO_to_BIO_Shape_txt):
+    """
+    Convert BIESO labeling to BIO + 'CH' (column header)
+    """
+
+    def parseDocNodeLabel(self, graph_node, defaultCls=None):
+        """
+        Parse and set the graph node label and return its class index
+        raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
+        """
+        sLabel = self.sDefaultLabel
+        domnode = graph_node.node
+
+        # a column header (@DU_header == 'CH') takes precedence over @<sLabelAttr>
+        sXmlLabel = domnode.get("DU_header")
+        if sXmlLabel != 'CH':
+            sXmlLabel = domnode.get(self.sLabelAttr)
+
+        # BIESO -> BIO (CH kept as such)
+        # NOTE(review): an unexpected or missing raw label raises KeyError here,
+        # bypassing the ValueError path below - TODO confirm acceptable
+        sXmlLabel = {'B': 'B',
+                     'I': 'I',
+                     'E': 'I',
+                     'S': 'B',
+                     'O': 'O',
+                     'CH': 'CH'}[sXmlLabel]
+        try:
+            sLabel = self.dXmlLabel2Label[sXmlLabel]
+        except KeyError:
+            # not a label of interest
+            try:
+                self.checkIsIgnored(sXmlLabel)
+            except:
+                raise ValueError("Invalid label '%s'"
+                                 " (from @%s or @%s) in node %s" % (sXmlLabel,
+                                                                    self.sLabelAttr,
+                                                                    self.sDefaultLabel,
+                                                                    etree.tostring(domnode)))
+
+        return sLabel
+
+
+class NodeType_BIESO_to_BIOH_Shape(NodeType_BIESO_to_BIOH_Shape_txt):
+    """
+    Same node type without text: the node text is forced to the empty string.
+    """
+    def _get_GraphNodeText(self, doc, domNdPage, ndBlock, ctxt=None):
+        return u""
+
+
+class DU_ABPTableSkewedRowCutLine_BIOH(DU_ABPTableSkewedRowCutLine):
+    """
+    We will do a CRF model for a DU task
+    , with the below labels
+    """
+
+    #=== CONFIGURATION ====================================================================
+    @classmethod
+    def getConfiguredGraphClass(cls):
+        """
+        In this class method, we must return a configured graph class
+        """
+
+        # Textline labels: Begin, Inside, Outside, Column Header
+        lLabels_BIOH_row = ['B', 'I', 'O', 'CH']
+
+        # Cut line labels: Separator, Inside, Outside
+        lLabels_SIO_Cut = ['S', 'I', 'O']
+
+        # DEFINING THE CLASS OF GRAPH WE USE
+        DU_GRAPH = GraphSkewedCut_H_lines
+
+        # propagate the task-level configuration onto the graph class
+        DU_GRAPH.iBlockVisibility = cls.iBlockVisibility
+        DU_GRAPH.iLineVisibility = cls.iLineVisibility
+        DU_GRAPH.fCutHeight = cls.fCutHeight
+        DU_GRAPH.bCutAbove = cls.bCutAbove
+        DU_GRAPH.lRadAngle = cls.lRadAngle
+        DU_GRAPH.bTxt = cls.bTxt
+
+        # ROW node type (with or without textual features)
+        ntR = (NodeType_BIESO_to_BIOH_Shape_txt if cls.bTxt
+               else NodeType_BIESO_to_BIOH_Shape
+               )("row"
+                 , lLabels_BIOH_row
+                 , None
+                 , False
+                 , None
+                 )
+        ntR.setLabelAttribute("DU_row")
+        ntR.setXpathExpr((".//pc:TextLine"                # how to find the nodes
+                          , "./pc:TextEquiv/pc:Unicode")  # how to get their text
+                         )
+        DU_GRAPH.addNodeType(ntR)
+
+        # CUT node type
+        ntCutH = NodeType_PageXml_Cut_Shape("sepH"
+                                            , lLabels_SIO_Cut
+                                            , None
+                                            , False
+                                            , None  # equiv. to: BBoxDeltaFun=lambda _: 0
+                                            )
+        ntCutH.setLabelAttribute("DU_type")
+        ntCutH.setXpathExpr(('.//pc:CutSeparator[@orient="0"]'  # how to find the nodes
+                             # the angle attribute give the true orientation (which is near 0)
+                             , "./pc:TextEquiv")  # how to get their text
+                            )
+        DU_GRAPH.addNodeType(ntCutH)
+
+        # only the "row" type is a "classic" node type
+        DU_GRAPH.setClassicNodeTypeList([ntR])
+
+        return DU_GRAPH
+
+
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+    # run the BIOH variant of the skewed-cut row task
+    main_command_line(DU_ABPTableSkewedRowCutLine_BIOH)
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIOStmb_sepSIO_line.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIOStmb_sepSIO_line.py
index e18c70f..9f5c119 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIOStmb_sepSIO_line.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIOStmb_sepSIO_line.py
@@ -12,18 +12,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -67,11 +56,12 @@ class NodeType_BIESO_to_SIOStSmSb_Shape(NodeType_BIESO_to_BIO_Shape):
'O':'O',
'CH':'CH'}
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
# in case we also deal with column headers
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIOStmb_sepSIO_line_hack.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIOStmb_sepSIO_line_hack.py
index fcc45ac..74e4f10 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIOStmb_sepSIO_line_hack.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIOStmb_sepSIO_line_hack.py
@@ -12,18 +12,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -96,11 +85,12 @@ class NodeType_BIESO_to_SIOStSmSb_Shape(NodeType_BIESO_to_BIO_Shape):
'O':'O',
'CH':'CH'}
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
# in case we also deal with column headers
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO.py
new file mode 100644
index 0000000..1de999b
--- /dev/null
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO.py
@@ -0,0 +1,222 @@
+# -*- coding: utf-8 -*-
+
+"""
+ DU task for ABP Table:
+ doing jointly row BIO and near horizontal cuts SIO
+
+ block2line edges do not cross another block.
+
+ The cut are based on baselines of text blocks, with some positive or negative inclination.
+
+ - the labels of cuts are SIO
+
+ Copyright Naver Labs Europe(C) 2018 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+
+
+
+import sys, os
+from lxml import etree
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+from tasks.DU_CRF_Task import DU_CRF_Task
+
+from util.Shape import ShapeLoader
+
+from tasks.DU_ABPTableSkewed import GraphSkewedCut_H, My_FeatureDefinition_v3, NodeType_PageXml_Cut_Shape, main_command_line
+from graph.NodeType_PageXml import NodeType_PageXml_type
+
+
+
+# class NodeType_BIESO_to_BIO_Shape(NodeType_PageXml_type_woText):
+class NodeType_BIESO_to_BIO_Shape_txt(NodeType_PageXml_type):
+    """
+    Convert BIESO labeling to BIO
+    """
+
+    def parseDocNodeLabel(self, graph_node, defaultCls=None):
+        """
+        Parse and set the graph node label and return its class index
+        raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
+        """
+        sLabel = self.sDefaultLabel
+        domnode = graph_node.node
+        sXmlLabel = domnode.get(self.sLabelAttr)
+
+        # BIESO -> BIO mapping
+        # NOTE(review): an unexpected or missing raw label raises KeyError here,
+        # bypassing the ValueError path below - TODO confirm acceptable
+        sXmlLabel = {'B': 'B',
+                     'I': 'I',
+                     'E': 'I',
+                     'S': 'B',
+                     'O': 'O'}[sXmlLabel]
+        try:
+            sLabel = self.dXmlLabel2Label[sXmlLabel]
+        except KeyError:
+            # not a label of interest
+            try:
+                self.checkIsIgnored(sXmlLabel)
+            except:
+                raise ValueError("Invalid label '%s'"
+                                 " (from @%s or @%s) in node %s" % (sXmlLabel,
+                                                                    self.sLabelAttr,
+                                                                    self.sDefaultLabel,
+                                                                    etree.tostring(domnode)))
+
+        return sLabel
+
+    def _iter_GraphNode(self, doc, domNdPage, page):
+        """
+        to add the shape object reflecting the baseline
+        """
+        for blk in super()._iter_GraphNode(doc, domNdPage, page):
+            try:
+                ndBaseline = blk.node.xpath(".//pc:Baseline", namespaces=self.dNS)[0]
+                try:
+                    o = ShapeLoader.node_to_LineString(ndBaseline)
+                except ValueError:
+                    traceln("SKIPPING INVALID Baseline: ", etree.tostring(ndBaseline))
+                    continue
+                blk.shape = o
+                blk.du_index = int(ndBaseline.get("du_index"))
+                yield blk
+            except:
+                # NOTE(review): bare-except best-effort - blocks without a Baseline
+                # or a du_index attribute are silently dropped from the graph
+                pass
+        return
+
+
+class NodeType_BIESO_to_BIO_Shape(NodeType_BIESO_to_BIO_Shape_txt):
+    """
+    Same node type without text: the node text is forced to the empty string.
+    """
+    def _get_GraphNodeText(self, doc, domNdPage, ndBlock, ctxt=None):
+        return u""
+
+
+class DU_ABPTableSkewedRowCut(DU_CRF_Task):
+    """
+    We will do a CRF model for a DU task
+    , with the below labels
+    """
+    sXmlFilenamePattern = "*[0-9].mpxml"
+
+    # task configuration, set by __init__; stored as class attributes so that
+    # getConfiguredGraphClass (a classmethod) can read them
+    iBlockVisibility = None
+    iLineVisibility = None
+    fCutHeight = None
+    bCutAbove = None
+    lRadAngle = None
+    bTxt = None  # use textual features?
+
+    #=== CONFIGURATION ====================================================================
+    @classmethod
+    def getConfiguredGraphClass(cls):
+        """
+        In this class method, we must return a configured graph class
+        """
+
+        # Textline labels: Begin, Inside, Outside
+        lLabels_BIO_row = ['B', 'I', 'O']
+
+        # Cut line labels: Separator, Inside, Outside
+        lLabels_SIO_Cut = ['S', 'I', 'O']
+
+        # DEFINING THE CLASS OF GRAPH WE USE
+        DU_GRAPH = GraphSkewedCut_H
+
+        # propagate the task-level configuration onto the graph class
+        DU_GRAPH.iBlockVisibility = cls.iBlockVisibility
+        DU_GRAPH.iLineVisibility = cls.iLineVisibility
+        DU_GRAPH.fCutHeight = cls.fCutHeight
+        DU_GRAPH.bCutAbove = cls.bCutAbove
+        DU_GRAPH.lRadAngle = cls.lRadAngle
+        DU_GRAPH.bTxt = cls.bTxt
+
+        # ROW node type (with or without textual features)
+        ntR = (NodeType_BIESO_to_BIO_Shape_txt if cls.bTxt
+               else NodeType_BIESO_to_BIO_Shape
+               )("row"
+                 , lLabels_BIO_row
+                 , None
+                 , False
+                 , BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v/3))
+                 )
+        ntR.setLabelAttribute("DU_row")
+        ntR.setXpathExpr((".//pc:TextLine"                # how to find the nodes
+                          , "./pc:TextEquiv/pc:Unicode")  # how to get their text
+                         )
+        DU_GRAPH.addNodeType(ntR)
+
+        # CUT node type
+        ntCutH = NodeType_PageXml_Cut_Shape("sepH"
+                                            , lLabels_SIO_Cut
+                                            , None
+                                            , False
+                                            , None  # equiv. to: BBoxDeltaFun=lambda _: 0
+                                            )
+        ntCutH.setLabelAttribute("DU_type")
+        ntCutH.setXpathExpr(('.//pc:CutSeparator[@orient="0"]'  # how to find the nodes
+                             # the angle attribute give the true orientation (which is near 0)
+                             , "./pc:TextEquiv")  # how to get their text
+                            )
+        DU_GRAPH.addNodeType(ntCutH)
+
+        # only the "row" type is a "classic" node type
+        DU_GRAPH.setClassicNodeTypeList([ntR])
+
+        return DU_GRAPH
+
+    def __init__(self, sModelName, sModelDir,
+                 iBlockVisibility=None,
+                 iLineVisibility=None,
+                 fCutHeight=None,
+                 bCutAbove=None,
+                 lRadAngle=None,
+                 bTxt=None,
+                 sComment=None,
+                 C=None, tol=None, njobs=None, max_iter=None,
+                 inference_cache=None):
+        """
+        Store the graph configuration on the class, then initialize the
+        DU_CRF_Task with the feature and learner configurations.
+        Unset learner options fall back to the defaults below.
+        """
+        # class-level storage: read back by getConfiguredGraphClass (a classmethod)
+        DU_ABPTableSkewedRowCut.iBlockVisibility = iBlockVisibility
+        DU_ABPTableSkewedRowCut.iLineVisibility = iLineVisibility
+        DU_ABPTableSkewedRowCut.fCutHeight = fCutHeight
+        DU_ABPTableSkewedRowCut.bCutAbove = bCutAbove
+        DU_ABPTableSkewedRowCut.lRadAngle = lRadAngle
+        DU_ABPTableSkewedRowCut.bTxt = bTxt
+
+        DU_CRF_Task.__init__(self
+                             , sModelName, sModelDir
+                             , dFeatureConfig={'row_row': {}, 'row_sepH': {},
+                                               'sepH_row': {}, 'sepH_sepH': {},
+                                               'sepH': {}, 'row': {}}
+                             , dLearnerConfig={
+                                 'C': .1 if C is None else C
+                                 , 'njobs': 4 if njobs is None else njobs
+                                 , 'inference_cache': 50 if inference_cache is None else inference_cache
+                                 # , 'tol': .1
+                                 , 'tol': .05 if tol is None else tol
+                                 , 'save_every': 50  # save every 50 iterations, for warm start
+                                 , 'max_iter': 10 if max_iter is None else max_iter
+                             }
+                             , sComment=sComment
+                             # , cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText
+                             , cFeatureDefinition=My_FeatureDefinition_v3
+                             )
+
+
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+    # run this task directly from the command line
+    main_command_line(DU_ABPTableSkewedRowCut)
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO_line.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO_line.py
new file mode 100644
index 0000000..41d20b7
--- /dev/null
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO_line.py
@@ -0,0 +1,186 @@
+# -*- coding: utf-8 -*-
+
+"""
+    *** Same as its parent except that text baselines are reflected as a LineString (instead of its centroid)
+
+ DU task for ABP Table:
+ doing jointly row BIO and near horizontal cuts SIO
+
+ block2line edges do not cross another block.
+
+ The cut are based on baselines of text blocks, with some positive or negative inclination.
+
+ - the labels of cuts are SIO
+
+ Copyright Naver Labs Europe(C) 2018 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+
+
+
+import sys, os
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+from xml_formats.PageXml import MultiPageXml
+from util.Shape import ShapeLoader
+
+from tasks.DU_CRF_Task import DU_CRF_Task
+from tasks.DU_ABPTableSkewed import GraphSkewedCut_H, My_FeatureDefinition_v3, NodeType_PageXml_Cut_Shape, main_command_line,\
+ My_FeatureDefinition_v3_txt
+from tasks.DU_ABPTableSkewed_txtBIO_sepSIO import NodeType_BIESO_to_BIO_Shape, NodeType_BIESO_to_BIO_Shape_txt
+
+
+class GraphSkewedCut_H_lines(GraphSkewedCut_H):
+
+ # reflecting text baseline as a LineString
+ shaper_fun = ShapeLoader.node_to_SingleLine
+
+
+ def addEdgeToDoc(self, Y=None):
+ """
+        To display the graph conveniently we add new Edge elements
+        Since we change the Baseline representation, we show the new one
+ """
+ super().addEdgeToDoc()
+
+ for blk in self.lNode:
+ assert blk.type.name in ["row", "sepH"], blk.type.name
+
+ if blk.type.name == "row":
+ ndBaseline = blk.node.xpath(".//pc:Baseline", namespaces=self.dNS)[0]
+ o = self.shaper_fun(ndBaseline)
+ MultiPageXml.setPoints(ndBaseline, list(o.coords))
+
+ return
+
+
+class DU_ABPTableSkewedRowCutLine(DU_CRF_Task):
+ """
+ We will do a CRF model for a DU task
+ , with the below labels
+ """
+ sXmlFilenamePattern = "*.mpxml"
+ #sXmlFilenamePattern = "*.pxml"
+
+ iBlockVisibility = None
+ iLineVisibility = None
+ fCutHeight = None
+ bCutAbove = None
+ lRadAngle = None
+ bTxt = None # use textual features?
+
+ #=== CONFIGURATION ====================================================================
+ @classmethod
+ def getConfiguredGraphClass(cls):
+ """
+ In this class method, we must return a configured graph class
+ """
+
+ # Textline labels
+ # Begin Inside End Single Other
+ lLabels_BIO_row = ['B', 'I', 'O']
+
+ # Cut lines:
+ # Border Ignore Separator Outside
+ lLabels_SIO_Cut = ['S', 'I', 'O']
+
+ #DEFINING THE CLASS OF GRAPH WE USE
+ DU_GRAPH = GraphSkewedCut_H_lines
+
+ DU_GRAPH.iBlockVisibility = cls.iBlockVisibility
+ DU_GRAPH.iLineVisibility = cls.iLineVisibility
+ DU_GRAPH.fCutHeight = cls.fCutHeight
+ DU_GRAPH.bCutAbove = cls.bCutAbove
+ DU_GRAPH.lRadAngle = cls.lRadAngle
+ DU_GRAPH.bTxt = cls.bTxt
+
+ # ROW
+ ntR = ( NodeType_BIESO_to_BIO_Shape_txt if cls.bTxt \
+ else NodeType_BIESO_to_BIO_Shape \
+ )("row"
+ , lLabels_BIO_row
+ , None
+ , False
+ , BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v/3))
+ )
+ ntR.setLabelAttribute("DU_row")
+ ntR.setXpathExpr( (".//pc:TextLine" #how to find the nodes
+ , "./pc:TextEquiv/pc:Unicode") #how to get their text
+ )
+ DU_GRAPH.addNodeType(ntR)
+
+ # CUT
+ ntCutH = NodeType_PageXml_Cut_Shape("sepH"
+ , lLabels_SIO_Cut
+ , None
+ , False
+ , None # equiv. to: BBoxDeltaFun=lambda _: 0
+ )
+ ntCutH.setLabelAttribute("DU_type")
+ ntCutH.setXpathExpr( ('.//pc:CutSeparator[@orient="0"]' #how to find the nodes
+                              # the angle attribute gives the true orientation (which is near 0)
+ , "./pc:TextEquiv") #how to get their text
+ )
+ DU_GRAPH.addNodeType(ntCutH)
+
+ DU_GRAPH.setClassicNodeTypeList( [ntR ])
+
+ return DU_GRAPH
+
+ def __init__(self, sModelName, sModelDir
+ , iBlockVisibility = None
+ , iLineVisibility = None
+ , fCutHeight = None
+ , bCutAbove = None
+ , lRadAngle = None
+ , bTxt = None
+ , sComment = None
+ , cFeatureDefinition = None
+ , dFeatureConfig = {}
+ , C=None, tol=None, njobs=None, max_iter=None
+ , inference_cache=None):
+
+ DU_ABPTableSkewedRowCutLine.iBlockVisibility = iBlockVisibility
+ DU_ABPTableSkewedRowCutLine.iLineVisibility = iLineVisibility
+ DU_ABPTableSkewedRowCutLine.fCutHeight = fCutHeight
+ DU_ABPTableSkewedRowCutLine.bCutAbove = bCutAbove
+ DU_ABPTableSkewedRowCutLine.lRadAngle = lRadAngle
+ DU_ABPTableSkewedRowCutLine.bTxt = bTxt
+
+ DU_CRF_Task.__init__(self
+ , sModelName, sModelDir
+ , dFeatureConfig = {'row_row':{}, 'row_sepH':{},
+ 'sepH_row':{}, 'sepH_sepH':{},
+ 'sepH':{}, 'row':{}}
+ , dLearnerConfig = {
+ 'C' : .1 if C is None else C
+ , 'njobs' : 4 if njobs is None else njobs
+ , 'inference_cache' : 50 if inference_cache is None else inference_cache
+ #, 'tol' : .1
+ , 'tol' : .05 if tol is None else tol
+ , 'save_every' : 50 #save every 50 iterations,for warm start
+ , 'max_iter' : 10 if max_iter is None else max_iter
+ }
+ , sComment=sComment
+ #,cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText
+ , cFeatureDefinition= My_FeatureDefinition_v3_txt if self.bTxt else My_FeatureDefinition_v3
+ )
+
+
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+ main_command_line(DU_ABPTableSkewedRowCutLine)
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO_line_weighted.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO_line_weighted.py
index 1b38f84..cfc4239 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO_line_weighted.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO_line_weighted.py
@@ -14,18 +14,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO_weighted.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO_weighted.py
index e538721..ca1b4af 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO_weighted.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBIO_sepSIO_weighted.py
@@ -12,18 +12,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -62,13 +51,13 @@ class NodeType_BIESO_to_BIO_Shape(NodeType_PageXml_type_woText):
Convert BIESO labeling to BIO
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
-
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
sXmlLabel = {'B':'B',
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBISO_sepSIO_line.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBISO_sepSIO_line.py
index 885d7dc..b7d2511 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBISO_sepSIO_line.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBISO_sepSIO_line.py
@@ -14,18 +14,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -60,12 +49,13 @@ class NodeType_BIESO_to_BISO_Shape(NodeType_PageXml_type_woText):
Convert BIESO labeling to BIO
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
@@ -116,12 +106,12 @@ class GraphSkewedCut_H_lines(GraphSkewedCut_H):
shaper_fun = ShapeLoader.node_to_SingleLine
- def addEdgeToDOM(self):
+ def addEdgeToDoc(self):
"""
To display the grpah conveniently we add new Edge elements
Since we change the BAseline representation, we show the new one
"""
- super().addEdgeToDOM()
+ super().addEdgeToDoc()
for blk in self.lNode:
assert blk.type.name in ["row", "sepH"], blk.type.name
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBISO_sepSIO_line_hack.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBISO_sepSIO_line_hack.py
index e683e04..7148305 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBISO_sepSIO_line_hack.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtBISO_sepSIO_line_hack.py
@@ -14,18 +14,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -87,11 +76,12 @@ class NodeType_BISO_Shape(NodeType_PageXml_type_woText):
"""
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
+ domnode = graph_node.node
sLabel = self.sDefaultLabel
sXmlLabel = domnode.get(self.sLabelAttr)
@@ -142,12 +132,12 @@ class GraphSkewedCut_H_lines(GraphSkewedCut_H):
shaper_fun = ShapeLoader.node_to_SingleLine
- def addEdgeToDOM(self):
+ def addEdgeToDoc(self):
"""
To display the grpah conveniently we add new Edge elements
Since we change the BAseline representation, we show the new one
"""
- super().addEdgeToDOM()
+ super().addEdgeToDoc()
for blk in self.lNode:
assert blk.type.name in ["row", "sepH"], blk.type.name
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtEIO_sepSIO.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtEIO_sepSIO.py
index e252c9d..f253fe5 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtEIO_sepSIO.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtEIO_sepSIO.py
@@ -12,18 +12,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -60,12 +49,13 @@ class NodeType_BIESO_to_EIO_Shape(NodeType_PageXml_type_woText):
Convert BIESO labeling to EIO
"""
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
sLabel = self.sDefaultLabel
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtTOMBS_sepSIO_line.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtTOMBS_sepSIO_line.py
index 0c18258..80cf9b7 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtTOMBS_sepSIO_line.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtTOMBS_sepSIO_line.py
@@ -10,18 +10,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -112,12 +101,12 @@ def showClassParam(cls):
traceln(" - iCutCloseDistanceTop : " , cls.iCutCloseDistanceTop)
traceln(" - iCutCloseDistanceBot : " , cls.iCutCloseDistanceBot)
- def addEdgeToDOM(self):
+ def addEdgeToDoc(self):
"""
To display the grpah conveniently we add new Edge elements
Since we change the BAseline representation, we show the new one
"""
- super().addEdgeToDOM()
+ super().addEdgeToDoc()
for blk in self.lNode:
assert blk.type.name in ["row", "sepH"], blk.type.name
@@ -132,13 +121,13 @@ def addEdgeToDOM(self):
"""
To compute TOMBS labels, it is better to use the built graph...
"""
- def parseDomLabels(self):
+ def parseDocLabels(self):
"""
Parse the label of the graph from the dataset, and set the node label
return the set of observed class (set of integers in N+)
"""
# WE expect I or O for text blocks!!
- setSeensLabels = super().parseDomLabels()
+ setSeensLabels = super().parseDocLabels()
# now look at edges to compute T M B S
# REMEMBER, we did: edge.len = dist / self.iBlockVisibility
@@ -193,11 +182,12 @@ class NodeType_BIESO_to_TOMBS_Shape(NodeType_BIESO_to_BIO_Shape):
'O':'O',
'CH':'CH'}
- def parseDomNodeLabel(self, domnode, defaultCls=None):
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
"""
Parse and set the graph node label and return its class index
raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
"""
+ domnode = graph_node.node
sXmlLabel = domnode.get(self.sLabelAttr)
# in case we also deal with column headers
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtTOMBS_sepSIO_line_hack.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtTOMBS_sepSIO_line_hack.py
index 5400c20..faf80f4 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtTOMBS_sepSIO_line_hack.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTableSkewed_txtTOMBS_sepSIO_line_hack.py
@@ -10,18 +10,7 @@
Copyright Naver Labs Europe(C) 2018 JL Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTable_Quantile_NoEF.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTable_Quantile_NoEF.py
index 74c1bff..d9512bb 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTable_Quantile_NoEF.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTable_Quantile_NoEF.py
@@ -5,18 +5,7 @@
Copyright Xerox(C) 2017 H. Déjean
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_ABPTable_Quantile_NoNF.py b/TranskribusDU/tasks/TablePrototypes/DU_ABPTable_Quantile_NoNF.py
index 014828e..fd28ea0 100644
--- a/TranskribusDU/tasks/TablePrototypes/DU_ABPTable_Quantile_NoNF.py
+++ b/TranskribusDU/tasks/TablePrototypes/DU_ABPTable_Quantile_NoNF.py
@@ -5,18 +5,7 @@
Copyright Xerox(C) 2017 H. Déjean
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_Table_BIO.py b/TranskribusDU/tasks/TablePrototypes/DU_Table_BIO.py
new file mode 100644
index 0000000..b211bf1
--- /dev/null
+++ b/TranskribusDU/tasks/TablePrototypes/DU_Table_BIO.py
@@ -0,0 +1,159 @@
+# -*- coding: utf-8 -*-
+
+"""
+ DU task for segmenting text in rows using a BIO scheme
+
+ Example of code after April SW re-engineering by JLM
+
+ Copyright NAVER(C) 2019 Jean-Luc Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+import sys, os
+import lxml.etree as etree
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+TranskribusDU_version
+
+from common.trace import traceln
+from graph.Graph_Multi_SinglePageXml import Graph_MultiSinglePageXml
+from graph.NodeType_PageXml import NodeType_PageXml_type_woText
+from graph.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText
+from tasks.DU_Task_Factory import DU_Task_Factory
+
+
+
+# to convert from BIESO to BIO we create our own NodeType by inheritance
+# class NodeType_BIESO_to_BIO_Shape(NodeType_PageXml_type_woText):
+class NodeType_PageXml_type_woText_BIESO_to_BIO(NodeType_PageXml_type_woText):
+ """
+ Convert BIESO labeling to BIO
+ """
+
+ def parseDocNodeLabel(self, graph_node, defaultCls=None):
+ """
+ Parse and set the graph node label and return its class index
+ raise a ValueError if the label is missing while bOther was not True, or if the label is neither a valid one nor an ignored one
+ """
+ sLabel = self.sDefaultLabel
+ domnode = graph_node.node
+
+ sXmlLabel = domnode.get(self.sLabelAttr)
+
+ sXmlLabel = {'B':'B',
+ 'I':'I',
+ 'E':'I',
+ 'S':'B',
+ 'O':'O'}[sXmlLabel]
+ try:
+ sLabel = self.dXmlLabel2Label[sXmlLabel]
+ except KeyError:
+ #not a label of interest
+ try:
+ self.checkIsIgnored(sXmlLabel)
+ #if self.lsXmlIgnoredLabel and sXmlLabel not in self.lsXmlIgnoredLabel:
+ except:
+ raise ValueError("Invalid label '%s'"
+ " (from @%s or @%s) in node %s"%(sXmlLabel,
+ self.sLabelAttr,
+ self.sDefaultLabel,
+ etree.tostring(domnode)))
+
+ return sLabel
+
+
+def getConfiguredGraphClass(doer):
+ """
+ In this function, we return a configured graph.Graph subclass
+
+ doer is a tasks.DU_task object created by tasks.DU_Task_Factory
+ """
+ #DEFINING THE CLASS OF GRAPH WE USE
+ DU_GRAPH = Graph_MultiSinglePageXml
+
+ lLabels = ['B', 'I', 'O']
+
+ lIgnoredLabels = []
+
+ """
+ if you play with a toy collection, which does not have all expected classes, you can reduce those.
+ """
+
+ lActuallySeen = None
+ if lActuallySeen:
+ print( "REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
+ lIgnoredLabels = [lLabels[i] for i in range(len(lLabels)) if i not in lActuallySeen]
+ lLabels = [lLabels[i] for i in lActuallySeen ]
+ print( len(lLabels) , lLabels)
+ print( len(lIgnoredLabels) , lIgnoredLabels)
+
+ nt = NodeType_PageXml_type_woText_BIESO_to_BIO(
+ "abp" #some short prefix because labels below are prefixed with it
+ , lLabels
+ , lIgnoredLabels
+ , False #no label means OTHER
+ , BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v/3)) #we reduce overlap in this way
+ )
+ nt.setLabelAttribute("DU_row")
+
+ nt.setXpathExpr( (".//pc:TextLine" #how to find the nodes
+ , "./pc:TextEquiv") #how to get their text
+ )
+
+ # ntA.setXpathExpr( (".//pc:TextLine | .//pc:TextRegion" #how to find the nodes
+ # , "./pc:TextEquiv") #how to get their text
+ # )
+ DU_GRAPH.addNodeType(nt)
+
+ return DU_GRAPH
+
+
+if __name__ == "__main__":
+ # import better_exceptions
+ # better_exceptions.MAX_LENGTH = None
+
+ # standard command line options for CRF- ECN- GAT-based methods
+ usage, parser = DU_Task_Factory.getStandardOptionsParser(sys.argv[0])
+
+ traceln("VERSION: %s" % DU_Task_Factory.getVersion())
+
+ # ---
+ #parse the command line
+ (options, args) = parser.parse_args()
+
+ try:
+ sModelDir, sModelName = args
+ except Exception as e:
+ traceln("Specify a model folder and a model name!")
+ DU_Task_Factory.exit(usage, 1, e)
+
+ doer = DU_Task_Factory.getDoer(sModelDir, sModelName
+ , options = options
+ , fun_getConfiguredGraphClass= getConfiguredGraphClass
+ , cFeatureDefinition = FeatureDefinition_PageXml_StandardOnes_noText
+ , dFeatureConfig = {}
+ )
+
+ # setting the learner configuration, in a standard way
+ # (from command line options, or from a JSON configuration file)
+ dLearnerConfig = doer.getStandardLearnerConfig(options)
+ # of course, you can put yours here instead.
+ doer.setLearnerConfiguration(dLearnerConfig)
+
+
+ # act as per specified in the command line (--trn , --fold-run, ...)
+ doer.standardDo(options)
+
+ del doer
+
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_Table_Col.py b/TranskribusDU/tasks/TablePrototypes/DU_Table_Col.py
new file mode 100644
index 0000000..8813232
--- /dev/null
+++ b/TranskribusDU/tasks/TablePrototypes/DU_Table_Col.py
@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+
+"""
+ Create column segmenters
+
+ Copyright Naver Labs Europe(C) 2018 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+
+
+
+import sys, os
+from optparse import OptionParser
+
+from lxml import etree
+
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+from tasks import _exit
+from tasks.DU_ABPTableCutAnnotator import CutAnnotator
+
+
+def main(sFilename, sOutFilename
+ , fRatio, fMinHLen
+ , fMinHorizProjection, fMinVertiProjection
+ ):
+
+ traceln("- cutting: %s --> %s"%(sFilename, sOutFilename))
+
+ #for the pretty printer to format better...
+ parser = etree.XMLParser(remove_blank_text=True)
+ doc = etree.parse(sFilename, parser)
+ root=doc.getroot()
+
+ doer = CutAnnotator()
+
+ # # Some grid line will be O or I simply because they are too short.
+ # fMinPageCoverage = 0.5 # minimum proportion of the page crossed by a grid line
+ # # we want to ignore col- and row- spans
+ #map the groundtruth table separators to our grid, per page (1 in tABP)
+ # ltlYlX = doer.get_separator_YX_from_DOM(root, fMinPageCoverage)
+
+ # Find cuts and map them to GT
+ #
+ doer.add_cut_to_DOM(root
+ #, ltlYlX=ltlYlX
+ , fMinHorizProjection=fMinHorizProjection
+ , fMinVertiProjection=fMinVertiProjection
+ , fRatio=fRatio
+ , fMinHLen=fMinHLen)
+
+ #l_DU_row_Y, l_DU_row_GT = doer.predict(root)
+
+ doc.write(sOutFilename, encoding='utf-8',pretty_print=True,xml_declaration=True)
+ traceln('Annotated cut separators added into %s'%sOutFilename)
+
+ del doc
+
+
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+ usage = """+| """
+ version = "v.01"
+ parser = OptionParser(usage=usage, version="0.1")
+ parser.add_option("--ratio", dest='fRatio', action="store"
+ , type=float
+ , help="Apply this ratio to the bounding box"
+ , default=CutAnnotator.fRATIO)
+ parser.add_option("--fMinHLen", dest='fMinHLen', action="store"
+ , type=float
+ , help="Do not scale horizontally a bounding box with width lower than this"
+ , default=75)
+
+ parser.add_option("--fHorizRatio", dest='fMinHorizProjection', action="store"
+ , type=float
+ , help="On the horizontal projection profile, it ignores profile lower than this ratio of the page width"
+ , default=0.05)
+ parser.add_option("--fVertRatio", dest='fMinVertiProjection', action="store"
+ , type=float
+ , help="On the vertical projection profile, it ignores profile lower than this ratio of the page height"
+ , default=0.05)
+# parser.add_option("--SIO" , dest='bSIO' , action="store_true", help="SIO labels")
+# parser.add_option("--annotate", dest='bAnnotate', action="store_true",default=False, help="Annotate the textlines with BIES labels")
+
+# parser.add_option("--detail", dest='bDetailedReport', action="store_true", default=False,help="Display detailed reporting (score per document)")
+# parser.add_option("--baseline", dest='bBaseline', action="store_true", default=False, help="report baseline method")
+# parser.add_option("--line_see_line", dest='iLineVisibility', action="store",
+# type=int, default=GraphSkewedCut.iLineVisibility,
+# help="seeline2line: how far in pixel can a line see another cut line?")
+# parser.add_option("--block_see_line", dest='iBlockVisibility', action="store",
+# type=int, default=GraphSkewedCut.iBlockVisibility,
+# help="seeblock2line: how far in pixel can a block see a cut line?")
+# parser.add_option("--height", dest="fCutHeight", default=GraphSkewedCut.fCutHeight
+# , action="store", type=float, help="Minimal height of a cut")
+# # parser.add_option("--cut-above", dest='bCutAbove', action="store_true", default=False
+# # ,help="Each object defines one or several cuts above it (instead of below as by default)")
+# parser.add_option("--angle", dest='lsAngle'
+# , action="store", type="string", default="-1,0,+1"
+# ,help="Allowed cutting angles, in degree, comma-separated")
+#
+# parser.add_option("--graph", dest='bGraph', action="store_true", help="Store the graph in the XML for displaying it")
+# parser.add_option("--bioh", "--BIOH", dest='bBIOH', action="store_true", help="Text are categorised along BIOH instead of BIO")
+
+ # ---
+ #parse the command line
+ (options, args) = parser.parse_args()
+
+ traceln(options)
+
+ if len(args) == 2 and os.path.isdir(args[0]) and os.path.isdir(args[1]):
+ # ok, let's work differently...
+ sFromDir,sToDir = args
+ for s in os.listdir(sFromDir):
+ if not s.lower().endswith("pxml"): pass
+ sFilename = sFromDir + "/" + s
+ sp, sf = os.path.split(s)
+ sOutFilename = sToDir + "/" + "cut-" + sf
+ traceln(sFilename," --> ", sOutFilename)
+ main(sFilename, sOutFilename
+ , options.fRatio, fMinHLen=options.fMinHLen
+ , fMinHorizProjection=options.fMinHorizProjection
+ , fMinVertiProjection=options.fMinVertiProjection
+ )
+ else:
+ for sFilename in args:
+ sp, sf = os.path.split(sFilename)
+ sOutFilename = os.path.join(sp, "cut-" + sf)
+ traceln(sFilename," --> ", sOutFilename)
+ main(sFilename, sOutFilename
+ , options.fRatio, fMinHLen=options.fMinHLen
+ , fMinHorizProjection=options.fMinHorizProjection
+ , fMinVertiProjection=options.fMinVertiProjection
+ )
+
diff --git a/TranskribusDU/tasks/TablePrototypes/DU_Table_Row.py b/TranskribusDU/tasks/TablePrototypes/DU_Table_Row.py
new file mode 100644
index 0000000..c69734c
--- /dev/null
+++ b/TranskribusDU/tasks/TablePrototypes/DU_Table_Row.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+"""
+ *** Same as its parent apart that text baselines are reflected as a LineString (instead of its centroid)
+
+ DU task for ABP Table:
+ doing jointly row BIO and near horizontal cuts SIO
+
+ block2line edges do not cross another block.
+
+ The cut are based on baselines of text blocks, with some positive or negative inclination.
+
+ - the labels of cuts are SIO
+
+ Copyright Naver Labs Europe(C) 2018 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+
+
+
+import sys, os
+
+import math
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+TranskribusDU_version
+
+from common.trace import traceln
+from tasks import _exit
+from tasks.DU_CRF_Task import DU_CRF_Task
+from tasks.DU_Table.DU_ABPTableSkewed import GraphSkewedCut, main
+from tasks.DU_Table.DU_ABPTableSkewed_CutAnnotator import SkewedCutAnnotator
+from tasks.DU_Table.DU_ABPTableSkewed_txtBIO_sepSIO_line import DU_ABPTableSkewedRowCutLine
+from tasks.DU_Table.DU_ABPTableSkewed_txtBIOH_sepSIO_line import DU_ABPTableSkewedRowCutLine_BIOH
+
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+
+ version = "v.01"
+ usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(sys.argv[0], version)
+# parser.add_option("--annotate", dest='bAnnotate', action="store_true",default=False, help="Annotate the textlines with BIES labels")
+
+ #FOR GCN
+ # parser.add_option("--revertEdges", dest='bRevertEdges', action="store_true", help="Revert the direction of the edges")
+ parser.add_option("--detail", dest='bDetailedReport', action="store_true", default=False,help="Display detailed reporting (score per document)")
+ parser.add_option("--baseline", dest='bBaseline', action="store_true", default=False, help="report baseline method")
+ parser.add_option("--line_see_line", dest='iLineVisibility', action="store",
+ type=int, default=GraphSkewedCut.iLineVisibility,
+ help="seeline2line: how far in pixel can a line see another cut line?")
+ parser.add_option("--block_see_line", dest='iBlockVisibility', action="store",
+ type=int, default=GraphSkewedCut.iBlockVisibility,
+ help="seeblock2line: how far in pixel can a block see a cut line?")
+ parser.add_option("--height", dest="fCutHeight", default=GraphSkewedCut.fCutHeight
+ , action="store", type=float, help="Minimal height of a cut")
+ # parser.add_option("--cut-above", dest='bCutAbove', action="store_true", default=False
+ # ,help="Each object defines one or several cuts above it (instead of below as by default)")
+ parser.add_option("--angle", dest='lsAngle'
+ , action="store", type="string", default="-1,0,+1"
+ ,help="Allowed cutting angles, in degree, comma-separated")
+
+ parser.add_option("--graph", dest='bGraph', action="store_true", help="Store the graph in the XML for displaying it")
+ parser.add_option("--bioh", "--BIOH", dest='bBIOH', action="store_true", help="Text are categorised along BIOH instead of BIO")
+ parser.add_option("--text", "--txt", dest='bTxt', action="store_true", help="Use textual features.")
+
+ # ---
+ #parse the command line
+ (options, args) = parser.parse_args()
+
+ options.bCutAbove = True # Forcing this!
+
+ if options.bBIOH:
+ DU_CLASS = DU_ABPTableSkewedRowCutLine_BIOH
+ else:
+ DU_CLASS = DU_ABPTableSkewedRowCutLine
+
+ if options.bGraph:
+ import os.path
+ # hack
+ DU_CLASS.bCutAbove = options.bCutAbove
+ traceln("\t%s.bCutAbove=" % DU_CLASS.__name__, DU_CLASS.bCutAbove)
+ DU_CLASS.lRadAngle = [math.radians(v) for v in [float(s) for s in options.lsAngle.split(",")]]
+ traceln("\t%s.lRadAngle=" % DU_CLASS.__name__, DU_CLASS.lRadAngle)
+ for sInputFilename in args:
+ sp, sf = os.path.split(sInputFilename)
+ sOutFilename = os.path.join(sp, "graph-" + sf)
+ doer = DU_CLASS("debug", "."
+ , iBlockVisibility=options.iBlockVisibility
+ , iLineVisibility=options.iLineVisibility
+ , fCutHeight=options.fCutHeight
+ , bCutAbove=options.bCutAbove
+ , lRadAngle=[math.radians(float(s)) for s in options.lsAngle.split(",")]
+ , bTxt=options.bTxt)
+ o = doer.cGraphClass()
+ o.parseDocFile(sInputFilename, 9)
+ o.addEdgeToDoc()
+ print('Graph edges added to %s'%sOutFilename)
+ o.doc.write(sOutFilename, encoding='utf-8',pretty_print=True,xml_declaration=True)
+ SkewedCutAnnotator.gtStatReport()
+ exit(0)
+
+ # ---
+ try:
+ sModelDir, sModelName = args
+ except Exception as e:
+ traceln("Specify a model folder and a model name!")
+ _exit(usage, 1, e)
+
+ main(DU_CLASS, sModelDir, sModelName, options)
diff --git a/TranskribusDU/tasks/case_BAR/DU_BAR.py b/TranskribusDU/tasks/case_BAR/DU_BAR.py
new file mode 100644
index 0000000..ed4c150
--- /dev/null
+++ b/TranskribusDU/tasks/case_BAR/DU_BAR.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+"""
+ DU task for BAR - see https://read02.uibk.ac.at/wiki/index.php/Document_Understanding_BAR
+
+ Copyright Xerox(C) 2017 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+import sys, os
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+from tasks import _checkFindColDir, _exit
+
+from tasks.DU_CRF_Task import DU_CRF_Task
+
+
+def main(DU_BAR):
+ version = "v.01"
+ usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(sys.argv[0], version)
+ parser.add_option("--docid", dest='docid', action="store",default=None, help="only process docid")
+ # ---
+ #parse the command line
+ (options, args) = parser.parse_args()
+
+ # ---
+ try:
+ sModelDir, sModelName = args
+ except Exception as e:
+ traceln("Specify a model folder and a model name!")
+ _exit(usage, 1, e)
+
+ doer = DU_BAR(sModelName, sModelDir,
+ C = options.crf_C,
+ tol = options.crf_tol,
+ njobs = options.crf_njobs,
+ max_iter = options.max_iter,
+ inference_cache = options.crf_inference_cache)
+
+
+ if options.docid:
+ sDocId=options.docid
+ else:
+ sDocId=None
+ if options.rm:
+ doer.rm()
+ sys.exit(0)
+
+ lTrn, lTst, lRun, lFold = [_checkFindColDir(lsDir) for lsDir in [options.lTrn, options.lTst, options.lRun, options.lFold]]
+# if options.bAnnotate:
+# doer.annotateDocument(lTrn)
+# traceln('annotation done')
+# sys.exit(0)
+
+ ## use. a_mpxml files
+ doer.sXmlFilenamePattern = doer.sLabeledXmlFilenamePattern
+
+
+ if options.iFoldInitNum or options.iFoldRunNum or options.bFoldFinish:
+ if options.iFoldInitNum:
+ """
+ initialization of a cross-validation
+ """
+ splitter, ts_trn, lFilename_trn = doer._nfold_Init(lFold, options.iFoldInitNum, bStoreOnDisk=True)
+ elif options.iFoldRunNum:
+ """
+ Run one fold
+ """
+ oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum, options.warm)
+ traceln(oReport)
+ elif options.bFoldFinish:
+ tstReport = doer._nfold_Finish()
+ traceln(tstReport)
+ else:
+ assert False, "Internal error"
+ #no more processing!!
+ exit(0)
+ #-------------------
+
+ if lFold:
+ loTstRpt = doer.nfold_Eval(lFold, 3, .25, None, options.pkl)
+ import graph.GraphModel
+ sReportPickleFilename = os.path.join(sModelDir, sModelName + "__report.txt")
+ traceln("Results are in %s"%sReportPickleFilename)
+ graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, loTstRpt)
+ elif lTrn:
+ doer.train_save_test(lTrn, lTst, options.warm, options.pkl)
+ try: traceln("Baseline best estimator: %s"%doer.bsln_mdl.best_params_) #for GridSearch
+ except: pass
+ traceln(" --- CRF Model ---")
+ traceln(doer.getModel().getModelInfo())
+ elif lTst:
+ doer.load()
+ tstReport = doer.test(lTst)
+ traceln(tstReport)
+
+ if lRun:
+ if options.storeX or options.applyY:
+ try: doer.load()
+ except: pass #we only need the transformer
+ lsOutputFilename = doer.runForExternalMLMethod(lRun, options.storeX, options.applyY)
+ else:
+ doer.load()
+ lsOutputFilename = doer.predict(lRun)
+ traceln("Done, see in:\n %s"%lsOutputFilename)
+
+if __name__ == "__main__":
+ raise Exception("This is an abstract module.")
\ No newline at end of file
diff --git a/TranskribusDU/tasks/case_BAR/DU_BAR_ConvertGTAnnotation.py b/TranskribusDU/tasks/case_BAR/DU_BAR_ConvertGTAnnotation.py
new file mode 100644
index 0000000..18c8f9f
--- /dev/null
+++ b/TranskribusDU/tasks/case_BAR/DU_BAR_ConvertGTAnnotation.py
@@ -0,0 +1,460 @@
+# -*- coding: utf-8 -*-
+
+"""
+ DU task for BAR documents - see https://read02.uibk.ac.at/wiki/index.php/Document_Understanding_BAR
+
+ Here we convert the human annotation into 2 kinds of annotations:
+    - a semantic one: header, heading, page-number, resolution-marginalia, resolution-number, resolution-paragraph (we ignore Marginalia because only 2 occurrences)
+ - a segmentation one: 2 complementary labels. We call them Heigh Ho. Could have been Yin Yang as well...
+ - also, we store the resolution number in @DU_num
+
+ These new annotations are stored in @DU_sem , @DU_sgm , @DU_num
+
+ Copyright Naver Labs(C) 2017 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+
+
+
+import sys, os, re
+
+from lxml import etree
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+
+from xml_formats.PageXml import PageXml, MultiPageXml, PageXmlException
+from crf.Graph_MultiPageXml import Graph_MultiPageXml
+from util.Polygon import Polygon
+
+
+class DU_BAR_Convert:
+ """
+ Here we convert the human annotation into 2 kinds of annotations:
+    - a semantic one: header, heading, page-number, resolution-marginalia, resolution-number, resolution-paragraph (we ignore Marginalia because only 2 occurrences)
+ - a segmentation one: 2 complementary labels. We call them Heigh Ho. Could have been Yin Yang as well...
+
+    These new annotations are stored in @DU_sem and @DU_sgm
+ """
+ sXml_HumanAnnotation_Extension = ".mpxml"
+ sXml_MachineAnnotation_Extension = ".du_mpxml"
+
+ sMetadata_Creator = "TranskribusDU/usecases/BAR/DU_ConvertGTAnnotation.py"
+ sMetadata_Comments = "Converted human annotation into semantic and segmentation annotation. See attributes @DU_sem and @DU_sgm."
+
+ dNS = {"pc":PageXml.NS_PAGE_XML}
+ sxpNode = ".//pc:TextRegion"
+
+ #Name of attributes for semantic / segmentation /resolution number
+ sSemAttr = "DU_sem"
+ sSgmAttr = "DU_sgm"
+ sNumAttr = "DU_num"
+
+ sOther = "other"
+
+ #Mapping to new semantic annotation
+ dAnnotMapping = {"header" :"header",
+ "heading" :"heading",
+ "page-number" :"page-number",
+ "marginalia" : sOther,
+ "p" :"resolution-paragraph",
+ "m" :"resolution-marginalia",
+ "" :"resolution-number",
+ None : sOther #for strange things
+ }
+ creResolutionHumanLabel = re.compile("([mp]?)([0-9]+.?)") #e.g. p1 m23 456 456a
+
+ #The two complementary segmentation labels
+ sSegmHeigh = "heigh"
+ sSegmHo = "ho"
+
+ #=== CONFIGURATION ====================================================================
+ def __init__(self):
+
+ pass
+
+
+ def convertDoc(self, sFilename):
+
+ assert sFilename.endswith(self.sXml_HumanAnnotation_Extension)
+
+ g = Graph_MultiPageXml()
+
+        doc = etree.parse(sFilename)
+
+        #the Heigh/Ho annotation runs over consecutive pages, so we keep those values across pages
+ self._initSegmentationLabel()
+ self.lSeenResoNum = list()
+
+ for pnum, page, domNdPage in g._iter_Page_DocNode(doc):
+ self._convertPageAnnotation(pnum, page, domNdPage)
+
+ MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator, self.sMetadata_Comments)
+
+ assert sFilename.endswith(self.sXml_HumanAnnotation_Extension)
+
+ sDUFilename = sFilename[:-len(self.sXml_HumanAnnotation_Extension)] + self.sXml_MachineAnnotation_Extension
+# doc.save(sDUFilename, encoding='utf-8', pretty_print=True)
+ doc.write(sDUFilename,
+ xml_declaration=True,
+ encoding="utf-8",
+ pretty_print=True
+ #compression=0, #0 to 9
+ )
+
+# doc.saveFormatFileEnc(sDUFilename, "utf-8", True) #True to indent the XML
+# doc.freeDoc()
+
+ return sDUFilename
+
+ # -----------------------------------------------------------------------------------------------------------
+
+ def _initSegmentationLabel(self):
+ self.prevResolutionNumber, self.prevSgmLbl = None, None
+
+ def _getNextSegmentationLabel(self, sPrevSegmLabel=None):
+ """
+        alternate between HEIGH and HO, 1st at random
+ """
+ if sPrevSegmLabel == self.sSegmHeigh: return self.sSegmHo
+ elif sPrevSegmLabel == self.sSegmHo: return self.sSegmHeigh
+ else:
+ assert sPrevSegmLabel == None
+ return self.sSegmHeigh
+
+ def _iter_TextRegionNodeTop2Bottom(self, domNdPage, page):
+ """
+ Get the DOM, the DOM page node, the page object
+
+ iterator on the DOM, that returns nodes
+ """
+ assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
+ lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS)
+
+ #order blocks from top to bottom of page
+ lOrderedNdBlock = list()
+ for ndBlock in lNdBlock:
+
+ lXY = PageXml.getPointList(ndBlock) #the polygon
+ if lXY == []:
+                raise ValueError("Node %s has invalid coordinates" % str(ndBlock))
+
+ plg = Polygon(lXY)
+ _, (xg, yg) = plg.getArea_and_CenterOfMass()
+
+ lOrderedNdBlock.append( (yg, ndBlock)) #we want to order from top to bottom, so that TextRegions of different resolution are not interleaved
+
+ lOrderedNdBlock.sort()
+
+ for _, ndBlock in lOrderedNdBlock: yield ndBlock
+
+ return
+
+
+ def _convertPageAnnotation(self, pnum, page, domNdPage):
+ """
+
+ """
+
+ #change: on each page we start by Heigh
+ bRestartAtEachPageWithHeigh = True
+ if bRestartAtEachPageWithHeigh: self._initSegmentationLabel()
+
+ for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):
+
+ try:
+ lbl = PageXml.getCustomAttr(nd, "structure", "type")
+ except PageXmlException:
+ nd.set(self.sSemAttr, self.sOther)
+ nd.set(self.sSgmAttr, self.sOther)
+ continue #this node has no annotation whatsoever
+
+ if lbl in ["heading", "header", "page-number", "marginalia"]:
+ semLabel = lbl
+ sgmLabel = self.sOther #those elements are not part of a resolution
+ sResoNum = None
+ else:
+ o = self.creResolutionHumanLabel.match(lbl)
+ if not o: raise ValueError("%s is not a valid human annotation" % lbl)
+ semLabel = o.group(1) #"" for the resolution number
+
+ #now decide on the segmentation label
+ sResoNum = o.group(2)
+ if not sResoNum: raise ValueError("%s is not a valid human annotation - missing resolution number" % lbl)
+
+ #now switch between heigh and ho !! :))
+ if self.prevResolutionNumber == sResoNum:
+ sgmLabel = self.prevSgmLbl
+ else:
+ sgmLabel = self._getNextSegmentationLabel(self.prevSgmLbl)
+ assert bRestartAtEachPageWithHeigh or sResoNum not in self.lSeenResoNum, "ERROR: the ordering of the block has not preserved resolution number contiguity"
+ self.lSeenResoNum.append(sResoNum)
+
+ self.prevResolutionNumber, self.prevSgmLbl = sResoNum, sgmLabel
+
+
+ #always have a semantic label
+ sNewSemLbl = self.dAnnotMapping[semLabel]
+ assert sNewSemLbl
+ nd.set(self.sSemAttr, sNewSemLbl) #DU annotation
+
+ #resolution parts also have a segmentation label and a resolution number
+ assert sgmLabel
+ nd.set(self.sSgmAttr, sgmLabel) #DU annotation
+
+ if sResoNum:
+ nd.set(self.sNumAttr, sResoNum)
+
+class DU_BAR_Convert_v2(DU_BAR_Convert):
+ """
+ For segmentation labels, we only use 'Heigh' or 'Ho' whatever the semantic label is, so that the task is purely a segmentation task.
+
+ Heading indicate the start of a resolution, and is part of it.
+ Anything else (Header page-number, marginalia) is part of the resolution.
+
+ """
+
+ def _initSegmentationLabel(self):
+ self.prevResolutionNumber = None
+ self._curSgmLbl = None
+
+ def _switchSegmentationLabel(self):
+ """
+        alternate between HEIGH and HO, 1st is Heigh
+ """
+ if self._curSgmLbl == None:
+ self._curSgmLbl = self.sSegmHeigh
+ else:
+ self._curSgmLbl = self.sSegmHeigh if self._curSgmLbl == self.sSegmHo else self.sSegmHo
+ return self._curSgmLbl
+
+ def _getCurrentSegmentationLabel(self):
+ """
+ self.curSgmLbl or Heigh if not yet set!
+ """
+ if self._curSgmLbl == None: self._curSgmLbl = self.sSegmHeigh
+ return self._curSgmLbl
+
+ def _convertPageAnnotation(self, pnum, page, domNdPage):
+ """
+
+ """
+ for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):
+
+ try:
+ sResoNum = None
+ lbl = PageXml.getCustomAttr(nd, "structure", "type")
+
+ if lbl in ["heading"]:
+ semLabel = self.dAnnotMapping[lbl]
+ #heading may indicate a new resolution!
+ if self.prevResolutionNumber == None:
+ sgmLabel = self._getCurrentSegmentationLabel() #for instance 2 consecutive headings
+ else:
+ sgmLabel = self._switchSegmentationLabel()
+ self.prevResolutionNumber = None #so that next number does not switch Heigh/Ho label
+ elif lbl in ["header", "page-number", "marginalia"]:
+ #continuation of a resolution
+ semLabel = self.dAnnotMapping[lbl]
+ sgmLabel = self._getCurrentSegmentationLabel()
+ else:
+ o = self.creResolutionHumanLabel.match(lbl)
+ if not o: raise ValueError("%s is not a valid human annotation" % lbl)
+ semLabel = self.dAnnotMapping[o.group(1)] #"" for the resolution number
+
+ #Here we have a resolution number!
+ sResoNum = o.group(2)
+ if not sResoNum: raise ValueError("%s is not a valid human annotation - missing resolution number" % lbl)
+
+ #now switch between heigh and ho !! :))
+ if self.prevResolutionNumber != None and self.prevResolutionNumber != sResoNum:
+ #we got a new number, so switching segmentation label!
+ sgmLabel = self._switchSegmentationLabel()
+ else:
+ #either same number or switching already done due to a heading
+ sgmLabel = self._getCurrentSegmentationLabel()
+
+ self.prevResolutionNumber = sResoNum
+
+ except PageXmlException:
+ semLabel = self.sOther
+ sgmLabel = self._getCurrentSegmentationLabel()
+
+ nd.set(self.sSemAttr, semLabel)
+ nd.set(self.sSgmAttr, sgmLabel)
+ if sResoNum:
+                nd.set(self.sNumAttr, sResoNum) #only when the number is part of the human annotation!
+
+
+class DU_BAR_Convert_BIES(DU_BAR_Convert):
+ """
+ For segmentation labels, we only use B I E S whatever the semantic label is, so that the task is purely a segmentation task.
+
+ Heading indicate the start of a resolution, and is part of it.
+ Anything else (Header page-number, marginalia) is part of the resolution.
+
+ """
+ B = "B"
+ I = "I"
+ E = "E"
+ S = "S"
+
+ def _initSegmentationLabel(self):
+ self._prevNd = None
+ self._prevNum = False
+ self._prevIsB = None
+ def _convertPageAnnotation(self, pnum, page, domNdPage):
+ """
+
+ """
+ for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):
+ sResoNum = None
+ bCurrentIsAStart = None
+ try:
+ lbl = PageXml.getCustomAttr(nd, "structure", "type")
+
+ if lbl == "heading":
+ semLabel = self.dAnnotMapping[lbl]
+ #heading indicate the start of a new resolution, unless the previous is already a start!
+ if self._prevIsB:
+ bCurrentIsAStart = False
+ else:
+ bCurrentIsAStart = True
+ self._prevNum = False #to prevent starting again when find the resolution number
+ elif lbl in ["header", "page-number", "marginalia"]:
+ semLabel = self.dAnnotMapping[lbl]
+ #continuation of a resolution, except at very beginning (first node)
+ if self._prevNd == None:
+ bCurrentIsAStart = True
+ else:
+ bCurrentIsAStart = False
+ else:
+ o = self.creResolutionHumanLabel.match(lbl)
+ if not o:
+
+ if False: # strict
+ raise ValueError("%s is not a valid human annotation" % lbl)
+ else:
+ # relaxed
+ print(" ** WARNING ** strange annotation on node id=%s : '%s'"%(nd.get("id"), lbl))
+ semLabel = self.dAnnotMapping[None]
+ #Here we have a resolution number!
+ sResoNum = self._prevNum
+ else:
+ semLabel = self.dAnnotMapping[o.group(1)] #"" for the resolution number
+
+ #Here we have a resolution number!
+ sResoNum = o.group(2)
+ if not sResoNum: raise ValueError("%s is not a valid human annotation - missing resolution number" % lbl)
+
+ if self._prevNum != False and self._prevNum != sResoNum:
+ #we got a new number, so switching segmentation label!
+ bCurrentIsAStart = True
+ else:
+ #either same number or switching already done due to a heading
+ bCurrentIsAStart = False
+ self._prevNum = sResoNum
+
+
+ except PageXmlException:
+ semLabel = self.sOther
+ bCurrentIsAStart = False
+
+ #Now tagging!!
+ #Semantic (easy)
+ nd.set(self.sSemAttr, semLabel)
+
+ # BIES, tough...
+ if bCurrentIsAStart:
+ if self._prevIsB:
+ #make previous a singleton!
+ if self._prevNd: self._prevNd.set(self.sSgmAttr, self.S)
+ else:
+ #make previous a End
+ if self._prevNd: self._prevNd.set(self.sSgmAttr, self.E)
+ self._prevIsB = True #for next cycle!
+ else:
+ if self._prevIsB:
+                    #confirm previous as a B
+ if self._prevNd: self._prevNd.set(self.sSgmAttr, self.B)
+ else:
+ #confirm previous as a I
+ if self._prevNd: self._prevNd.set(self.sSgmAttr, self.I)
+ self._prevIsB = False #for next cycle!
+
+            if sResoNum: nd.set(self.sNumAttr, sResoNum) #only when the number is part of the human annotation!
+ self._prevNd = nd #for next cycle!
+ # end for
+
+ if self._prevIsB:
+ #make previous a singleton!
+ if self._prevNd: self._prevNd.set(self.sSgmAttr, self.S)
+ else:
+ #make previous a End
+ if self._prevNd: self._prevNd.set(self.sSgmAttr, self.E)
+ return
+
+
+#------------------------------------------------------------------------------------------------------
+def test_RE():
+ cre = DU_BAR_Convert.creResolutionHumanLabel
+
+ o = cre.match("m103a")
+ assert o.group(1) == 'm'
+ assert o.group(2) == '103a'
+
+ o = cre.match("103a")
+ assert o.group(1) == ''
+ assert o.group(2) == '103a'
+
+ o = cre.match("103")
+ assert o.group(1) == ''
+ assert o.group(2) == '103'
+
+ o = cre.match("az103a")
+ assert o == None
+
+
+#------------------------------------------------------------------------------------------------------
+
+
+if __name__ == "__main__":
+ from optparse import OptionParser
+
+ #prepare for the parsing of the command line
+ parser = OptionParser(usage="BAR annotation conversion", version="1.0")
+
+# parser.add_option("--tst", dest='lTst', action="append", type="string"
+# , help="Test a model using the given annotated collection.")
+# parser.add_option("--fold-init", dest='iFoldInitNum', action="store", type="int"
+# , help="Initialize the file lists for parallel cross-validating a model on the given annotated collection. Indicate the number of folds.")
+# parser.add_option("--jgjhg", dest='bFoldFinish', action="store_true"
+# , help="Evaluate by cross-validation a model on the given annotated collection.")
+# parser.add_option("-w", "--warm", dest='warm', action="store_true"
+# , help="To make warm-startable model and warm-start if a model exist already.")
+
+ #parse the command line
+ (options, args) = parser.parse_args()
+
+ # ---
+ #doer = DU_BAR_Convert()
+ #doer = DU_BAR_Convert_v2()
+ doer = DU_BAR_Convert_BIES()
+ for sFilename in args:
+ print ("- Processing %s" % sFilename)
+ sOutputFilename = doer.convertDoc(sFilename)
+ print (" done --> %s" % sOutputFilename)
+
+ print ("DONE.")
+
diff --git a/TranskribusDU/tasks/case_BAR/DU_BAR_sem.py b/TranskribusDU/tasks/case_BAR/DU_BAR_sem.py
new file mode 100644
index 0000000..c471308
--- /dev/null
+++ b/TranskribusDU/tasks/case_BAR/DU_BAR_sem.py
@@ -0,0 +1,645 @@
+# -*- coding: utf-8 -*-
+
+"""
+ DU task for BAR - see https://read02.uibk.ac.at/wiki/index.php/Document_Understanding_BAR
+
+ Copyright Xerox(C) 2017 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+import sys, os
+
+import json
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+
+from crf.Graph_MultiPageXml import Graph_MultiPageXml
+from crf.Graph_Multi_SinglePageXml import Graph_MultiSinglePageXml
+from crf.NodeType_PageXml import NodeType_PageXml_type_woText, NodeType_PageXml_type
+from tasks.DU_CRF_Task import DU_CRF_Task
+from crf.FeatureDefinition_PageXml_std import FeatureDefinition_PageXml_StandardOnes
+from graph.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText
+from tasks import _checkFindColDir, _exit
+
+
+from crf.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText
+
+from gcn.DU_Model_ECN import DU_Model_GAT
+
+
+from tasks.DU_BAR import main as m
+
+class DU_BAR_sem(DU_CRF_Task):
+ """
+ We will do a typed CRF model for a DU task
+ , with the below labels
+ """
+ sLabeledXmlFilenamePattern = "*.mpxml" #"*.bar_mpxml"
+
+ bHTR = False # do we have text from an HTR?
+ bPerPage = False # do we work per document or per page?
+ bTextLine = True # if False then act as TextRegion
+
+ #=== CONFIGURATION ====================================================================
+ @classmethod
+ def getConfiguredGraphClass(cls):
+ """
+ In this class method, we must return a configured graph class
+ """
+ #DEFINING THE CLASS OF GRAPH WE USE
+ if cls.bPerPage:
+            DU_GRAPH = Graph_MultiSinglePageXml # consider each page as if independent from the others
+ else:
+ DU_GRAPH = Graph_MultiPageXml
+
+ #lLabels1 = ['heading', 'header', 'page-number', 'resolution-number', 'resolution-marginalia', 'resolution-paragraph', 'other']
+
+ lLabels1 = ['IGNORE', '577', '579', '581', '608', '32', '3431', '617', '3462', '3484', '615', '49', '3425', '73', '3', '3450', '2', '11', '70', '3451', '637', '77', '3447', '3476', '3467', '3494', '3493', '3461', '3434', '48', '3456', '35', '3482', '74', '3488', '3430', '17', '613', '625', '3427', '3498', '29', '3483', '3490', '362', '638a', '57', '616', '3492', '10', '630', '24', '3455', '3435', '8', '15', '3499', '27', '3478', '638b', '22', '3469', '3433', '3496', '624', '59', '622', '75', '640', '1', '19', '642', '16', '25', '3445', '3463', '3443', '3439', '3436', '3479', '71', '3473', '28', '39', '361', '65', '3497', '578', '72', '634', '3446', '627', '43', '62', '34', '620', '76', '23', '68', '631', '54', '3500', '3480', '37', '3440', '619', '44', '3466', '30', '3487', '45', '61', '3452', '3491', '623', '633', '53', '66', '67', '69', '643', '58', '632', '636', '7', '641', '51', '3489', '3471', '21', '36', '3468', '4', '576', '46', '63', '3457', '56', '3448', '3441', '618', '52', '3429', '3438', '610', '26', '609', '3444', '612', '3485', '3465', '41', '20', '3464', '3477', '3459', '621', '3432', '60', '3449', '626', '628', '614', '47', '3454', '38', '3428', '33', '12', '3426', '3442', '3472', '13', '639', '3470', '611', '6', '40', '14', '3486', '31', '3458', '3437', '3453', '55', '3424', '3481', '635', '64', '629', '3460', '50', '9', '18', '42', '3495', '5', '580']
+
+
+ #the converter changed to other unlabelled TextRegions or 'marginalia' TRs
+ lIgnoredLabels1 = None
+
+ """
+ if you play with a toy collection, which does not have all expected classes, you can reduce those.
+ """
+
+# lActuallySeen = None
+# if lActuallySeen:
+# print( "REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
+# lIgnoredLabels = [lLabels[i] for i in range(len(lLabels)) if i not in lActuallySeen]
+# lLabels = [lLabels[i] for i in lActuallySeen ]
+# print( len(lLabels) , lLabels)
+# print( len(lIgnoredLabels) , lIgnoredLabels)
+ if cls.bHTR:
+ ntClass = NodeType_PageXml_type
+ else:
+ #ignore text
+ ntClass = NodeType_PageXml_type_woText
+
+ nt1 = ntClass("bar" #some short prefix because labels below are prefixed with it
+ , lLabels1
+ , lIgnoredLabels1
+ , False #no label means OTHER
+ , BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v/3)) #we reduce overlap in this way
+ )
+ nt1.setLabelAttribute("DU_num")
+ if cls.bTextLine:
+ nt1.setXpathExpr( (".//pc:TextRegion/pc:TextLine[@DU_num]" #how to find the nodes
+ , "./pc:TextEquiv")
+ )
+ else:
+ nt1.setXpathExpr( (".//pc:TextRegion" #how to find the nodes
+ , "./pc:TextEquiv") #how to get their text
+ )
+ DU_GRAPH.addNodeType(nt1)
+
+ return DU_GRAPH
+
+
+ # ===============================================================================================================
+
+
+
+ # """
+ # if you play with a toy collection, which does not have all expected classes, you can reduce those.
+ # """
+ #
+ # lActuallySeen = None
+ # if lActuallySeen:
+ # print "REDUCING THE CLASSES TO THOSE SEEN IN TRAINING"
+ # lIgnoredLabels = [lLabels[i] for i in range(len(lLabels)) if i not in lActuallySeen]
+ # lLabels = [lLabels[i] for i in lActuallySeen ]
+ # print len(lLabels) , lLabels
+ # print len(lIgnoredLabels) , lIgnoredLabels
+ # nbClass = len(lLabels) + 1 #because the ignored labels will become OTHER
+
+
+
+ #=== CONFIGURATION ====================================================================
+ def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
+
+ if self.bHTR:
+ cFeatureDefinition = FeatureDefinition_PageXml_StandardOnes
+ dFeatureConfig = {
+ 'n_tfidf_node':100, 't_ngrams_node':(1,2), 'b_tfidf_node_lc':False
+ , 'n_tfidf_edge':100, 't_ngrams_edge':(1,2), 'b_tfidf_edge_lc':False }
+ else:
+ cFeatureDefinition = FeatureDefinition_PageXml_StandardOnes_noText
+ dFeatureConfig = { }
+ #'n_tfidf_node':None, 't_ngrams_node':None, 'b_tfidf_node_lc':None
+ #, 'n_tfidf_edge':None, 't_ngrams_edge':None, 'b_tfidf_edge_lc':None }
+
+ DU_CRF_Task.__init__(self
+ , sModelName, sModelDir
+ , dFeatureConfig = dFeatureConfig
+ , dLearnerConfig = {
+ 'C' : .1 if C is None else C
+ , 'njobs' : 16 if njobs is None else njobs
+ , 'inference_cache' : 50 if inference_cache is None else inference_cache
+ #, 'tol' : .1
+ , 'tol' : .05 if tol is None else tol
+ , 'save_every' : 50 #save every 50 iterations,for warm start
+ , 'max_iter' : 1000 if max_iter is None else max_iter
+ }
+ , sComment=sComment
+ , cFeatureDefinition=cFeatureDefinition
+# , cFeatureDefinition=FeatureDefinition_T_PageXml_StandardOnes_noText
+# , dFeatureConfig = {
+# #config for the extractor of nodes of each type
+# "text": None,
+# "sprtr": None,
+# #config for the extractor of edges of each type
+# "text_text": None,
+# "text_sprtr": None,
+# "sprtr_text": None,
+# "sprtr_sprtr": None
+# }
+ )
+
+ traceln("- classes: ", self.getGraphClass().getLabelNameList())
+
+ self.bsln_mdl = self.addBaseline_LogisticRegression() #use a LR model trained by GridSearch as baseline
+
+ #=== END OF CONFIGURATION =============================================================
+
+
+ def predict(self, lsColDir):#,sDocId):
+ """
+ Return the list of produced files
+ """
+# self.sXmlFilenamePattern = "*.a_mpxml"
+ return DU_CRF_Task.predict(self, lsColDir)#,sDocId)
+
+
+ def runForExternalMLMethod(self, lsColDir, storeX, applyY, bRevertEdges=False):
+ """
+ Return the list of produced files
+ """
+ self.sXmlFilenamePattern = "*.mpxml"
+ return DU_CRF_Task.runForExternalMLMethod(self, lsColDir, storeX, applyY, bRevertEdges)
+
+
+
+
+from tasks.DU_ECN_Task import DU_ECN_Task
+import gcn.DU_Model_ECN
+class DU_ABPTable_ECN(DU_ECN_Task):
+    """
+    ECN Models
+
+    DU task using an Edge Convolutional Network (ECN). Nodes are TextLines
+    (or TextRegions, see bTextLine) and the label is read from the @DU_num
+    attribute (numeric codes; see getConfiguredGraphClass).
+    """
+    bHTR = False  # do we have text from an HTR?
+    bPerPage = False  # do we work per document or per page?
+    bTextLine = True  # if False then act as TextRegion
+
+    sMetadata_Creator = "NLE Document Understanding ECN"
+    sXmlFilenamePattern = "*.mpxml"
+
+    # sLabeledXmlFilenamePattern = "*.a_mpxml"
+    sLabeledXmlFilenamePattern = "*.mpxml"
+
+
+    sLabeledXmlFilenameEXT = ".mpxml"
+
+    # default learner configuration; None means the caller must pass
+    # dLearnerConfigArg to __init__ (or the framework default applies)
+    dLearnerConfig = None
+
+    #dLearnerConfig = {'nb_iter': 50,
+    #                  'lr': 0.001,
+    #                  'num_layers': 3,
+    #                  'nconv_edge': 10,
+    #                  'stack_convolutions': True,
+    #                  'node_indim': -1,
+    #                  'mu': 0.0,
+    #                  'dropout_rate_edge': 0.0,
+    #                  'dropout_rate_edge_feat': 0.0,
+    #                  'dropout_rate_node': 0.0,
+    #                  'ratio_train_val': 0.15,
+    #                  #'activation': tf.nn.tanh, Problem I can not serialize function HERE
+    #   }
+    # === CONFIGURATION ====================================================================
+    @classmethod
+    def getConfiguredGraphClass(cls):
+        """
+        In this class method, we must return a configured graph class
+        """
+        #lLabels = ['heading', 'header', 'page-number', 'resolution-number', 'resolution-marginalia', 'resolution-paragraph', 'other']
+
+        # numeric label codes read from the @DU_num attribute; 'IGNORE' first
+        lLabels = ['IGNORE', '577', '579', '581', '608', '32', '3431', '617', '3462', '3484', '615', '49', '3425', '73', '3', '3450', '2', '11', '70', '3451', '637', '77', '3447', '3476', '3467', '3494', '3493', '3461', '3434', '48', '3456', '35', '3482', '74', '3488', '3430', '17', '613', '625', '3427', '3498', '29', '3483', '3490', '362', '638a', '57', '616', '3492', '10', '630', '24', '3455', '3435', '8', '15', '3499', '27', '3478', '638b', '22', '3469', '3433', '3496', '624', '59', '622', '75', '640', '1', '19', '642', '16', '25', '3445', '3463', '3443', '3439', '3436', '3479', '71', '3473', '28', '39', '361', '65', '3497', '578', '72', '634', '3446', '627', '43', '62', '34', '620', '76', '23', '68', '631', '54', '3500', '3480', '37', '3440', '619', '44', '3466', '30', '3487', '45', '61', '3452', '3491', '623', '633', '53', '66', '67', '69', '643', '58', '632', '636', '7', '641', '51', '3489', '3471', '21', '36', '3468', '4', '576', '46', '63', '3457', '56', '3448', '3441', '618', '52', '3429', '3438', '610', '26', '609', '3444', '612', '3485', '3465', '41', '20', '3464', '3477', '3459', '621', '3432', '60', '3449', '626', '628', '614', '47', '3454', '38', '3428', '33', '12', '3426', '3442', '3472', '13', '639', '3470', '611', '6', '40', '14', '3486', '31', '3458', '3437', '3453', '55', '3424', '3481', '635', '64', '629', '3460', '50', '9', '18', '42', '3495', '5', '580']
+
+
+        lIgnoredLabels = None
+
+        """
+        if you play with a toy collection, which does not have all expected classes, you can reduce those.
+        """
+        if cls.bPerPage:
+            DU_GRAPH = Graph_MultiSinglePageXml  # consider each age as if indep from each other
+        else:
+            DU_GRAPH = Graph_MultiPageXml
+
+
+
+        # lActuallySeen stays None here, so the reduction branch below is dead code
+        lActuallySeen = None
+        if lActuallySeen:
+            print("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
+            lIgnoredLabels = [lLabels[i] for i in range(len(lLabels)) if i not in lActuallySeen]
+            lLabels = [lLabels[i] for i in lActuallySeen]
+            print(len(lLabels), lLabels)
+            print(len(lIgnoredLabels), lIgnoredLabels)
+
+        if cls.bHTR:
+            ntClass = NodeType_PageXml_type
+        else:
+            #ignore text
+            ntClass = NodeType_PageXml_type_woText
+
+
+
+        # DEFINING THE CLASS OF GRAPH WE USE
+        nt = ntClass("bar"  # some short prefix because labels below are prefixed with it
+                     , lLabels
+                     , lIgnoredLabels
+                     , False  # no label means OTHER
+                     , BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3))
+                     # we reduce overlap in this way
+                     )
+
+
+
+        nt.setLabelAttribute("DU_num")
+        if cls.bTextLine:
+            nt.setXpathExpr( (".//pc:TextRegion/pc:TextLine[@DU_num]"        #how to find the nodes
+                      , "./pc:TextEquiv")
+                       )
+        else:
+            nt.setXpathExpr((".//pc:TextRegion"  # how to find the nodes
+                             , "./pc:TextEquiv")  # how to get their text
+                            )
+
+
+        DU_GRAPH.addNodeType(nt)
+
+        return DU_GRAPH
+
+    def __init__(self, sModelName, sModelDir, sComment=None,dLearnerConfigArg=None):
+        """
+        Configure features according to bHTR, then initialize the ECN task.
+        If dLearnerConfigArg contains an "ecn_ensemble" key, an ensemble
+        model class is used instead of the default single model.
+        """
+        # debug trace: shows whether textual (HTR) features are enabled
+        print ( self.bHTR)
+
+        if self.bHTR:
+            cFeatureDefinition = FeatureDefinition_PageXml_StandardOnes
+            dFeatureConfig = {  'bMultiPage':False, 'bMirrorPage':False
+                                , 'n_tfidf_node':300, 't_ngrams_node':(1,4), 'b_tfidf_node_lc':False
+                                , 'n_tfidf_edge':300, 't_ngrams_edge':(1,4), 'b_tfidf_edge_lc':False }
+        else:
+            cFeatureDefinition = FeatureDefinition_PageXml_StandardOnes_noText
+            dFeatureConfig = { }
+
+
+        if sComment is None: sComment  = sModelName
+
+
+        if dLearnerConfigArg is not None and "ecn_ensemble" in dLearnerConfigArg:
+            print('ECN_ENSEMBLE')
+            DU_ECN_Task.__init__(self
+                                 , sModelName, sModelDir
+                                 , dFeatureConfig=dFeatureConfig
+                                 ,
+                                 dLearnerConfig=dLearnerConfigArg if dLearnerConfigArg is not None else self.dLearnerConfig
+                                 , sComment=sComment
+                                 , cFeatureDefinition= cFeatureDefinition
+                                 , cModelClass=gcn.DU_Model_ECN.DU_Ensemble_ECN
+                                 )
+
+
+        else:
+            #Default Case Single Model
+            DU_ECN_Task.__init__(self
+                                 , sModelName, sModelDir
+                                 , dFeatureConfig=dFeatureConfig
+                                 , dLearnerConfig= dLearnerConfigArg if dLearnerConfigArg is not None else self.dLearnerConfig
+                                 , sComment= sComment
+                                 , cFeatureDefinition=cFeatureDefinition
+                                 )
+
+        #if options.bBaseline:
+        #    self.bsln_mdl = self.addBaseline_LogisticRegression()  # use a LR model trained by GridSearch as baseline
+
+    # === END OF CONFIGURATION =============================================================
+    def predict(self, lsColDir):
+        """
+        Apply the trained ECN model on the collection directories.
+        Return the list of produced files
+        """
+        self.sXmlFilenamePattern = "*.mpxml"
+        return DU_ECN_Task.predict(self, lsColDir)
+
+
+
+
+class DU_ABPTable_GAT(DU_ECN_Task):
+    """
+    ECN Models
+
+    Variant of the ABP table task using a Graph Attention Network (GAT)
+    model class; works on *.bar_mpxml files with semantic labels read from
+    the @DU_sem attribute.
+    """
+    bHTR = True  # do we have text from an HTR?
+    bPerPage = True  # do we work per document or per page?
+    bTextLine = True  # if False then act as TextRegion
+
+    sMetadata_Creator = "NLE Document Understanding GAT"
+
+
+    sXmlFilenamePattern = "*.bar_mpxml"
+
+    # sLabeledXmlFilenamePattern = "*.a_mpxml"
+    sLabeledXmlFilenamePattern = "*.bar_mpxml"
+
+    sLabeledXmlFilenameEXT = ".bar_mpxml"
+
+
+    # hyper-parameters for the original GAT architecture (kept for reference)
+    dLearnerConfigOriginalGAT ={
+        'nb_iter': 500,
+        'lr': 0.001,
+        'num_layers': 2,#2 Train Acc is lower 5 overfit both reach 81% accuracy on Fold-1
+        'nb_attention': 5,
+        'stack_convolutions': True,
+        # 'node_indim': 50   , worked well 0.82
+        'node_indim': -1,
+        'dropout_rate_node': 0.0,
+        'dropout_rate_attention': 0.0,
+        'ratio_train_val': 0.15,
+        "activation_name": 'tanh',
+        "patience": 50,
+        "mu": 0.00001,
+        "original_model" : True
+
+    }
+
+
+    # hyper-parameters for the newer GAT variant (the active default below)
+    dLearnerConfigNewGAT = {'nb_iter': 500,
+                      'lr': 0.001,
+                      'num_layers': 5,
+                      'nb_attention': 5,
+                      'stack_convolutions': True,
+                      'node_indim': -1,
+                      'dropout_rate_node': 0.0,
+                      'dropout_rate_attention' : 0.0,
+                      'ratio_train_val': 0.15,
+                      "activation_name": 'tanh',
+                      "patience":50,
+                      "original_model": False,
+                      "attn_type":0
+       }
+    dLearnerConfig = dLearnerConfigNewGAT
+    #dLearnerConfig = dLearnerConfigOriginalGAT
+    # === CONFIGURATION ====================================================================
+    @classmethod
+    def getConfiguredGraphClass(cls):
+        """
+        In this class method, we must return a configured graph class
+        """
+        lLabels = ['heading', 'header', 'page-number', 'resolution-number', 'resolution-marginalia', 'resolution-paragraph', 'other']
+
+        lIgnoredLabels = None
+
+        """
+        if you play with a toy collection, which does not have all expected classes, you can reduce those.
+        """
+
+        # lActuallySeen stays None here, so the reduction branch below is dead code
+        lActuallySeen = None
+        if lActuallySeen:
+            print("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
+            lIgnoredLabels = [lLabels[i] for i in range(len(lLabels)) if i not in lActuallySeen]
+            lLabels = [lLabels[i] for i in lActuallySeen]
+            print(len(lLabels), lLabels)
+            print(len(lIgnoredLabels), lIgnoredLabels)
+
+
+        # DEFINING THE CLASS OF GRAPH WE USE
+        if cls.bPerPage:
+            DU_GRAPH = Graph_MultiSinglePageXml  # consider each age as if indep from each other
+        else:
+            DU_GRAPH = Graph_MultiPageXml
+
+        if cls.bHTR:
+            ntClass = NodeType_PageXml_type
+        else:
+            #ignore text
+            ntClass = NodeType_PageXml_type_woText
+
+
+        nt = ntClass("bar"  # some short prefix because labels below are prefixed with it
+                     , lLabels
+                     , lIgnoredLabels
+                     , False  # no label means OTHER
+                     , BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3))
+                     # we reduce overlap in this way
+                     )
+        nt.setLabelAttribute("DU_sem")
+        if cls.bTextLine:
+            nt.setXpathExpr( (".//pc:TextRegion/pc:TextLine"        #how to find the nodes
+                      , "./pc:TextEquiv")
+                       )
+        else:
+            nt.setXpathExpr((".//pc:TextRegion"  # how to find the nodes
+                             , "./pc:TextEquiv")  # how to get their text
+                            )
+
+
+        DU_GRAPH.addNodeType(nt)
+
+        return DU_GRAPH
+
+    def __init__(self, sModelName, sModelDir, sComment=None,dLearnerConfigArg=None):
+        """
+        Configure features according to bHTR, then initialize the task with
+        the GAT model class (DU_Model_GAT).
+        """
+        if self.bHTR:
+            cFeatureDefinition = FeatureDefinition_PageXml_StandardOnes
+            dFeatureConfig = {  'bMultiPage':False, 'bMirrorPage':False
+                                , 'n_tfidf_node':500, 't_ngrams_node':(2,4), 'b_tfidf_node_lc':False
+                                , 'n_tfidf_edge':250, 't_ngrams_edge':(2,4), 'b_tfidf_edge_lc':False }
+        else:
+            cFeatureDefinition = FeatureDefinition_PageXml_StandardOnes_noText
+            dFeatureConfig = {  'bMultiPage':False, 'bMirrorPage':False
+                                , 'n_tfidf_node':None, 't_ngrams_node':None, 'b_tfidf_node_lc':None
+                                , 'n_tfidf_edge':None, 't_ngrams_edge':None, 'b_tfidf_edge_lc':None }
+
+
+        if sComment is None: sComment  = sModelName
+
+
+        DU_ECN_Task.__init__(self
+                             , sModelName, sModelDir
+                             , dFeatureConfig=dFeatureConfig
+                             , dLearnerConfig= dLearnerConfigArg if dLearnerConfigArg is not None else self.dLearnerConfig
+                             , sComment=sComment
+                             , cFeatureDefinition=cFeatureDefinition
+                             , cModelClass=DU_Model_GAT
+                             )
+
+        # NOTE(review): relies on the module-level `options` set in __main__;
+        # this constructor fails if imported and used before option parsing.
+        if options.bBaseline:
+            self.bsln_mdl = self.addBaseline_LogisticRegression()  # use a LR model trained by GridSearch as baseline
+
+    # === END OF CONFIGURATION =============================================================
+    def predict(self, lsColDir):
+        """
+        Apply the trained GAT model on the collection directories.
+        Return the list of produced files
+        """
+        self.sXmlFilenamePattern = "*.bar_mpxml"
+        return DU_ECN_Task.predict(self, lsColDir)
+
+
+
+
+# ----------------------------------------------------------------------------
+
+def main(sModelDir, sModelName, options):
+    """
+    Entry point: build the requested model (ECN, GAT, or fallback), then
+    train / cross-validate / test / run it according to the CLI options.
+    """
+    if options.use_ecn:
+        # --ecn_config appends file names; test truthiness instead of the
+        # original `is not []`, which compared identity against a fresh list
+        # literal and was therefore always True.
+        if options.ecn_json_config:
+            with open(options.ecn_json_config[0]) as f:
+                djson = json.loads(f.read())
+            if "ecn_learner_config" in djson:
+                dLearnerConfig = djson["ecn_learner_config"]
+            elif "ecn_ensemble" in djson:
+                dLearnerConfig = djson
+            else:
+                # previously this path left dLearnerConfig unbound (NameError)
+                raise ValueError("ECN config must contain 'ecn_learner_config' or 'ecn_ensemble'")
+            doer = DU_ABPTable_ECN(sModelName, sModelDir, dLearnerConfigArg=dLearnerConfig)
+        else:
+            doer = DU_ABPTable_ECN(sModelName, sModelDir)
+    elif options.use_gat:
+        if options.gat_json_config:
+            with open(options.gat_json_config[0]) as f:
+                djson = json.loads(f.read())
+            dLearnerConfig = djson["gat_learner_config"]
+            doer = DU_ABPTable_GAT(sModelName, sModelDir, dLearnerConfigArg=dLearnerConfig)
+        else:
+            doer = DU_ABPTable_GAT(sModelName, sModelDir)
+    else:
+        # NOTE(review): neither `m` nor `DU_BAR_sem` is defined in this module;
+        # this fallback raises NameError as written — confirm intended behavior.
+        doer = m(DU_BAR_sem)
+
+    if options.rm:
+        # remove the model files and stop
+        doer.rm()
+        return
+
+    lTrn, lTst, lRun, lFold = [_checkFindColDir(lsDir) for lsDir in [options.lTrn, options.lTst, options.lRun, options.lFold]]
+
+    traceln("- classes: ", doer.getGraphClass().getLabelNameList())
+
+    ## use. a_mpxml files
+    doer.sXmlFilenamePattern = doer.sLabeledXmlFilenamePattern
+
+    # cross-validation management: initialize folds, run one fold, or merge reports
+    if options.iFoldInitNum or options.iFoldRunNum or options.bFoldFinish:
+        if options.iFoldInitNum:
+            """
+            initialization of a cross-validation
+            """
+            splitter, ts_trn, lFilename_trn = doer._nfold_Init(lFold, options.iFoldInitNum, test_size=0.25, random_state=None, bStoreOnDisk=True)
+        elif options.iFoldRunNum:
+            """
+            Run one fold
+            """
+            oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum, options.warm, options.pkl)
+            traceln(oReport)
+        elif options.bFoldFinish:
+            tstReport = doer._nfold_Finish()
+            traceln(tstReport)
+        else:
+            assert False, "Internal error"
+        #no more processing!!
+        exit(0)
+    #-------------------
+
+    if lFold:
+        loTstRpt = doer.nfold_Eval(lFold, 3, .25, None, options.pkl)
+        import graph.GraphModel
+        sReportPickleFilename = os.path.join(sModelDir, sModelName + "__report.txt")
+        traceln("Results are in %s"%sReportPickleFilename)
+        graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, loTstRpt)
+    elif lTrn:
+        doer.train_save_test(lTrn, lTst, options.warm, options.pkl)
+        try:    traceln("Baseline best estimator: %s"%doer.bsln_mdl.best_params_)   #for GridSearch
+        except: pass
+        traceln(" --- CRF Model ---")
+        traceln(doer.getModel().getModelInfo())
+    elif lTst:
+        doer.load()
+        tstReport = doer.test(lTst)
+        traceln(tstReport)
+        if options.bDetailedReport:
+            traceln(tstReport.getDetailledReport())
+            import graph.GraphModel
+            for test in lTst:
+                sReportPickleFilename = os.path.join('..',test, sModelName + "__report.pkl")
+                traceln('Report dumped into %s'%sReportPickleFilename)
+                graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, tstReport)
+
+    if lRun:
+        if options.storeX or options.applyY:
+            try:    doer.load()
+            except: pass    #we only need the transformer
+            lsOutputFilename = doer.runForExternalMLMethod(lRun, options.storeX, options.applyY, options.bRevertEdges)
+        else:
+            doer.load()
+            lsOutputFilename = doer.predict(lRun)
+
+        traceln("Done, see in:\n  %s"%lsOutputFilename)
+
+
+# ----------------------------------------------------------------------------
+
+
+
+
+
+if __name__ == "__main__":
+
+    version = "v.01"
+    # base option parser (train/test/run/fold options) from the CRF task
+    usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(sys.argv[0], version)
+#    parser.add_option("--annotate", dest='bAnnotate',  action="store_true",default=False,  help="Annotate the textlines with BIES labels")
+
+    #FOR GCN
+    parser.add_option("--revertEdges", dest='bRevertEdges',  action="store_true", help="Revert the direction of the edges")
+    parser.add_option("--detail", dest='bDetailedReport',  action="store_true", default=False,help="Display detailled reporting (score per document)")
+    parser.add_option("--baseline", dest='bBaseline',  action="store_true", default=False, help="report baseline method")
+    parser.add_option("--ecn",dest='use_ecn',action="store_true", default=False, help="wether to use ECN Models")
+    parser.add_option("--ecn_config", dest='ecn_json_config',action="append", type="string", help="The Config files for the ECN Model")
+    parser.add_option("--gat", dest='use_gat', action="store_true", default=False, help="wether to use ECN Models")
+    parser.add_option("--gat_config", dest='gat_json_config', action="append", type="string",
+                      help="The Config files for the Gat Model")
+    # ---
+    #parse the command line
+    (options, args) = parser.parse_args()
+
+    # ---
+    try:
+        # exactly two positional arguments are expected
+        sModelDir, sModelName = args
+    except Exception as e:
+        traceln("Specify a model folder and a model name!")
+        _exit(usage, 1, e)
+
+    main(sModelDir, sModelName, options)
+
diff --git a/TranskribusDU/tasks/case_BAR/DU_BAR_sem_sgm.py b/TranskribusDU/tasks/case_BAR/DU_BAR_sem_sgm.py
new file mode 100644
index 0000000..55e3e0a
--- /dev/null
+++ b/TranskribusDU/tasks/case_BAR/DU_BAR_sem_sgm.py
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+
+"""
+ DU task for BAR - see https://read02.uibk.ac.at/wiki/index.php/Document_Understanding_BAR
+
+ Copyright Xerox(C) 2017 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+
+
+
+import sys, os
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+
+from crf.Graph_MultiPageXml import FactorialGraph_MultiContinuousPageXml
+from crf.NodeType_PageXml import NodeType_PageXml_type_woText
+from .DU_CRF_Task import DU_FactorialCRF_Task
+from crf.FeatureDefinition_PageXml_std_noText import FeatureDefinition_T_PageXml_StandardOnes_noText
+from crf.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText
+
+from .DU_BAR import main
+
+class DU_BAR_sem_sgm(DU_FactorialCRF_Task):
+    """
+    We will do a Factorial CRF model using the Multitype CRF
+    , with the below labels
+
+    Two node types share the same XML nodes (TextRegion): "sem" carries the
+    semantic label (@DU_sem) and "sgm" the segmentation label (@DU_sgm).
+    """
+    sLabeledXmlFilenamePattern = "*.du_mpxml"
+
+    # ===============================================================================================================
+    #DEFINING THE CLASS OF GRAPH WE USE
+    DU_GRAPH = FactorialGraph_MultiContinuousPageXml
+
+    #---------------------------------------------
+    lLabels1 = ['heading', 'header', 'page-number', 'resolution-number', 'resolution-marginalia', 'resolution-paragraph', 'other']
+
+    nt1 = NodeType_PageXml_type_woText("sem"                   #some short prefix because labels below are prefixed with it
+                          , lLabels1
+                          , None   #keep this to None, unless you know very well what you do. (FactorialCRF!)
+                          , False    #no label means OTHER
+                          , BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v/3))  #we reduce overlap in this way
+                          )
+    nt1.setLabelAttribute("DU_sem")
+    nt1.setXpathExpr( (".//pc:TextRegion"        #how to find the nodes, MUST be same as for other node type!! (FactorialCRF!)
+                      , "./pc:TextEquiv")       #how to get their text
+                   )
+    DU_GRAPH.addNodeType(nt1)
+
+    #---------------------------------------------
+    #lLabels2 = ['heigh', 'ho', 'other']
+    #lLabels2 = ['heigh', 'ho']
+    # NOTE(review): this first assignment is immediately overridden below
+    lLabels2 = ['B', 'I', 'E'] #we never see any S...   , 'S']
+    lLabels2 = ['B', 'I', 'E', 'S', 'O'] #we never see any S...   , 'S']
+
+    nt2 = NodeType_PageXml_type_woText("sgm"                   #some short prefix because labels below are prefixed with it
+                          , lLabels2
+                          , None   #keep this to None, unless you know very well what you do. (FactorialCRF!)
+                          , False    #no label means OTHER
+                          , BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v/3))  #we reduce overlap in this way
+                          )
+    nt2.setLabelAttribute("DU_sgm")
+    nt2.setXpathExpr( (".//pc:TextRegion"        #how to find the nodes, MUST be same as for other node type!! (FactorialCRF!)
+                      , "./pc:TextEquiv")       #how to get their text
+                   )
+    DU_GRAPH.addNodeType(nt2)
+
+    #=== CONFIGURATION ====================================================================
+    def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
+        """
+        Initialize the factorial CRF task; each learner parameter falls back
+        to a hard-coded default when the argument is None.
+        """
+
+#         #edge feature extractor config is a bit teddious...
+#         dFeatureConfig = { lbl:None for lbl in self.lLabels1+self.lLabels2 }
+#         for lbl1 in self.lLabels1:
+#             for lbl2 in self.lLabels2:
+#                 dFeatureConfig["%s_%s"%(lbl1, lbl2)] = None
+
+        DU_FactorialCRF_Task.__init__(self
+                     , sModelName, sModelDir
+                     , self.DU_GRAPH
+                     , dLearnerConfig = {
+                                   'C'                : .1   if C               is None else C
+                                 , 'njobs'            : 8    if njobs           is None else njobs
+                                 , 'inference_cache'  : 50   if inference_cache is None else inference_cache
+                                 #, 'tol'              : .1
+                                 , 'tol'              : .05  if tol             is None else tol
+                                 , 'save_every'       : 50     #save every 50 iterations,for warm start
+                                 , 'max_iter'         : 1000 if max_iter        is None else max_iter
+                         }
+                     , sComment=sComment
+                     , cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText
+#                     , cFeatureDefinition=FeatureDefinition_T_PageXml_StandardOnes_noText
+#                                          {
+#                               #config for the extractor of nodes of each type
+#                                 "text": None,
+#                                 "sprtr": None,
+#                               #config for the extractor of edges of each type
+#                                 "text_text": None,
+#                                 "text_sprtr": None,
+#                                 "sprtr_text": None,
+#                                 "sprtr_sprtr": None
+#                                  }
+                     )
+
+        traceln("- classes: ", self.DU_GRAPH.getLabelNameList())
+
+        self.bsln_mdl = self.addBaseline_LogisticRegression()    #use a LR model trained by GridSearch as baseline
+
+    #=== END OF CONFIGURATION =============================================================
+
+
+    def predict(self, lsColDir,sDocId):
+        """
+        Apply the trained factorial model to the collection directories.
+        Return the list of produced files
+        """
+#        self.sXmlFilenamePattern = "*.a_mpxml"
+        return DU_FactorialCRF_Task.predict(self, lsColDir,sDocId)
+
+
+if __name__ == "__main__":
+    # Delegate CLI handling (train/test/run) to the shared BAR driver.
+    main(DU_BAR_sem_sgm)
\ No newline at end of file
diff --git a/TranskribusDU/tasks/case_BAR/DU_BAR_sgm.py b/TranskribusDU/tasks/case_BAR/DU_BAR_sgm.py
new file mode 100644
index 0000000..0b05f60
--- /dev/null
+++ b/TranskribusDU/tasks/case_BAR/DU_BAR_sgm.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+
+"""
+ DU task for BAR - see https://read02.uibk.ac.at/wiki/index.php/Document_Understanding_BAR
+
+ Copyright Xerox(C) 2017 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+import sys, os
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+
+from crf.Graph_MultiPageXml import Graph_MultiContinousPageXml
+from crf.NodeType_PageXml import NodeType_PageXml_type_woText
+from DU_CRF_Task import DU_CRF_Task
+from crf.FeatureDefinition_PageXml_std_noText import FeatureDefinition_T_PageXml_StandardOnes_noText
+from crf.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText
+
+from DU_BAR import main
+
+class DU_BAR_sgm(DU_CRF_Task):
+    """
+    We will do a typed CRF model for a DU task
+    , with the below labels
+
+    Single node type "sgm": segmentation labels B/I/E read from @DU_sgm on
+    TextRegion elements.
+    """
+    sLabeledXmlFilenamePattern = "*.du_mpxml"
+
+    # ===============================================================================================================
+    #DEFINING THE CLASS OF GRAPH WE USE
+    DU_GRAPH = Graph_MultiContinousPageXml
+
+
+    #lLabels2 = ['heigh', 'ho', 'other']
+    #lLabels2 = ['heigh', 'ho']
+    lLabels2 = ['B', 'I', 'E'] #we never see any S...   , 'S']
+
+    # Some TextRegion have no segmentation label at all, and were labelled'other' by the converter
+    lIgnoredLabels2 = None
+
+    # """
+    # if you play with a toy collection, which does not have all expected classes, you can reduce those.
+    # """
+    #
+    # lActuallySeen = None
+    # if lActuallySeen:
+    #     print "REDUCING THE CLASSES TO THOSE SEEN IN TRAINING"
+    #     lIgnoredLabels = [lLabels[i] for i in range(len(lLabels)) if i not in lActuallySeen]
+    #     lLabels        = [lLabels[i] for i in lActuallySeen ]
+    #     print len(lLabels)          , lLabels
+    #     print len(lIgnoredLabels)   , lIgnoredLabels
+    #     nbClass = len(lLabels) + 1  #because the ignored labels will become OTHER
+
+    nt2 = NodeType_PageXml_type_woText("sgm"                   #some short prefix because labels below are prefixed with it
+                          , lLabels2
+                          , lIgnoredLabels2
+                          , False    #no label means OTHER
+                          , BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v/3))  #we reduce overlap in this way
+                          )
+    nt2.setLabelAttribute("DU_sgm")
+    nt2.setXpathExpr( (".//pc:TextRegion"        #how to find the nodes
+                      , "./pc:TextEquiv")       #how to get their text
+                   )
+    DU_GRAPH.addNodeType(nt2)
+
+    #=== CONFIGURATION ====================================================================
+    def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
+        """
+        Initialize the CRF task; each learner parameter falls back to a
+        hard-coded default when the argument is None.
+        """
+
+        DU_CRF_Task.__init__(self
+                     , sModelName, sModelDir
+                     , self.DU_GRAPH
+                     , dLearnerConfig = {
+                                   'C'                : .1   if C               is None else C
+                                 , 'njobs'            : 8    if njobs           is None else njobs
+                                 , 'inference_cache'  : 50   if inference_cache is None else inference_cache
+                                 #, 'tol'              : .1
+                                 , 'tol'              : .05  if tol             is None else tol
+                                 , 'save_every'       : 50     #save every 50 iterations,for warm start
+                                 , 'max_iter'         : 1000 if max_iter        is None else max_iter
+                         }
+                     , sComment=sComment
+                     , cFeatureDefinition=FeatureDefinition_PageXml_StandardOnes_noText
+#                     , cFeatureDefinition=FeatureDefinition_T_PageXml_StandardOnes_noText
+#                     , dFeatureConfig = {
+#                               #config for the extractor of nodes of each type
+#                                 "text": None,
+#                                 "sprtr": None,
+#                               #config for the extractor of edges of each type
+#                                 "text_text": None,
+#                                 "text_sprtr": None,
+#                                 "sprtr_text": None,
+#                                 "sprtr_sprtr": None
+#                                  }
+                     )
+
+        traceln("- classes: ", self.DU_GRAPH.getLabelNameList())
+
+        self.bsln_mdl = self.addBaseline_LogisticRegression()    #use a LR model trained by GridSearch as baseline
+
+    #=== END OF CONFIGURATION =============================================================
+
+
+    def predict(self, lsColDir,sDocId):
+        """
+        Apply the trained CRF model to the collection directories.
+        Return the list of produced files
+        """
+#        self.sXmlFilenamePattern = "*.a_mpxml"
+        return DU_CRF_Task.predict(self, lsColDir,sDocId)
+
+
+if __name__ == "__main__":
+    # Delegate CLI handling (train/test/run) to the shared BAR driver.
+    main(DU_BAR_sgm)
\ No newline at end of file
diff --git a/TranskribusDU/tasks/case_BAR/__init__.py b/TranskribusDU/tasks/case_BAR/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/TranskribusDU/tasks/case_GTBooks/DU_GTBooks.py b/TranskribusDU/tasks/case_GTBooks/DU_GTBooks.py
new file mode 100644
index 0000000..0faead3
--- /dev/null
+++ b/TranskribusDU/tasks/case_GTBooks/DU_GTBooks.py
@@ -0,0 +1,237 @@
+# -*- coding: utf-8 -*-
+
+"""
+ Example DU task for Dodge, using the logit textual feature extractor
+
+ Copyright Xerox(C) 2017 JL. Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+    from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+import sys, os
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+from tasks import _checkFindColDir, _exit
+
+from graph.Graph_MultiPageXml import Graph_MultiPageXml
+from graph.NodeType_PageXml import NodeType_PageXml_type_NestedText
+from tasks.DU_Task_Factory import DU_Task_Factory
+from tasks.DU_CRF_Task import DU_CRF_Task
+from graph.FeatureDefinition_PageXml_logit_v2 import FeatureDefinition_PageXml_LogitExtractorV2
+
+# ===============================================================================================================
+
+# Module-level label set and graph configuration for the GTBooks task.
+lLabels = ['TOC-entry'       #0
+           , 'caption'
+           , 'catch-word'
+           , 'footer'
+           , 'footnote'      #4
+           , 'footnote-continued'
+           , 'header'        #6
+           , 'heading'       #7
+           , 'marginalia'
+           , 'page-number'   #9
+           , 'paragraph'     #10
+           , 'signature-mark']
+lIgnoredLabels = None
+
+nbClass = len(lLabels)
+
+"""
+if you play with a toy collection, which does not have all expected classes, you can reduce those.
+"""
+lActuallySeen = [4, 6, 7, 9, 10]
+#lActuallySeen = [4, 6]
+"""
+  0-          TOC-entry    5940 occurences       (  2%)  (  2%)
+  1-            caption     707 occurences       (  0%)  (  0%)
+  2-         catch-word     201 occurences       (  0%)  (  0%)
+  3-             footer      11 occurences       (  0%)  (  0%)
+  4-           footnote   36942 occurences       ( 11%)  ( 11%)
+  5- footnote-continued    1890 occurences       (  1%)  (  1%)
+  6-             header   15910 occurences       (  5%)  (  5%)
+  7-            heading   18032 occurences       (  6%)  (  6%)
+  8-         marginalia    4292 occurences       (  1%)  (  1%)
+  9-        page-number   40236 occurences       ( 12%)  ( 12%)
+ 10-          paragraph  194927 occurences       ( 60%)  ( 60%)
+ 11-     signature-mark    4894 occurences       (  2%)  (  2%)
+"""
+# overridden to None just below, so the class-reduction branch is disabled
+lActuallySeen = None
+if lActuallySeen:
+    traceln("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
+    lIgnoredLabels  = [lLabels[i] for i in range(len(lLabels)) if i not in lActuallySeen]
+    lLabels         = [lLabels[i] for i in lActuallySeen ]
+    traceln(len(lLabels)          , lLabels)
+    traceln(len(lIgnoredLabels)   , lIgnoredLabels)
+    nbClass = len(lLabels) + 1  #because the ignored labels will become OTHER
+
+    #DEFINING THE CLASS OF GRAPH WE USE
+    DU_GRAPH = Graph_MultiPageXml
+    nt = NodeType_PageXml_type_NestedText("gtb"                   #some short prefix because labels below are prefixed with it
+                          , lLabels
+                          , lIgnoredLabels
+                          , True    #no label means OTHER
+                          )
+else:
+    #DEFINING THE CLASS OF GRAPH WE USE
+    DU_GRAPH = Graph_MultiPageXml
+    nt = NodeType_PageXml_type_NestedText("gtb"                   #some short prefix because labels below are prefixed with it
+                          , lLabels
+                          , lIgnoredLabels
+                          , False    #no label means OTHER
+                          )
+nt.setXpathExpr( (".//pc:TextRegion"        #how to find the nodes
+                  , "./pc:TextEquiv")       #how to get their text
+               )
+DU_GRAPH.addNodeType(nt)
+
+"""
+The constraints must be a list of tuples like ( , , , )
+where:
+- operator is one of 'XOR' 'XOROUT' 'ATMOSTONE' 'OR' 'OROUT' 'ANDOUT' 'IMPLY'
+- states is a list of unary state names, 1 per involved unary. If the states are all the same, you can pass it directly as a single string.
+- negated is a list of boolean indicated if the unary must be negated. Again, if all values are the same, pass a single boolean value instead of a list
+"""
+# page-level constraints are disabled (dead branch kept for documentation)
+if False:
+    DU_GRAPH.setPageConstraint( [    ('ATMOSTONE', nt, 'pnum'     , False)    #0 or 1 catch_word per page
+                                   , ('ATMOSTONE', nt, 'title'    , False)    #0 or 1 heading pare page
+                                 ] )
+
+# ===============================================================================================================
+
+
+class DU_GTBooks(DU_CRF_Task):
+    """
+    We will do a CRF model for a DU task
+    , working on a DS XML document at BLOCK level
+    , with the below labels
+    """
+    sXmlFilenamePattern = "*.mpxml"
+
+#     #In case you want to change the Logistic Regression gird search parameters...
+#     dGridSearch_LR_conf = {'C':[0.01, 0.1, 1.0, 10.0] }  #Grid search parameters for LR baseline method training
+#     dGridSearch_LR_n_jobs = 4                            #Grid search: number of jobs
+
+    #=== CONFIGURATION ====================================================================
+    def __init__(self, sModelName, sModelDir, sComment=None, C=None, tol=None, njobs=None, max_iter=None, inference_cache=None):
+        #NOTE: we might get a list in C tol max_iter inference_cache  (in case of gridsearch)
+
+        DU_CRF_Task.__init__(self
+                     , sModelName, sModelDir
+                     , DU_GRAPH
+                     , dFeatureConfig = {
+                         'nbClass'    : nbClass
+                         , 't_ngrams_node'   : (2,4)
+                         , 'b_node_lc' : False
+                         , 't_ngrams_edge'   : (2,4)
+                         , 'b_edge_lc' : False
+                         , 'n_jobs'      : 5         #n_jobs when fitting the internal Logit feat extractor model by grid search
+                       }
+                     , dLearnerConfig = {
+                                   'C'                : .1   if C               is None else C
+                                 , 'njobs'            : 5    if njobs           is None else njobs
+                                 , 'inference_cache'  : 50   if inference_cache is None else inference_cache
+                                 #, 'tol'              : .1
+                                 , 'tol'              : .05  if tol             is None else tol
+                                 , 'save_every'       : 50     #save every 50 iterations,for warm start
+                                 , 'max_iter'         : 1000 if max_iter        is None else max_iter
+                         }
+                     , sComment=sComment
+                     , cFeatureDefinition=FeatureDefinition_PageXml_LogitExtractorV2
+                     )
+
+        self.setNbClass(nbClass)     #so that we check if all classes are represented in the training set
+
+        self.bsln_mdl = self.addBaseline_LogisticRegression()    #use a LR model trained by GridSearch as baseline
+    #=== END OF CONFIGURATION =============================================================
+
+
+if __name__ == "__main__":
+
+    version = "v.01"
+    usage, description, parser = DU_Task_Factory.getStandardOptionsParser(sys.argv[0], version)
+
+    # ---
+    #parse the command line
+    (options, args) = parser.parse_args()
+
+    # ---
+    try:
+        # exactly two positional arguments are expected
+        sModelDir, sModelName = args
+    except Exception as e:
+        traceln("Specify a model folder and a model name!")
+        _exit(usage, 1, e)
+
+    doer = DU_GTBooks(sModelName, sModelDir,
+                      C                 = options.crf_C,
+                      tol               = options.crf_tol,
+                      njobs             = options.crf_njobs,
+                      max_iter          = options.max_iter,
+                      inference_cache   = options.crf_inference_cache)
+
+    if options.rm:
+        # remove the model files and stop
+        doer.rm()
+        sys.exit(0)
+
+    traceln("- classes: ", DU_GRAPH.getLabelNameList())
+
+    if options.best_params:
+        # reuse the best hyper-parameters found by a previous grid search
+        dBestParams = doer.getModelClass().loadBestParams(sModelDir, options.best_params)
+        doer.setLearnerConfiguration(dBestParams)
+
+    lTrn, lTst, lRun, lFold = [_checkFindColDir(lsDir) for lsDir in [options.lTrn, options.lTst, options.lRun, options.lFold]]
+
+    # cross-validation management: initialize folds, run one fold, or merge reports
+    if options.iFoldInitNum or options.iFoldRunNum or options.bFoldFinish:
+        if options.iFoldInitNum:
+            """
+            initialization of a cross-validation
+            """
+            splitter, ts_trn, lFilename_trn = doer._nfold_Init(lFold, options.iFoldInitNum, bStoreOnDisk=True)
+        elif options.iFoldRunNum:
+            """
+            Run one fold
+            """
+            oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum, options.warm)
+            traceln(oReport)
+        elif options.bFoldFinish:
+            tstReport = doer._nfold_Finish()
+            traceln(tstReport)
+        else:
+            assert False, "Internal error"
+        #no more processing!!
+        exit(0)
+    #-------------------
+
+    if lFold:
+        loTstRpt = doer.nfold_Eval(lFold, 3, .25, None)
+        import graph.GraphModel
+        sReportPickleFilename = os.path.join(sModelDir, sModelName + "__report.txt")
+        traceln("Results are in %s"%sReportPickleFilename)
+        graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, loTstRpt)
+    elif lTrn:
+        doer.train_save_test(lTrn, lTst, options.warm)
+        try:    traceln("Baseline best estimator: %s"%doer.bsln_mdl.best_params_)   #for GridSearch
+        except: pass
+        traceln(" --- CRF Model ---")
+        traceln(doer.getModel().getModelInfo())
+    elif lTst:
+        doer.load()
+        tstReport = doer.test(lTst)
+        traceln(tstReport)
+
+    if lRun:
+        doer.load()
+        lsOutputFilename = doer.predict(lRun)
+        traceln("Done, see in:\n  %s"%lsOutputFilename)
+
diff --git a/TranskribusDU/tasks/case_GTBooks/DU_GTBooks_BL.py b/TranskribusDU/tasks/case_GTBooks/DU_GTBooks_BL.py
new file mode 100644
index 0000000..62f6d9b
--- /dev/null
+++ b/TranskribusDU/tasks/case_GTBooks/DU_GTBooks_BL.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+"""
+ Example DU task for Dodge, using the logit textual feature extractor
+
+ Copyright Xerox(C) 2017 JL. Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+import sys, os
+from crf import FeatureDefinition_PageXml_GTBooks
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+from tasks import _checkFindColDir, _exit
+
+from crf.Graph_MultiPageXml import Graph_MultiPageXml
+from crf.NodeType_PageXml import NodeType_PageXml_type_NestedText
+from DU_CRF_Task import DU_CRF_Task
+from DU_BL_Task import DU_Baseline
+from crf.FeatureDefinition_PageXml_GTBooks import FeatureDefinition_GTBook
+
+# ===============================================================================================================
+
+lLabels = ['TOC-entry' #0
+ , 'caption'
+ , 'catch-word'
+ , 'footer'
+ , 'footnote' #4
+ , 'footnote-continued'
+ , 'header' #6
+ , 'heading' #7
+ , 'marginalia'
+ , 'page-number' #9
+ , 'paragraph' #10
+ , 'signature-mark']
+lIgnoredLabels = None
+
+nbClass = len(lLabels)
+
+"""
+if you play with a toy collection, which does not have all expected classes, you can reduce those.
+"""
+lActuallySeen = [4, 6, 7, 9, 10]
+#lActuallySeen = [4, 6]
+"""
+ 0- TOC-entry 5940 occurences ( 2%) ( 2%)
+ 1- caption 707 occurences ( 0%) ( 0%)
+ 2- catch-word 201 occurences ( 0%) ( 0%)
+ 3- footer 11 occurences ( 0%) ( 0%)
+ 4- footnote 36942 occurences ( 11%) ( 11%)
+ 5- footnote-continued 1890 occurences ( 1%) ( 1%)
+ 6- header 15910 occurences ( 5%) ( 5%)
+ 7- heading 18032 occurences ( 6%) ( 6%)
+ 8- marginalia 4292 occurences ( 1%) ( 1%)
+ 9- page-number 40236 occurences ( 12%) ( 12%)
+ 10- paragraph 194927 occurences ( 60%) ( 60%)
+ 11- signature-mark 4894 occurences ( 2%) ( 2%)
+"""
+lActuallySeen = None
+if lActuallySeen:
+ traceln("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
+ lIgnoredLabels = [lLabels[i] for i in range(len(lLabels)) if i not in lActuallySeen]
+ lLabels = [lLabels[i] for i in lActuallySeen ]
+ traceln(len(lLabels) , lLabels)
+ traceln(len(lIgnoredLabels) , lIgnoredLabels)
+ nbClass = len(lLabels) + 1 #because the ignored labels will become OTHER
+
+ #DEFINING THE CLASS OF GRAPH WE USE
+ DU_GRAPH = Graph_MultiPageXml
+ nt = NodeType_PageXml_type_NestedText("gtb" #some short prefix because labels below are prefixed with it
+ , lLabels
+ , lIgnoredLabels
+ , True #no label means OTHER
+ )
+else:
+ #DEFINING THE CLASS OF GRAPH WE USE
+ DU_GRAPH = Graph_MultiPageXml
+ nt = NodeType_PageXml_type_NestedText("gtb" #some short prefix because labels below are prefixed with it
+ , lLabels
+ , lIgnoredLabels
+ , False #no label means OTHER
+ )
+nt.setXpathExpr( (".//pc:TextRegion" #how to find the nodes
+ , "./pc:TextEquiv") #how to get their text
+ )
+DU_GRAPH.addNodeType(nt)
+
+"""
+The constraints must be a list of tuples like (operator, nodeType, states, negated)
+where:
+- operator is one of 'XOR' 'XOROUT' 'ATMOSTONE' 'OR' 'OROUT' 'ANDOUT' 'IMPLY'
+- states is a list of unary state names, 1 per involved unary. If the states are all the same, you can pass it directly as a single string.
+- negated is a list of boolean indicated if the unary must be negated. Again, if all values are the same, pass a single boolean value instead of a list
+"""
+if False:
+ DU_GRAPH.setPageConstraint( [ ('ATMOSTONE', nt, 'pnum' , False) #0 or 1 catch_word per page
+ , ('ATMOSTONE', nt, 'title' , False) #0 or 1 heading pare page
+ ] )
+
+# ===============================================================================================================
+
+
+class DU_BL_V1(DU_Baseline):
+ def __init__(self, sModelName, sModelDir,logitID,sComment=None):
+ DU_Baseline.__init__(self, sModelName, sModelDir,DU_GRAPH,logitID)
+
+
+
+if __name__ == "__main__":
+
+ version = "v.01"
+ usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(sys.argv[0], version)
+
+ # ---
+ #parse the command line
+ (options, args) = parser.parse_args()
+ # ---
+ try:
+ sModelDir, sModelName = args
+ except Exception as e:
+ _exit(usage, 1, e)
+
+ doer = DU_BL_V1(sModelName, sModelDir,'logit_5')
+
+ if options.rm:
+ doer.rm()
+ sys.exit(0)
+
+ traceln("- classes: ", DU_GRAPH.getLabelNameList())
+
+ if hasattr(options,'l_train_files') and hasattr(options,'l_test_files'):
+ f=open(options.l_train_files)
+ lTrn=[]
+ for l in f:
+ fname=l.rstrip()
+ lTrn.append(fname)
+ f.close()
+
+ g=open(options.l_test_files)
+ lTst=[]
+ for l in g:
+ fname=l.rstrip()
+ lTst.append(fname)
+
+ tstReport=doer.train_save_test(lTrn, lTst, options.warm,filterFilesRegexp=False)
+ traceln(tstReport)
+
+
+ else:
+
+ lTrn, lTst, lRun = [_checkFindColDir(lsDir) for lsDir in [options.lTrn, options.lTst, options.lRun]]
+
+ if lTrn:
+ doer.train_save_test(lTrn, lTst, options.warm)
+ elif lTst:
+ doer.load()
+ tstReport = doer.test(lTst)
+ traceln(tstReport)
+
+ if lRun:
+ doer.load()
+ lsOutputFilename = doer.predict(lRun)
+ traceln("Done, see in:\n %s"%lsOutputFilename)
+
diff --git a/TranskribusDU/tasks/case_GTBooks/__init__.py b/TranskribusDU/tasks/case_GTBooks/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/TranskribusDU/tasks/cluster2Region.py b/TranskribusDU/tasks/cluster2Region.py
new file mode 100644
index 0000000..fa155b1
--- /dev/null
+++ b/TranskribusDU/tasks/cluster2Region.py
@@ -0,0 +1,182 @@
+# -*- coding: utf-8 -*-
+
+"""
+Transform clusters into TextRegions and populate them with TextLines
+
+Created on August 2019
+
+Copyright NAVER LABS Europe 2019
+@author: Hervé Déjean
+"""
+
+import sys, os, glob
+from optparse import OptionParser
+from copy import deepcopy
+from collections import Counter
+from collections import defaultdict
+
+from lxml import etree
+import numpy as np
+from shapely.ops import cascaded_union
+
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+TranskribusDU_version
+
+from common.trace import traceln, trace
+from xml_formats.PageXml import PageXml
+from util.Shape import ShapeLoader
+dNS = {"pg":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
+# ----------------------------------------------------------------------------
+
+
+def getClusterCoords(lElts):
+
+ lp = []
+ for e in lElts:
+ try:
+ lp.append(ShapeLoader.node_to_Polygon(e))
+ except ValueError:
+ pass
+ contour = cascaded_union([p if p.is_valid else p.convex_hull for p in lp ])
+ # print(contour.wkt)
+ try:spoints = ' '.join("%s,%s"%(int(x[0]),int(x[1])) for x in contour.convex_hull.exterior.coords)
+ except:
+ try: spoints = ' '.join("%s,%s"%(int(x[0]),int(x[1])) for x in contour.convex_hull.coords)
+ # JL got once a: NotImplementedError: Multi-part geometries do not provide a coordinate sequence
+ except: spoints = ""
+ return spoints
+
+def deleteRegionsinDOM(page,lRegionsNd):
+ [page.remove(c) for c in lRegionsNd]
+
+def main(sInputDir
+ , bVerbose=False):
+
+ lSkippedFile = []
+
+ # filenames without the path
+ lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir) if name.endswith("_du.mpxml")]
+ traceln(" - %d .mpxml files to process" % len(lsFilename))
+ for sMPXml in lsFilename:
+ trace(" - .mpxml FILE : ", sMPXml)
+ if bVerbose: traceln()
+
+ # 0 - load input file
+ doc = etree.parse(os.path.join(sInputDir,sMPXml))
+ cluster2Region(doc, bVerbose=bVerbose)
+
+ doc.write(os.path.join(sInputDir,sMPXml),
+ xml_declaration = True,
+ encoding="utf-8",
+ pretty_print=True
+ #compression=0, #0 to 9
+ )
+
+
+def propagateTypeToRegion(ndRegion):
+ """
+ compute the most frequent type in the Textlines and assigns it to the new region
+ """
+ dType=Counter()
+ for t in ndRegion:
+ dType[t.get('type')]+=1
+ mc = dType.most_common(1)
+ if mc :
+ if mc[0][0]:ndRegion.set('type',mc[0][0])
+ # structure {type:page-number;}
+ # custom="structure {type:page-number;}"
+ if mc[0][0]:ndRegion.set('custom',"structure {type:%s;}"%mc[0][0])
+
+
+def addRegionToDom(page,ipage,lc,bVerbose):
+ """
+ create a dom node for each cluster
+ update DU_cluster for each Textline
+ """
+ for ic,dC in enumerate(lc):
+ ndRegion = PageXml.createPageXmlNode('TextRegion')
+
+ #update elements
+ lTL = lc[dC]
+ print (lTL)
+# for id in c.get('content').split():
+# elt = page.xpath('.//*[@id="%s"]'%id)[0]
+# elt.getparent().remove(elt)
+# ndRegion.append(elt)
+# lTL.append((elt))
+ ndRegion.set('id',"p%d_r%d"%(ipage,ic))
+ coords = PageXml.createPageXmlNode('Coords')
+ ndRegion.append(coords)
+ coords.set('points',getClusterCoords(lTL))
+ propagateTypeToRegion(ndRegion)
+
+ page.append(ndRegion)
+
+def getCLusters(ndPage):
+ dCluster=defaultdict(list)
+ lTL= ndPage.xpath(".//*[@DU_cluster]", namespaces=dNS)
+ for x in lTL:dCluster[x.get('DU_cluster')].append(x)
+ return dCluster
+
+def cluster2Region(doc, fTH=0.5,bVerbose=True):
+ """
+
+ """
+ root = doc.getroot()
+
+ # no use @DU_CLuster:
+ xpCluster = ".//pg:Cluster"
+ xpTextRegions = ".//pg:TextRegion"
+
+ # get pages
+ for iPage, ndPage in enumerate(PageXml.xpath(root, "//pc:Page")):
+ # get cluster
+ dClusters= getCLusters(ndPage) #ndPage.xpath(xpCluster, namespaces=dNS)
+ lRegionsNd = ndPage.xpath(xpTextRegions, namespaces=dNS)
+ if bVerbose:traceln("\n%d clusters and %d regions found" %(len(dClusters),len(lRegionsNd)))
+
+ addRegionToDom(ndPage,iPage+1,dClusters,bVerbose)
+ if bVerbose:traceln("%d regions created" %(len(dClusters)))
+ deleteRegionsinDOM(ndPage, lRegionsNd)
+
+ return doc
+
+
+
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+
+ version = "v.01"
+ sUsage="""
+Usage: %s
+
+""" % (sys.argv[0])
+
+ parser = OptionParser(usage=sUsage)
+ parser.add_option("-v", "--verbose", dest='bVerbose', action="store_true"
+ , help="Verbose mode")
+ (options, args) = parser.parse_args()
+
+ try:
+ sInputDir = args[0]
+ except IndexError:
+ sys.stderr.write(sUsage)
+ sys.exit(1)
+
+ # ... checking folders
+ if not os.path.normpath(sInputDir).endswith("col") : sInputDir = os.path.join(sInputDir, "col")
+ # all must be ok by now
+ lsDir = [sInputDir]
+ if not all(os.path.isdir(s) for s in lsDir):
+ for s in lsDir:
+ if not os.path.isdir(s): sys.stderr.write("Not a directory: %s\n"%s)
+ sys.exit(2)
+
+ main(sInputDir, bVerbose=options.bVerbose)
+
+ traceln("Done.")
\ No newline at end of file
diff --git a/TranskribusDU/tasks/compareReport.py b/TranskribusDU/tasks/compareReport.py
index 15c3d01..5073c37 100644
--- a/TranskribusDU/tasks/compareReport.py
+++ b/TranskribusDU/tasks/compareReport.py
@@ -6,18 +6,7 @@
Copyright Naber Labs Europe(C) 2018
@author H. Déjean
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tasks/do_keep_if_text.py b/TranskribusDU/tasks/do_keep_if_text.py
new file mode 100644
index 0000000..da4d229
--- /dev/null
+++ b/TranskribusDU/tasks/do_keep_if_text.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+"""
+ Keep doc with more than given ratio of non-empty (texted) TextLine
+
+ Copyright Naver Labs Europe(C) 2018 JL Meunier
+
+
+
+
+ Developed for the EU project READ. The READ project has received funding
+ from the European Union's Horizon 2020 research and innovation programme
+ under grant agreement No 674943.
+
+"""
+
+
+
+
+import sys, os
+from optparse import OptionParser
+import shutil
+
+from lxml import etree
+
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+
+from common.trace import traceln
+from xml_formats.PageXml import PageXml
+from tasks import _exit
+
+
+def isTexted(sFilename, fRatio):
+ parser = etree.XMLParser(remove_blank_text=True)
+ doc = etree.parse(sFilename, parser)
+
+ cntTxt, cnt = PageXml.countTextLineWithText(doc)
+
+ fDocRatio = float(cntTxt) / cnt if cnt else 0.0
+
+ del doc
+
+ if fDocRatio > fRatio:
+ return True
+ elif fDocRatio > 0:
+ traceln("Warning: %d texted out of %d (%.2f) %s" % (cntTxt, cnt, fDocRatio, sFilename))
+
+ return False
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+ usage = """ """
+ version = "v.01"
+ parser = OptionParser(usage=usage, version="0.1")
+ parser.add_option("--ratio", dest='fRatio', action="store"
+ , type=float
+ , help="Keep doc with more than given ratio of empty TextLine"
+ , default=0.75)
+
+ # ---
+ #parse the command line
+ (options, args) = parser.parse_args()
+
+ traceln(options)
+
+ if len(args) == 2 and os.path.isdir(args[0]) and os.path.isdir(args[1]):
+ # ok, let's work differently...
+ sFromDir,sToDir = args
+ for s in os.listdir(sFromDir):
+ if not s.endswith("pxml"): continue
+ sFilename = sFromDir + "/" + s
+ if isTexted(sFilename, options.fRatio):
+ traceln(sFilename," --> ", sToDir)
+ shutil.copy(sFilename, sToDir)
+ else:
+ traceln(" skipping: ", sFilename)
+ else:
+ for sFilename in args:
+ if isTexted(sFilename, options.fRatio):
+ traceln("texted : %s"%sFilename)
+ else:
+ traceln("no text: %s"%sFilename)
diff --git a/TranskribusDU/tasks/ecn_16Lay1Conv.json b/TranskribusDU/tasks/ecn_16Lay1Conv.json
new file mode 100644
index 0000000..4bfc40d
--- /dev/null
+++ b/TranskribusDU/tasks/ecn_16Lay1Conv.json
@@ -0,0 +1,19 @@
+{
+ "ecn_learner_config":
+ {
+ "name":"16Lay1Conv",
+ "dropout_rate_edge": 0.2,
+ "dropout_rate_edge_feat": 0.2,
+ "dropout_rate_node": 0.2,
+ "lr": 0.001,
+ "mu": 0.0001,
+ "nb_iter": 4800,
+ "nconv_edge": 1,
+ "node_indim": -1,
+ "num_layers": 16,
+ "ratio_train_val": 0.3,
+ "patience":100,
+ "activation_name":"relu",
+ "stack_convolutions": false
+ }
+}
diff --git a/TranskribusDU/tasks/ecn_1Lay1Conv.json b/TranskribusDU/tasks/ecn_1Lay1Conv.json
new file mode 100644
index 0000000..8153cec
--- /dev/null
+++ b/TranskribusDU/tasks/ecn_1Lay1Conv.json
@@ -0,0 +1,19 @@
+{
+ "ecn_learner_config":
+ {
+ "name":"1Lay1Conv",
+ "dropout_rate_edge": 0.2,
+ "dropout_rate_edge_feat": 0.2,
+ "dropout_rate_node": 0.2,
+ "lr": 0.001,
+ "mu": 0.0001,
+ "nb_iter": 1200,
+ "nconv_edge": 1,
+ "node_indim": -1,
+ "num_layers": 1,
+ "ratio_train_val": 0.2,
+ "patience":100,
+ "activation_name":"relu",
+ "stack_convolutions": false
+ }
+}
diff --git a/TranskribusDU/tasks/ecn_4Lay1Conv.json b/TranskribusDU/tasks/ecn_4Lay1Conv.json
new file mode 100644
index 0000000..ec9b351
--- /dev/null
+++ b/TranskribusDU/tasks/ecn_4Lay1Conv.json
@@ -0,0 +1,19 @@
+{
+ "ecn_learner_config":
+ {
+ "name":"4Lay1Conv",
+ "dropout_rate_edge": 0.2,
+ "dropout_rate_edge_feat": 0.2,
+ "dropout_rate_node": 0.2,
+ "lr": 0.001,
+ "mu": 0.0001,
+ "nb_iter": 500,
+ "nconv_edge": 1,
+ "node_indim": 32,
+ "num_layers": 4,
+ "ratio_train_val": 0.3,
+ "patience":10,
+ "activation_name":"relu",
+ "stack_convolutions": false
+ }
+}
diff --git a/TranskribusDU/tasks/ecn_8Lay1Conv.json b/TranskribusDU/tasks/ecn_8Lay1Conv.json
new file mode 100644
index 0000000..1ced732
--- /dev/null
+++ b/TranskribusDU/tasks/ecn_8Lay1Conv.json
@@ -0,0 +1,19 @@
+{
+ "ecn_learner_config":
+ {
+ "name":"8Lay1Conv",
+ "dropout_rate_edge": 0.2,
+ "dropout_rate_edge_feat": 0.2,
+ "dropout_rate_node": 0.2,
+ "lr": 0.001,
+ "mu": 0.0001,
+ "nb_iter": 1200,
+ "nconv_edge": 1,
+ "node_indim": -1,
+ "num_layers": 8,
+ "ratio_train_val": 0.2,
+ "patience":100,
+ "activation_name":"relu",
+ "stack_convolutions": false
+ }
+}
diff --git a/TranskribusDU/tasks/ecn_8Lay1ConvLR.json b/TranskribusDU/tasks/ecn_8Lay1ConvLR.json
new file mode 100644
index 0000000..8cb0017
--- /dev/null
+++ b/TranskribusDU/tasks/ecn_8Lay1ConvLR.json
@@ -0,0 +1,19 @@
+{
+ "ecn_learner_config":
+ {
+ "name":"8Lay1Conv",
+ "dropout_rate_edge": 0.2,
+ "dropout_rate_edge_feat": 0.2,
+ "dropout_rate_node": 0.2,
+ "lr": 0.001,
+ "mu": 0.0001,
+ "nb_iter": 800,
+ "nconv_edge": 1,
+ "node_indim": -1,
+ "num_layers": 8,
+ "ratio_train_val": 0.5,
+ "patience":100,
+ "activation_name":"relu",
+ "stack_convolutions": false
+ }
+}
diff --git a/TranskribusDU/tasks/ecn_8Lay1Conv_dropout.json b/TranskribusDU/tasks/ecn_8Lay1Conv_dropout.json
new file mode 100644
index 0000000..d6ff942
--- /dev/null
+++ b/TranskribusDU/tasks/ecn_8Lay1Conv_dropout.json
@@ -0,0 +1,19 @@
+{
+ "ecn_learner_config":
+ {
+ "name":"8Lay1Conv",
+ "dropout_rate_edge": 0.0,
+ "dropout_rate_edge_feat": 0.0,
+ "dropout_rate_node": 0.0,
+ "lr": 0.001,
+ "mu": 0.0001,
+ "nb_iter": 1200,
+ "nconv_edge": 1,
+ "node_indim": -1,
+ "num_layers": 8,
+ "ratio_train_val": 0.2,
+ "patience":100,
+ "activation_name":"relu",
+ "stack_convolutions": false
+ }
+}
diff --git a/TranskribusDU/tasks/ensemble.json b/TranskribusDU/tasks/ensemble.json
new file mode 100644
index 0000000..4add23a
--- /dev/null
+++ b/TranskribusDU/tasks/ensemble.json
@@ -0,0 +1,74 @@
+{
+ "_comment": "1 relu and 1 tanh models, twice, defined in a configuration file",
+ "ratio_train_val": 0.2,
+ "ecn_ensemble": [
+ {
+ "type": "ecn",
+ "name": "default_8Lay1Conv_A",
+ "dropout_rate_edge": 0.2,
+ "dropout_rate_edge_feat": 0.2,
+ "dropout_rate_node": 0.2,
+ "lr": 0.0001,
+ "mu": 0.0001,
+ "nb_iter": 1200,
+ "nconv_edge": 1,
+ "node_indim": 64,
+ "num_layers": 8,
+ "ratio_train_val": 0.1,
+ "patience": 50,
+ "activation_name": "relu",
+ "stack_convolutions": false
+ },
+ {
+ "type": "ecn",
+ "name": "default_8Lay1Conv_A",
+ "dropout_rate_edge": 0.2,
+ "dropout_rate_edge_feat": 0.2,
+ "dropout_rate_node": 0.2,
+ "lr": 0.0001,
+ "mu": 0.0001,
+ "nb_iter": 1200,
+ "nconv_edge": 1,
+ "node_indim": 64,
+ "num_layers": 8,
+ "ratio_train_val": 0.1,
+ "patience": 50,
+ "activation_name": "tanh",
+ "stack_convolutions": false
+ },
+ {
+ "type": "ecn",
+ "name": "default_8Lay1Conv_B",
+ "dropout_rate_edge": 0.2,
+ "dropout_rate_edge_feat": 0.2,
+ "dropout_rate_node": 0.2,
+ "lr": 0.0001,
+ "mu": 0.0001,
+ "nb_iter": 1200,
+ "nconv_edge": 1,
+ "node_indim": 64,
+ "num_layers": 8,
+ "ratio_train_val": 0.1,
+ "patience": 50,
+ "activation_name": "relu",
+ "stack_convolutions": false
+ },
+ {
+ "type": "ecn",
+ "name": "default_8Lay1Conv_B",
+ "dropout_rate_edge": 0.2,
+ "dropout_rate_edge_feat": 0.2,
+ "dropout_rate_node": 0.2,
+ "lr": 0.0001,
+ "mu": 0.0001,
+ "nb_iter": 1200,
+ "nconv_edge": 1,
+ "node_indim": 64,
+ "num_layers": 8,
+ "ratio_train_val": 0.1,
+ "patience": 50,
+ "activation_name": "tanh",
+ "stack_convolutions": false
+ }
+ ]
+}
\ No newline at end of file
diff --git a/TranskribusDU/tasks/intersect_cluster.py b/TranskribusDU/tasks/intersect_cluster.py
new file mode 100644
index 0000000..3c7f5c9
--- /dev/null
+++ b/TranskribusDU/tasks/intersect_cluster.py
@@ -0,0 +1,278 @@
+# -*- coding: utf-8 -*-
+
+"""
+We expect XML file with cluster defined by several algo.
+For each Page:
+ We intersect the cluster of one algo with cluster of the other and
+ We generate new clusters named after the algo names, e.g. (A_I_B)
+
+Overwrite the input XML files, adding new cluster definitions
+
+Created on 9/9/2019
+
+Copyright NAVER LABS Europe 2019
+
+@author: JL Meunier
+"""
+
+import sys, os
+from optparse import OptionParser
+
+from lxml import etree
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+TranskribusDU_version
+
+from common.trace import traceln, trace
+from util.Shape import ShapeLoader
+from xml_formats.PageXml import PageXml
+
+# ----------------------------------------------------------------------------
+xpCluster = ".//pg:Cluster"
+# sFMT = "(%s_∩_%s)" pb with visu
+sFMT = "(%s_I_%s)"
+sAlgoAttr = "algo"
+xpPage = ".//pg:Page"
+dNS = {"pg":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
+# ----------------------------------------------------------------------------
+
+class Cluster:
+ cnt = 0
+
+ def __init__(self, name, setID, shape=None):
+ self.name = name
+ self.setID = setID
+ self.shape = shape
+ # self.node = ... the load method can set a .node attribute pointing to the DOM node
+
+ def getSetID(self): return self.setID
+
+ def __len__(self): return len(self.setID)
+
+ @classmethod
+ def remove(cls, ndPage, sAlgo):
+ """
+ Given an algo, remove all its clusters from a page
+ """
+ i = 0
+ for nd in ndPage.xpath(xpCluster+"[@%s='%s']"%(sAlgoAttr, sAlgo)
+ , namespaces=dNS):
+ ndPage.remove(nd)
+ i += 1
+ return i
+
+ @classmethod
+ def load(cls, ndPage, sAlgo, bNode=False):
+ """
+ Given an algo, load all its cluster from the page.
+ Compute their shape, if not provided in the XML, as a minimum rotated rectangle
+ """
+ l = []
+ for nd in ndPage.xpath(xpCluster+"[@%s='%s']"%(sAlgoAttr, sAlgo)
+ , namespaces=dNS):
+ c = cls.loadClusterNode(ndPage, nd, sAlgo)
+ if not c is None:
+ if bNode: c.node = nd
+ l.append(c)
+ return l
+
+ @classmethod
+ def loadClusterNode(cls, ndPage, nd, sAlgo, bComputeShape=True):
+ """
+ Load a cluster from its XML node
+ Compute its shape, if not provided in the XML, as a minimum rotated rectangle
+ """
+ name = nd.get("name")
+ if name is None:
+ name = "%s_%d"%(sAlgo, cls.cnt)
+ cls.cnt += 1
+ nd.set("name", name)
+ setID = set(nd.get("content").split())
+ if bool(setID):
+ try:
+ shape = ShapeLoader.node_to_Polygon(nd)
+ except IndexError:
+ if bComputeShape:
+ shape = cls.computeShape(ndPage, setID)
+ else:
+ shape = None
+ return cls(name, setID, shape)
+ else:
+ return None
+
+ @classmethod
+ def store(cls, ndPage, lCluster, sAlgo):
+ """
+ Store those "algo" clusters in the page node
+ """
+ ndPage.append(etree.Comment("\nClusters created by cluster intersection\n"))
+
+ for c in lCluster:
+ ndPage.append(c.makeClusterNode(sAlgo))
+
+ def makeClusterNode(self, sAlgo):
+ """
+ Create an XML node reflecting the cluster
+ """
+ ndCluster = PageXml.createPageXmlNode('Cluster')
+ ndCluster.set("name", self.name)
+ ndCluster.set("algo", sAlgo)
+ # add the space separated list of node ids
+ ndCluster.set("content", " ".join(self.setID))
+ ndCoords = PageXml.createPageXmlNode('Coords')
+ ndCluster.append(ndCoords)
+ if self.shape is None:
+ ndCoords.set('points', "")
+ else:
+ ndCoords.set('points', ShapeLoader.getCoordsString(self.shape))
+ ndCluster.tail = "\n"
+ return ndCluster
+
+ @classmethod
+ def intersect(cls, one, other):
+ """
+ return None or a cluster made by intersecting two cluster
+ the shape of the intersection if the intersection of shapes, or None if not applicable
+ """
+ setID = one.setID.intersection(other.setID)
+ if bool(setID):
+ try:
+ shapeInter = one.shape.intersection(other.shape)
+ except ValueError:
+ shapeInter = None
+ return cls(sFMT % (one.name, other.name), setID, shapeInter)
+ else:
+ return None
+
+ @classmethod
+ def computeShape(cls, ndPage, setID, bConvexHull=False):
+ """
+ compute a shape for this cluster, as the minimum rotated rectangle of its content
+ or optionally as the convex hull
+ """
+ # let's find the nodes and compute the shape
+ lNode = [ndPage.xpath(".//*[@id='%s']"%_id, namespaces=dNS)[0] for _id in setID]
+ return ShapeLoader.convex_hull(lNode, bShapelyObject=True) \
+ if bConvexHull \
+ else ShapeLoader.minimum_rotated_rectangle(lNode, bShapelyObject=True)
+
+
+def main(sInputDir, sAlgoA, sAlgoB, bShape=False, bConvexHull=False, bVerbose=False):
+ sAlgoC = sFMT % (sAlgoA, sAlgoB)
+
+ # filenames without the path
+ lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir) if name.endswith("_du.pxml") or name.endswith("_du.mpxml")]
+ traceln(" - %d files to process, to produce clusters '%s'" % (
+ len(lsFilename)
+ , sAlgoC))
+
+ for sFilename in lsFilename:
+ sFullFilename = os.path.join(sInputDir, sFilename)
+ traceln(" - FILE : ", sFullFilename)
+ cntCluster, cntPage = 0, 0
+ doc = etree.parse(sFullFilename)
+
+ for iPage, ndPage in enumerate(doc.getroot().xpath(xpPage, namespaces=dNS)):
+ nRemoved = Cluster.remove(ndPage, sAlgoC)
+
+ lClusterA = Cluster.load(ndPage, sAlgoA)
+ lClusterB = Cluster.load(ndPage, sAlgoB)
+
+ if bVerbose:
+ trace("Page %d : (%d clusters REMOVED), %d cluster '%s' %d clusters '%s'" %(iPage+1
+ , nRemoved
+ , len(lClusterA), sAlgoA
+ , len(lClusterB), sAlgoB))
+
+ lClusterC = []
+ for A in lClusterA:
+ for B in lClusterB:
+ C = Cluster.intersect(A, B)
+ if not C is None:
+ lClusterC.append(C)
+
+ if bVerbose: traceln( " -> %d clusters" % (len(lClusterC)))
+ if bShape or bConvexHull:
+ for c in lClusterC:
+ c.shape = Cluster.computeShape(ndPage, c.setID, bConvexHull=bConvexHull)
+
+ cntCluster += len(lClusterC)
+ cntPage += 1
+
+ Cluster.store(ndPage, lClusterC, sAlgoC)
+
+ doc.write(sFullFilename,
+ xml_declaration=True,
+ encoding="utf-8",
+ pretty_print=True
+ #compression=0, #0 to 9
+ )
+
+ del doc
+ traceln(" %d clusters over %d pages" % (cntCluster, cntPage))
+
+ traceln(" done (%d files)" % len(lsFilename))
+
+
+
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+
+ version = "v.01"
+ sUsage="""
+Produce the intersection of two types of clusters, selected by their @algo attribute.
+
+Usage: %s
+
+""" % (sys.argv[0])
+
+ parser = OptionParser(usage=sUsage)
+ parser.add_option("-v", "--verbose", dest='bVerbose', action="store_true"
+ , help="Verbose mode")
+ parser.add_option("-s", "--shape", dest='bShape', action="store_true"
+ , help="Compute the shape of the intersection content as minimum rotated rectangle, instead of intersection of shapes")
+ parser.add_option("--hull", dest='bConvexHull', action="store_true"
+ , help="Compute the shape of the intersection content as convex hull, instead of intersection of shapes")
+ (options, args) = parser.parse_args()
+
+ try:
+ sInputDir, sA, sB = args
+ except ValueError:
+ sys.stderr.write(sUsage)
+ sys.exit(1)
+
+ # ... checking folders
+ if not os.path.normpath(sInputDir).endswith("col") : sInputDir = os.path.join(sInputDir, "col")
+
+ if not os.path.isdir(sInputDir):
+ sys.stderr.write("Not a directory: %s\n"%sInputDir)
+ sys.exit(2)
+
+ # ok, go!
+ traceln("Input is : ", os.path.abspath(sInputDir))
+ traceln("algo A is : ", sA)
+ traceln("algo B is : ", sB)
+ if options.bShape or options.bConvexHull:
+ traceln("Shape of intersections based on content!")
+ else:
+ traceln("Shape of intersections is the intersection of shapes!")
+
+ main(sInputDir, sA, sB, options.bShape, options.bConvexHull, options.bVerbose)
+
+ traceln("Input was : ", os.path.abspath(sInputDir))
+ traceln("algo A was : ", sA)
+ traceln("algo B was : ", sB)
+ if options.bShape or options.bConvexHull:
+ trace("Shape of intersections based on content: ")
+ if options.bConvexHull:
+ traceln(" as a convex hull")
+ else:
+ traceln(" as a minimum rotated rectangle")
+ else:
+ traceln("Shape of intersections is the intersection of shapes!")
+
+ traceln("Done.")
\ No newline at end of file
diff --git a/TranskribusDU/tasks/performCVLLA.py b/TranskribusDU/tasks/performCVLLA.py
index 58763cc..79cbd96 100644
--- a/TranskribusDU/tasks/performCVLLA.py
+++ b/TranskribusDU/tasks/performCVLLA.py
@@ -10,18 +10,7 @@
copyright Xerox 2017
READ project
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -622,7 +611,7 @@ def regularTextLines(self,doc):
#plg = Polygon(lXY)
try: line=LineString(lXY)
except ValueError: continue # LineStrings must have at least 2 coordinate tuples
- topline=translate(line,yoff=-40).simplify(10)
+ topline=translate(line,yoff=-20)
#iHeight = 20 # in pixel
#x1,y1, x2,y2 = topline.getBoundingBox()
if coord is not None:
@@ -635,7 +624,7 @@ def regularTextLines(self,doc):
coord.set('points',spoints)
else:
print (tl)
-# print tl
+# print tl
def run(self,doc):
"""
diff --git a/TranskribusDU/tasks/project_GT_by_location.py b/TranskribusDU/tasks/project_GT_by_location.py
new file mode 100644
index 0000000..335aa61
--- /dev/null
+++ b/TranskribusDU/tasks/project_GT_by_location.py
@@ -0,0 +1,475 @@
+# -*- coding: utf-8 -*-
+
+"""
+Typically for use with ABP tables, to match the GT documents with their HTRed
+ counterpart.
+
+We have:
+- an input collection obtained by downloading a Transkribus collection using
+ (PyClient) Transkribus_downloader.py
+- a GT collection containing the definition of areas in each page. (Can be
+ table cells, or menu region, or whatever)
+
+We want
+1 - to generate a new document, where the "elements of interest" (e.g TextLine)
+ of the input collection are matched against the GT areas by the location,
+ so that each element is either inserted in an area that matches or left
+ outside any area.
+2 - (optionally) to normalize the bounding area of the "element of interest"
+ This is done by making a box of predefined height from the Baseline
+
+Generate a new collection, with input documents enriched with GT areas.
+
+Any input document without GT counterpart is ignored.
+
+Created on 23 août 2019
+
+Copyright NAVER LABS Europe 2019
+@author: JL Meunier
+"""
+
+import sys, os
+from optparse import OptionParser
+from copy import deepcopy
+from collections import defaultdict
+
+from lxml import etree
+from numpy import argmax as argmax
+from shapely.affinity import translate
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+TranskribusDU_version
+
+from common.trace import traceln, trace
+from util.Shape import ShapeLoader as ShapeLoader
+
+# ----------------------------------------------------------------------------
+iNORMALIZED_HEIGHT = 43
+xpELEMENT1 = ".//pg:TextRegion"
+xpELEMENT2 = ".//pg:TextLine"
+
+xpAREA1 = ".//pg:TableRegion"
+xpAREA2 = ".//pg:TableCell"
+
+xpBASELINE = ".//pg:Baseline"
+dNS = {"pg":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
+# ----------------------------------------------------------------------------
+
+def main(sInputDir, sGTDir, sOutputDir
+ , xpElement1, xpElement2
+ , xpArea1, xpArea2
+ , bNorm, iNorm, bNormOnly
+ , bSep
+ , lsRmId
+ , bEval
+ , bWarm
+ , bVerbose=False):
+
+ lSkippedFile = []
+
+ # filenames without the path
+ lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir) if name.endswith(".mpxml") and not name.endswith("_du.mpxml")]
+ traceln(" - %d .mpxml files to process" % len(lsFilename))
+ for sMPXml in lsFilename:
+ trace(" - .mpxml FILE : ", sMPXml)
+ if bVerbose: traceln()
+
+ # -- find individual subfiles
+ sSubDir = os.path.join(sInputDir, sMPXml[:-len(".mpxml")])
+ if os.path.isdir(sSubDir):
+ traceln(" (-> ", sSubDir, ")")
+ lsPXml = [os.path.basename(name) for name in os.listdir(sSubDir) if name.endswith(".pxml")]
+ if bVerbose: traceln("\t%d files to process"%len(lsPXml))
+ else:
+ sSubDir = sInputDir
+ lsPXml = [sMPXml]
+ if bVerbose: traceln("\tprocessing the .mpxml file")
+
+ # -- find GT...
+ for sInputXml in lsPXml:
+ trace("\t", sMPXml, " -- ", sInputXml)
+
+ sGTFN = os.path.join(sGTDir, sInputXml)
+ if not os.path.isfile(sGTFN):
+ # maybe it is also a folder downloaded from Transkribus?
+ if os.path.isfile(os.path.join(sGTDir, sMPXml[:-len(".mpxml")], sInputXml)):
+ sGTFN = os.path.join(sGTDir, sMPXml[:-len(".mpxml")], sInputXml)
+ else:
+ # hummm, maybe it is a mpxml instead... :-/
+ sGTFN = sGTFN[:-len(".pxml")] + ".mpxml"
+ if not os.path.isfile(sGTFN):
+ traceln(" *** NO GT *** file skipped ")
+ lSkippedFile.append(sInputXml)
+ continue
+ # ok GT file found
+ trace(" ...")
+
+ # input Xml
+ sInFN = os.path.join(sSubDir, sInputXml)
+ sOutFN = os.path.join(sOutputDir, sInputXml)
+
+ if bWarm and os.path.exists(sOutFN):
+ # check existence and freshness
+ t_in = os.path.getmtime(sInFN)
+ t_gt = os.path.getmtime(sGTFN)
+ t_out = os.path.getmtime(sOutFN)
+ if t_out > t_in and t_out > t_gt:
+ traceln("\t\t fresh output file found on disk: %s - skipping it!"%sOutFN)
+ continue
+
+ # 0 - load input file
+ doc = etree.parse(sInFN)
+
+ # 1 - normalize input elements
+ if bNorm:
+ doc = normaliseDocElements(doc, xpElement2, iNorm)
+
+ # 2 - project GT
+ if not bNormOnly:
+ gtdoc = etree.parse(sGTFN)
+ if True:
+ doc = project_Elt_to_GT(gtdoc, doc
+ , xpElement1, xpElement2
+ , xpArea2, bSep, lsRmId, bEval)
+ else:
+ doc = project_Areas_to_Input(gtdoc, doc
+ , xpElement1, xpElement2, xpArea1, xpArea2
+ , bSep, lsRmId, bEval)
+
+ # 3 - save
+ doc.write(sOutFN,
+ xml_declaration=True,
+ encoding="utf-8",
+ pretty_print=True
+ #compression=0, #0 to 9
+ )
+
+ # done
+
+ del doc
+ traceln(" done")
+
+
+ traceln(" - %d .pxml files skipped" % len(lSkippedFile))
+
+
+# ---------------------------------------------------------------------------
+# Normalizing the box of TextElement, by translating a copy of the Baseline
+def normaliseDocElements(doc, xpElement, iNorm):
+ for ndPage in doc.getroot().xpath("//pg:Page", namespaces=dNS):
+ for ndElt in ndPage.xpath(xpElement, namespaces=dNS):
+ try:
+ normaliseElement(ndElt, iNorm)
+ except NormaliseException as e:
+ traceln(str(e))
+ traceln("Removing this element")
+ ndElt.getparent().remove(ndElt)
+
+ return doc
+
+
+class NormaliseException(Exception):
+ pass
+
+
+def normaliseElement(nd, iNorm):
+ try:
+ ndBaseline = nd.xpath(xpBASELINE, namespaces=dNS)[0]
+ except IndexError:
+ raise NormaliseException("WARNING: skipped element normalisation: no Baseline: %s" % etree.tostring(nd))
+
+ try:
+ line = ShapeLoader.node_to_LineString(ndBaseline)
+ except ValueError:
+ raise NormaliseException("WARNING: skipped element normalisation: invalid Coords: %s" % etree.tostring(nd))
+ topline = translate(line, yoff=-iNorm)
+
+ # serialise both in circular sequence
+ spoints = ' '.join("%s,%s"%(int(x[0]),int(x[1])) for x in line.coords)
+ lp=list(topline.coords)
+ lp.reverse()
+ spoints = spoints+ ' ' +' '.join("%s,%s"%(int(x[0]),int(x[1])) for x in lp)
+
+    # ad-hoc way of setting the element coordinates
+ ndCoords = nd.xpath(".//pg:Coords", namespaces=dNS)[0]
+ ndCoords.set("points",spoints)
+
+ return
+
+# ---------------------------------------------------------------------------
+# projection of the GT area onto the doc
+
+class GTProjectionException(Exception): pass
+
+def project_Elt_to_GT(gtdoc, doc
+ , xpElement1, xpElement2
+ , xpArea2
+ , bSep, lsRmId, bEval
+ , fTH=0.5):
+ """
+ Here we take the element out of the production file to put them in the GT
+ doc
+
+ WE IGNORE xpArea1 (no need for it)
+
+ We return the GT doc
+ """
+ gtroot = gtdoc.getroot()
+
+ # Evaluation
+ # we build a table of list of TextLineId from the GT to check this SW
+ # table_id -> row -> col -> list of element id
+ dTable = defaultdict(lambda : defaultdict(lambda : defaultdict(list)))
+ nOk, nTot = 0, 0
+
+ if lsRmId:
+ nbEltRemoved = 0
+ for sRmId in lsRmId:
+ # for _nd in gtroot.xpath('//pg:*[@id="%s"]'%sRmId, namespaces=dNS):
+ for _nd in gtroot.xpath('//*[@id="%s"]'%sRmId):
+ _nd.getparent().remove(_nd)
+ nbEltRemoved += 1
+ trace(" (Rm by ID: %d elements removed)" % nbEltRemoved)
+
+ # remove all elements of interest from GT
+ # inside TableRegion, we have TextLine, outside we have TextRegion
+ for ndElt in gtroot.xpath(xpElement1, namespaces=dNS):
+ if bEval:
+ for ndElt2 in ndElt.xpath(xpElement2, namespaces=dNS):
+ dTable[None][None][None].append(ndElt2.get("id"))
+ ndElt.getparent().remove(ndElt)
+ for ndElt in gtroot.xpath(xpElement2, namespaces=dNS):
+ ndCell = ndElt.getparent()
+ if bEval: dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")].append(ndElt.get("id"))
+ ndCell.remove(ndElt)
+ if bEval: traceln("\npEvaluation mode")
+
+ if bSep:
+ nbSepRemoved, nbSepAdded = 0, 0
+ for _nd in gtroot.xpath('//pg:SeparatorRegion', namespaces=dNS):
+ _nd.getparent().remove(_nd)
+ nbSepRemoved += 1
+ trace(" (Separators: %d removed" % nbSepRemoved)
+
+ # project the GT areas, page by page
+ lNdPage = doc.getroot().xpath("//pg:Page", namespaces=dNS)
+ lNdPageGT = gtroot.xpath("//pg:Page", namespaces=dNS)
+ if len(lNdPage) != len(lNdPageGT):
+ raise GTProjectionException("GT and input have different numbers of pages")
+
+ uniqID = 1
+ for ndPage, ndPageGT in zip(lNdPage, lNdPageGT):
+ lNdArea2 = ndPageGT.xpath(xpArea2, namespaces=dNS)
+ loArea2 = [ShapeLoader.node_to_Polygon(nd) for nd in lNdArea2]
+
+ for ndElt in ndPage.xpath(xpElement2, namespaces=dNS):
+ oElt = ShapeLoader.node_to_Polygon(ndElt)
+
+ lOvrl = [oElt.intersection(o).area for o in loArea2]
+ iMax = argmax(lOvrl)
+ vMax = lOvrl[iMax]
+
+ # where to add it?
+ if vMax > 0 and vMax / oElt.area > fTH:
+ # ok, this is a match
+ ndCell = lNdArea2[iMax]
+ # add it directly to the area2 (TableCell)
+ ndCell.append(deepcopy(ndElt))
+ if bEval:
+ if ndElt.get("id") in dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")]:
+ nOk += 1
+ else:
+ try: traceln('FAILED:in table: id="%s" "%s"' % (ndElt.get("id"), ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
+ except IndexError:traceln('FAILED:in table: id="%s" NOTEXT"' % (ndElt.get("id")))
+
+ else:
+ # add it outside of any area
+ bestNd = ndPageGT
+ # add it in its own TextRegion
+ ndTR = etree.Element("TextRegion")
+ ndTR.set("id", "prjct_region_%d" % uniqID)
+ uniqID += 1
+ ndTR.set("custom", "")
+ ndTR.append(deepcopy(ndElt.xpath("./pg:Coords", namespaces=dNS)[0]))
+ ndTR.append(deepcopy(ndElt))
+ bestNd.append(ndTR)
+ if bEval:
+ if ndElt.get("id") in dTable[None][None][None]:
+ nOk += 1
+ else:
+ try: traceln('FAILED:in table: id="%s" "%s"' % (ndElt.get("id"), ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
+ except IndexError:traceln('FAILED:in table: id="%s" NOTEXT"' % (ndElt.get("id")))
+
+ nTot += 1
+
+ if bSep:
+ for _nd in ndPage.xpath('//pg:SeparatorRegion', namespaces=dNS):
+ ndPageGT.append(deepcopy(_nd))
+ nbSepAdded += 1
+ if bSep: trace(", %d added.) " % nbSepAdded)
+
+ if bEval:
+ traceln("-"*40)
+ trace(" - evaluation: %d ok out of %d = %.2f%%\n" % (nOk, nTot, 100*nOk / (nTot+0.0001)))
+
+ return gtdoc
+
+
+def project_Areas_to_Input(gtdoc, doc, xpElement, xpArea1, xpArea2, bSep, lsRmId, bEval):
+ """
+    Here we extract the areas and put them in the input file
+ The element must be moved to the right areas
+ we return the doc
+ """
+ raise GTProjectionException("Not implemented")
+
+
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+
+ version = "v.01"
+ sUsage="""
+Typically for use with ABP tables, to match the GT documents with their HTRed
+ counterpart.
+We want to extract the HTRed text and , optionally, the separators from a
+ Transkribus processed collection, and inject them in a GT collection, to
+ replace the GT text, (and possibly the GT separators).
+
+We have:
+- an input collection obtained by downloading a Transkribus collection using
+ (PyClient) Transkribus_downloader.py
+- a GT collection containing the definition of nested areas in each page.
+ (Can be table cells in a table region, or whatever)
+ The nesting has 2 levels for now.
+
+In term of nesting, we assume:
+ [not CURRENTLY - xpArea1 are under Page XML element (xpArea1 is IGNORED and USELESS)
+ - xpArea2 (TableCell) are nested under xpArea1 (TableRegion)
+ - xpElement1 are under Page XML element
+ - xpElement2 (TextLine) are either under xpElement1 (TextRegion) or under xpArea2 (TableCell)
+ - SeparatorRegion are under PAGE XML element
+
+We want
+1 - to generate a new document, where the "elements of interest" (e.g TextLine)
+ of the input collection are matched against the GT areas by the location,
+ so that each element is either inserted in an area that matches or left
+ outside any area.
+2 - (optionally) to normalize the bounding area of the "element of interest"
+ This is done by making a box of predefined height from the Baseline, which
+ becomes the bottom side of the box.
+3 - (optionally) to discard SeparatorRegion from the GT and get instead those
+ from Transkribus.
+
+This is done page by page, for each document.
+
+Generate a new collection, with input documents enriched with GT areas.
+
+Any input document without GT counterpart is ignored.
+
+Usage: %s
+ [--normalize (%d above the Baseline)
+ [--normalize_height = (this height above the Baseline)
+ [--normalize-only]
+ [--separator] replace GT SeparatorRegion by those from input.
+ [--xpElement1 = ] (defaults to "%s")
+ [--xpElement2 = ] (defaults to "%s")
+ [--xparea1 = ] (defaults to "%s") (CURRENTLY IGNORED and USELESS)
+ [--xparea2 = ] (defaults to "%s")
+ [--eval]
+
+""" % (sys.argv[0], iNORMALIZED_HEIGHT
+ , xpELEMENT1, xpELEMENT2
+ , xpAREA1, xpAREA1)
+
+ parser = OptionParser(usage=sUsage)
+ parser.add_option("--xpElement1", dest='xpElement1', action="store", type="string"
+ , help="xpath of the elements lvl1"
+ , default=xpELEMENT1)
+ parser.add_option("--xpElement2", dest='xpElement2', action="store", type="string"
+ , help="xpath of the elements lvl2 to project"
+ , default=xpELEMENT2)
+ parser.add_option("--xpArea1", dest='xpArea1', action="store", type="string"
+ , help="xpath of the areas level 1 in GT"
+ , default=xpAREA1)
+ parser.add_option("--xpArea2", dest='xpArea2', action="store", type="string"
+ , help="xpath of the areas level 2 (nested) in GT"
+ , default=xpAREA2)
+ parser.add_option("--normalize", dest='bNorm', action="store_true"
+ , help="normalise the box of elements of interest")
+ parser.add_option("--separator", dest='bSep', action="store_true"
+ , help="replace any separator by those from the Transkribus collection")
+ parser.add_option("--normalize_height", dest='iNormHeight', action="store", type="int"
+ , help="normalise the box of elements of interest")
+ parser.add_option("--normalize-only", dest='bNormOnly', action="store_true"
+ , help="only normalize, does not project GT")
+ parser.add_option("--rm_by_id", dest='lsRmId', action="append"
+ , help="Remove those elements from the output XML")
+ parser.add_option("--eval", dest='bEval', action="store_true"
+ , help="evaluation mode, pass GT as input!!")
+ parser.add_option("--warm", dest='bWarm', action="store_true"
+ , help="Warm mode: skipped input files with a fresh output already there")
+ parser.add_option("-v", "--verbose", dest='bVerbose', action="store_true"
+ , help="Verbose mode")
+ (options, args) = parser.parse_args()
+
+ try:
+ sInputDir, sGTDir, sOutputDir = args
+ except ValueError:
+ sys.stderr.write(sUsage)
+ sys.exit(1)
+
+ # ... normalization
+ bNorm = bool(options.bNorm) or bool(options.iNormHeight) or bool(options.bNormOnly)
+ iNorm = options.iNormHeight if bool(options.iNormHeight) else iNORMALIZED_HEIGHT
+
+ # ... checking folders
+ if not os.path.normpath(sInputDir).endswith("col") : sInputDir = os.path.join(sInputDir, "col")
+ if not os.path.normpath(sGTDir).endswith("col") : sGTDir = os.path.join(sGTDir, "col")
+ if os.path.isdir(sInputDir) and os.path.isdir(sGTDir):
+        # create the output folders if required
+ if os.path.normpath(sOutputDir).endswith("col") :
+ pass # we expect the user knows what s/he does
+ else:
+ # try to create them
+ try: os.mkdir(sOutputDir);
+ except: pass
+ sOutputDir = os.path.join(sOutputDir, "col")
+ try: os.mkdir(sOutputDir);
+ except: pass
+ # all must be ok by now
+ lsDir = [sInputDir, sGTDir, sOutputDir]
+ if not all(os.path.isdir(s) for s in lsDir):
+ for s in lsDir:
+ if not os.path.isdir(s): sys.stderr.write("Not a directory: %s\n"%s)
+ sys.exit(2)
+
+ # ok, go!
+ traceln("Input is : ", os.path.abspath(sInputDir))
+ traceln("GT is in : ", os.path.abspath(sGTDir))
+ traceln("Ouput in : ", os.path.abspath(sOutputDir))
+ traceln("Elements lvl 1: ", repr(options.xpElement1))
+ traceln("Elements lvl 2: ", repr(options.xpElement2))
+ traceln("GT areas lvl 1 : " , repr(options.xpArea1))
+ traceln("GT areas lvl 2 (nested) : " , repr(options.xpArea2))
+ traceln("Normalise elements : ", bNorm)
+ traceln("Normalise to height : ", iNorm)
+ traceln("Get separators : ", options.bSep)
+ traceln("Remove elements with @id: ", options.lsRmId)
+
+ if os.listdir(sOutputDir): traceln("WARNING: *** output folder NOT EMPTY ***")
+
+ main(sInputDir, sGTDir, sOutputDir
+ , options.xpElement1, options.xpElement2
+ , options.xpArea1, options.xpArea2
+ , bNorm, iNorm, options.bNormOnly
+ , options.bSep
+ , options.lsRmId
+ , options.bEval
+ , options.bWarm
+ , options.bVerbose)
+
+ traceln("Done.")
\ No newline at end of file
diff --git a/TranskribusDU/tasks/tabulate_cell_cluster.py b/TranskribusDU/tasks/tabulate_cell_cluster.py
new file mode 100644
index 0000000..219ead8
--- /dev/null
+++ b/TranskribusDU/tasks/tabulate_cell_cluster.py
@@ -0,0 +1,644 @@
+# -*- coding: utf-8 -*-
+
+"""
+We expect XML file with cluster defined by one algo.
+
+For each Page:
+ We tabulate the clusters (build a table where each cluster is a cell)
+ We compute the row, col, row_span, col_span attributes of each cluster
+
+Overwrite the input XML files, adding attributes to the cluster definitions
+
+If the cluster do not have a defined shape, we compute a shape based on a minimum_rotated_rectangle
+
+Created on 26/9/2019
+
+Copyright NAVER LABS Europe 2019
+
+@author: JL Meunier
+"""
+
+import sys, os
+from optparse import OptionParser
+from collections import defaultdict
+from lxml import etree
+
+import numpy as np
+import shapely.ops
+from shapely import affinity
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+TranskribusDU_version
+
+from common.trace import traceln, trace
+from xml_formats.PageXml import PageXml
+
+from tasks.intersect_cluster import Cluster
+from graph.Block import Block
+from util.Shape import ShapeLoader
+
+# ----------------------------------------------------------------------------
+xpCluster = ".//pg:Cluster"
+xpClusterEdge = ".//pg:ClusterEdge"
+xpEdge = ".//pg:Edge"
+# sFMT = "(%s_∩_%s)" pb with visu
+sAlgoAttr = "algo"
+xpPage = ".//pg:Page"
+dNS = {"pg":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
+# ----------------------------------------------------------------------------
+
+
+class TableCluster(Cluster, Block):
+    thTopAligned = 20  # a difference of less than 20 pixels on y1 means top-aligned
+ # scale BB by these ratio (horizontally and vertically)
+ scale_H = 0.66 # better if same as in DU_Table_Col_Cut
+ # scale_H = 1.0 # to get hard cases
+ scale_V = 1 # do not shrink
+
+ cnt = 0
+
+ def __init__(self, name, setID, shape=None):
+ Cluster.__init__(self, name, setID, shape=shape)
+ # we do not __init__ Block - useless, we just need a few methods
+ self.dsEdge = defaultdict(set) # dic edge_type -> neighbours set
+ self.cnt = TableCluster.cnt
+ TableCluster.cnt += 1
+
+ @classmethod
+ def induceClusterEdge(cls, ndPage, lCluster):
+ """
+ compute inter- cluster edges from inter- cluster-item edges
+
+ no so good for horizontal edges... :-/
+ """
+        # reverse dictionary: itemID -> Cluster
+ dCluster_by_Item = { x:c for c in lCluster for x in c.getSetID() }
+ for _nd in ndPage.xpath(xpEdge, namespaces=dNS):
+ _A, _B = _nd.get("src"), _nd.get("tgt")
+ _AC, _BC = dCluster_by_Item[_A], dCluster_by_Item[_B]
+ if _AC != _BC:
+ TableCluster.link(_AC, _BC, edge_type=_nd.get("type"))
+ del dCluster_by_Item
+
+ @classmethod
+ def computeClusterEdge(cls, _ndPage, lCluster):
+ """
+ compute edge using g2 method from class Block :-)
+ A bit computationally heavy, but safe code...
+ """
+ lHEdge, lVEdge = Block.findPageNeighborEdges(lCluster, bShortOnly=False, iGraphMode=2)
+ for edge in lHEdge:
+ TableCluster.link(edge.A, edge.B, "HorizontalEdge")
+ for edge in lVEdge:
+ TableCluster.link(edge.A, edge.B, "VerticalEdge")
+
+ @classmethod
+ def addEdgesToXml(cls, ndPage, sAlgo, lCluster):
+ cnt = 0
+ ndPage.append(etree.Comment("\nInter-cluster edges by tabulate_cluster scale_H=%.2f sclae_V=%.2f\n" %(
+ cls.scale_H, cls.scale_V)))
+
+ setEdges = set()
+
+ for A in lCluster:
+ for edge_type, lLinked in A.dsEdge.items():
+ for B in lLinked:
+ if A.cnt >= B.cnt: continue
+ if (A, B, edge_type) not in setEdges:
+ # ok, let's add the edge A <--> B
+ ndEdge = PageXml.createPageXmlNode("ClusterEdge")
+ ndEdge.set("src", A.name)
+ ndEdge.set("tgt", B.name)
+ ndEdge.set("type", edge_type)
+ ndEdge.set("algo", sAlgo)
+ if True:
+ ptA = A.shape.representative_point()
+ ptB = B.shape.representative_point()
+
+ else:
+ ptA, ptB = shapely.ops.nearest_points(A.shape, B.shape)
+ PageXml.setPoints(ndEdge, list(ptA.coords) + list(ptB.coords))
+ ndEdge.tail = "\n"
+ ndPage.append(ndEdge)
+
+ setEdges.add((A, B, edge_type))
+ cnt += 1
+ del setEdges
+
+ return cnt
+
+ @classmethod
+ def removeEdgesFromXml(cls, ndPage):
+ """
+ Given an algo, remove all its clusters from a page
+ """
+ i = 0
+ for nd in ndPage.xpath(xpClusterEdge, namespaces=dNS):
+ ndPage.remove(nd)
+ i += 1
+ return i
+
+ @classmethod
+ def link(cls, A, B, edge_type=""):
+ """
+ record an edge between those 2 clusters
+ """
+ assert A != B
+ A.dsEdge[edge_type].add(B)
+ B.dsEdge[edge_type].add(A)
+
+ @classmethod
+ def computeClusterBoundingBox(cls, lCluster):
+ for c in lCluster:
+ c.setBB(c.shape.bounds)
+ assert c.x1 < c.x2
+ assert c.y1 < c.y2
+ if cls.scale_H != 1 or cls.scale_V != 1:
+ c.scaled_shape = affinity.scale(c.shape, xfact=cls.scale_H, yfact=cls.scale_V)
+ else:
+ c.scaled_shape = c.shape
+
+ @classmethod
+ def setTableAttribute(self, ndPage, setID, sAttr1, s1, sAttr2=None, s2=None):
+ """
+ set attributes such as "col" and "colSPan" of a set of objects given by their ID
+ """
+ lNode = [ndPage.xpath(".//*[@id='%s']"%_id, namespaces=dNS)[0] for _id in setID]
+ for nd in lNode:
+ nd.set(sAttr1, str(s1))
+ if bool(sAttr2):
+ nd.set(sAttr2, str(s2))
+
+ @classmethod
+ def tabulate(cls, ndPage, lCluster, bVerbose=False):
+ """
+ Top-down tabulation in the 4 directions
+ """
+
+ cls.tabulate_top_down(lCluster)
+ for c in lCluster:
+ c.row1 = c.minrow
+ c.node.set("row", str(c.row1))
+ maxRow = max(c.row1 for c in lCluster)
+ #c.node.set("col", str(c.mincol))
+ #c.node.set("rowSpan", str(c.maxrow - c.minrow + 1))
+ #c.node.set("colSpan", str(c.maxcol - c.mincol + 1))
+
+ cls.rotateClockWise90deg(lCluster, bVerbose=bVerbose)
+ cls.tabulate_top_down(lCluster)
+ for c in lCluster:
+ c.col1 = c.minrow
+ c.node.set("col", str(c.col1))
+ maxCol = max(c.col1 for c in lCluster)
+
+ cls.rotateClockWise90deg(lCluster, bVerbose=bVerbose)
+ cls.tabulate_top_down(lCluster)
+ for c in lCluster:
+ c.row2 = maxRow - c.minrow
+ rowSpan = str(1 + c.row2 - c.row1)
+ c.node.set("rowSpan", rowSpan)
+ cls.setTableAttribute(ndPage, c.getSetID(), "row", c.row1, "rowSpan", rowSpan)
+
+ cls.rotateClockWise90deg(lCluster, bVerbose=bVerbose)
+ cls.tabulate_top_down(lCluster)
+ for c in lCluster:
+ c.col2 = maxCol - c.minrow
+ colSpan = str(1 + c.col2 - c.col1)
+ c.node.set("colSpan", colSpan)
+ cls.setTableAttribute(ndPage, c.getSetID(), "col", c.col1, "colSpan", colSpan)
+
+ @classmethod
+ def tabulate_rows(cls, ndPage, lCluster, bVerbose=False):
+ """
+ Top-down and bottom-up tabulations
+ """
+
+ cls.tabulate_top_down(lCluster)
+
+ maxRow = max(c.minrow for c in lCluster)
+ traceln(" maxRow=", maxRow)
+
+# if False:
+# for c in lCluster:
+# c.row1 = c.minrow
+# c.node.set("row", str(c.row1))
+# cls.rotateClockWise180deg(lCluster, bVerbose=bVerbose)
+# cls.tabulate_top_down(lCluster)
+# for c in lCluster:
+# c.row2 = max(maxRow - c.minrow, c.row1)
+# rowSpan = str(1 + c.row2 - c.row1)
+# c.node.set("rowSpan", rowSpan)
+# cls.setTableAttribute(ndPage, c.getSetID(), "row", c.row1, "rowSpan", rowSpan)
+# elif False:
+# for c in lCluster:
+# c.node.set("row", str(c.minrow))
+# rowSpan = str(9)
+# c.node.set("rowSpan", rowSpan)
+# cls.setTableAttribute(ndPage, c.getSetID(), "row", c.minrow, "rowSpan", rowSpan)
+ # tabulate top-down, then compute the separators and use them for
+ # deciding the row and rowSpan
+ # we get a list of linear separators, to be reflected as SeparatorRegion
+ cls.map_to_rows(ndPage, maxRow, lCluster)
+
+ for c in lCluster:
+ c.node.set("row", str(c.row1))
+ rowSpan = str(1 + c.row2 - c.row1)
+ c.node.set("rowSpan", rowSpan)
+ cls.setTableAttribute(ndPage, c.getSetID(), "row", c.row1, "rowSpan", rowSpan)
+
+
+ @classmethod
+ def use_cut_columns(cls, ndPage):
+ """
+ use the name of the cut cluster to compute the col
+ colSPan is always 1 in that case
+ """
+ #
+ for ndCluster in ndPage.xpath(xpCluster+"[@algo='cut']", namespaces=dNS):
+ col = str(int(ndCluster.get("name")) - 1)
+ setID = set(ndCluster.get("content").split())
+ ndCluster.set("col", col)
+ ndCluster.set("colSpan", "1")
+ cls.setTableAttribute(ndPage, setID, "col", col, "colSpan", "1")
+
+ @classmethod
+ def tabulate_top_down(cls, lCluster):
+ """
+ compute minrow and maxrow values
+ """
+ for c in lCluster:
+ assert c.x1 <= c.x2
+ assert c.y1 <= c.y2
+
+ step = 1
+ step_max = len(lCluster) + 1
+
+ for c in lCluster: c.minrow = -1
+
+ lTodoCluster = lCluster
+ prevSetUpdated = None
+ bNoLoop = True
+ while lTodoCluster and bNoLoop:
+ setUpdated = set()
+ traceln(" - STEP %d"%step)
+ # since we keep increasing the minrow, its maximum value cannot
+ # exceed len(lCluster), which is reached with at most step_max steps
+ assert step <= step_max, "algorithm error"
+
+ # visit all vertically from top cluster
+ lTodoCluster.sort(key=lambda o: o.y1)
+ # faster?? lCurCluster.sort(key=operator.attrgetter("x1"))
+# print([c.name for c in lTodoCluster])
+# for i in [0, 1]:
+# print(lCluster[i].name, " y1=", lCluster[i].y1, " y2=", lCluster[i].y2)
+ for c in lTodoCluster:
+ setUpdated.update(c.visitStackDown(0))
+ # visit all, horizontally from leftest clusters
+ lTodoCluster.sort(key=lambda o: o.x1)
+ for c in lTodoCluster:
+ setUpdated.update(c.visitPeerRight())
+
+ lTodoCluster.sort(key=lambda o: o.x2, reverse=True)
+ for c in lTodoCluster:
+ setUpdated.update(c.visitPeerLeft())
+
+ if not prevSetUpdated is None and prevSetUpdated == setUpdated:
+ traceln(" - loop detected - stopping now.")
+ bNoLoop = False
+ prevSetUpdated = setUpdated
+ lTodoCluster = list(setUpdated)
+ traceln(" ... %d updated" % len(lTodoCluster))
+ step += 1
+
+ if not bNoLoop:
+ # need to fix the problem...
+ # because of the loop, we have holes in the list of row numbers
+ lMinrow = list(set(c.minrow for c in lCluster))
+ lMinrow.sort()
+ curRow = 0
+ for iMinrow in range(len(lMinrow)):
+ minrow = lMinrow[iMinrow]
+ if minrow > curRow:
+ # missing row number...
+ delta = minrow - curRow
+ for c in lCluster:
+ if c.minrow >= curRow:
+ c.minrow -= delta
+ for j in range(iMinrow, len(lMinrow)):
+ lMinrow[j] = lMinrow[j] - delta
+ curRow += 1
+
+ def visitStackDown(self, minrow, setVisited=set()):
+ """
+ e.g., stacking from top to bottom, we get a visit from upward, so we update our minrow accordingly
+ return the set of updated items
+ """
+ #if self.name == "(6_I_agglo_345866)" and minrow > 17: print(self.name, minrow)
+ setUpdated = set()
+
+ if minrow > self.minrow:
+ # the stack above us tells us about our minrow!
+ self.minrow = minrow
+ setUpdated.add(self)
+
+ for c in self.dsEdge["VerticalEdge"]:
+ # make sure we go downward
+ # if c.y1 > self.y1:
+ # and that the edge is a valid one
+ # which implies the 1st condition!
+ if self.y2 < c.y1:
+ if self.minrow >= c.minrow:
+ # otherwise no need...
+ setUpdated.update(c.visitStackDown(self.minrow + 1, setVisited))
+ elif self.y1 < c.y1:
+ # c starts within self...
+ # maybe there is skewing?
+ if self.scaled_shape.intersects(c.scaled_shape):
+ # since we do not increase minrow, we need to make sure
+ # we do not infinite loop...
+ # (increasing minrow forces us to move downward the page and to end at some point)
+ if self.minrow > c.minrow or not self in setVisited:
+ setVisited.add(self)
+ setUpdated.update(c.visitStackDown(self.minrow, setVisited))
+ else:
+ # I believe one is mostly above the other
+ if self.minrow >= c.minrow:
+ setUpdated.update(c.visitStackDown(self.minrow + 1, setVisited))
+
+ return setUpdated
+
+ def visitPeerRight(self):
+ """
+ go from left to right, making sure the minrow is consistent with the geometric relationships
+ """
+ setUpdated = set()
+ a = self
+ for b in self.dsEdge["HorizontalEdge"]:
+ # make sure we go in good direction: rightward
+ if a.x2 <= b.x1:
+ minrow = max(a.minrow, b.minrow)
+ bAB = TableCluster.isTopAligned(a, b) # top justified
+ bA = bAB or a.y1 > b.y1 # a below b
+ bB = bAB or a.y1 < b.y1 # a above b
+
+ if bA and minrow > a.minrow:
+ a.minrow = minrow
+ setUpdated.add(a)
+
+ if bB and minrow > b.minrow:
+ b.minrow = minrow
+ setUpdated.add(b)
+ setUpdated.update(b.visitPeerRight())
+ return setUpdated
+
+ def visitPeerLeft(self):
+ """
+        go from right to left, making sure the minrow is consistent with the geometric relationships
+ """
+ setUpdated = set()
+ a = self
+ for b in self.dsEdge["HorizontalEdge"]:
+ # make sure we go in good direction: leftward
+ if b.x2 <= a.x1:
+ minrow = max(a.minrow, b.minrow)
+ bAB = TableCluster.isTopAligned(a, b) # top justified
+ bA = bAB or a.y1 > b.y1 # a below b
+ bB = bAB or a.y1 < b.y1 # a above b
+
+ if bA and minrow > a.minrow:
+ a.minrow = minrow
+ setUpdated.add(a)
+
+ if bB and minrow > b.minrow:
+ b.minrow = minrow
+ setUpdated.add(b)
+ setUpdated.update(b.visitPeerRight())
+
+ return setUpdated
+
+ @classmethod
+ def isTopAligned(cls, a, b):
+ return abs(a.y1 - b.y1) < cls.thTopAligned
+
+
+ @classmethod
+ def rotateClockWise90deg(cls, lCluster, bVerbose=True):
+ if bVerbose: traceln(" -- rotation 90° clockwise")
+ for c in lCluster:
+ c.x1, c.y1, c.x2, c.y2 = -c.y2, c.x1, -c.y1, c.x2
+ c.dsEdge["HorizontalEdge"], c.dsEdge["VerticalEdge"] = c.dsEdge["VerticalEdge"], c.dsEdge["HorizontalEdge"]
+ return
+
+ @classmethod
+ def rotateClockWise180deg(cls, lCluster, bVerbose=True):
+ if bVerbose: traceln(" -- rotation 180° clockwise")
+ for c in lCluster:
+ c.x1, c.y1, c.x2, c.y2 = -c.x2, -c.y2, -c.x1, -c.y1
+ return
+
+ @classmethod
+ def map_to_rows(cls, ndPage, maxRow, lCluster):
+ """
+ find lienar separators separating rows
+ """
+ # reflect each cluster by the highest point (highest ending points of baselines)
+ dMinYByRow = defaultdict(lambda :9999999999)
+ n = 2 * sum(len(c) for c in lCluster)
+ X = np.zeros(shape=(n, 2)) # x,y coordinates
+ i = 0
+ for c in lCluster:
+ c.maxY = -1
+ c.minY = 9999999999
+ for _id in c.getSetID():
+ """
+
+
+
+ ung.
+
+ """
+ nd = ndPage.xpath(".//*[@id='%s']/pg:Baseline"%_id, namespaces=dNS)[0]
+ ls = ShapeLoader.node_to_LineString(nd)
+ pA, pB = ls.boundary.geoms
+ minY = min(pA.y, pB.y)
+ c.minY = min(c.minY, minY)
+ c.maxY = max(c.maxY, max((pA.y, pB.y)))
+ dMinYByRow[c.minrow] = min(dMinYByRow[c.minrow], minY)
+ # for the linear separators
+ X[i,:] = (pA.x, pA.y)
+ i = i + 1
+ X[i,:] = (pB.x, pB.y)
+ i = i + 1
+
+ # check consistency
+ for c in lCluster:
+ for i in range(maxRow, c.minrow, -1):
+ if c.minY > dMinYByRow[i]:
+ assert c.minrow < i
+ # how possible??? fix!!
+ c.minrow = i
+ break
+
+ # compute row1 and row2
+ for c in lCluster:
+ c.row1 = c.minrow
+ c.row2 = c.minrow
+ for i in range(0, maxRow+1):
+ if c.maxY > dMinYByRow[i]:
+ c.row2 = i
+ else:
+ break
+
+ # now compute maxRow - 1 separators!
+ w = float(ndPage.get("imageWidth"))
+ Y = np.zeros(shape=(n,)) # labels
+# lAB = [getLinearSeparator(X, np.clip(Y, row, row+1))
+# for row in range(maxRow-1)]
+
+ for nd in ndPage.xpath(".//pg:SeparatorRegion[@algo]", namespaces=dNS):
+ ndPage.remove(nd)
+
+ for row in range(maxRow+1):
+ Y0 = dMinYByRow[row] - 20
+ Yw = Y0
+ ndSep = PageXml.createPageXmlNode("SeparatorRegion")
+ ndSep.set("algo", "tabulate_rows")
+ ndCoords = PageXml.createPageXmlNode("Coords")
+ ndCoords.set("points", "%d,%d %d,%d" %(0, Y0, w, Yw))
+ ndSep.append(ndCoords)
+ ndSep.tail = "\n"
+ ndPage.append(ndSep)
+
+ return
+
+
+def main(sInputDir, sAlgo, bCol=False, scale_H=None, scale_V=None, bVerbose=False):
+
+ if not scale_H is None: TableCluster.scale_H = scale_H
+ if not scale_V is None: TableCluster.scale_V = scale_V
+
+ traceln("scale_H=", TableCluster.scale_H)
+ traceln("scale_V=", TableCluster.scale_V)
+
+ # filenames without the path
+ lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir) if name.endswith("_du.pxml") or name.endswith("_du.mpxml")]
+ traceln(" - %d files to process, to tabulate clusters '%s'" % (
+ len(lsFilename)
+ , sAlgo))
+ lsFilename.sort()
+ for sFilename in lsFilename:
+ sFullFilename = os.path.join(sInputDir, sFilename)
+ traceln(" -------- FILE : ", sFullFilename)
+ cnt = 0
+ doc = etree.parse(sFullFilename)
+
+ for iPage, ndPage in enumerate(doc.getroot().xpath(xpPage, namespaces=dNS)):
+ lCluster = TableCluster.load(ndPage, sAlgo, bNode=True) # True to keep a pointer to the DOM node
+
+ if bVerbose:
+ trace(" --- Page %d : %d cluster '%s' " %(iPage+1, len(lCluster), sAlgo))
+ if len(lCluster) == 0:
+ traceln("*** NO cluster '%s' *** we keep this page unchanged"%sAlgo)
+ continue
+ _nbRm = TableCluster.removeEdgesFromXml(ndPage)
+ if bVerbose:
+ traceln("\n %d ClusterEdge removed"%_nbRm)
+
+ TableCluster.computeClusterBoundingBox(lCluster)
+
+ if True:
+ # edges are better this way!
+ lBB = []
+ for c in lCluster:
+ lBB.append(c.getBB())
+ c.scale(TableCluster.scale_H, TableCluster.scale_V)
+ TableCluster.computeClusterEdge(ndPage, lCluster)
+ for c, bb in zip(lCluster, lBB):
+ c.setBB(bb)
+ # for c in lCluster: c.scale(1.0/TableCluster.scale_H, 1.0/TableCluster.scale_V)
+ else:
+ # compute inter- cluster edges from inter- cluster-item edges
+ TableCluster.induceClusterEdge(ndPage, lCluster)
+
+ # store inter-cluster edges
+ cntPage = TableCluster.addEdgesToXml(ndPage, sAlgo, lCluster)
+ if bVerbose:
+ traceln(" %d inter-cluster edges " %(cntPage))
+
+ # compute min/max row/col for each cluster
+ # WARNING - side effect on lCluster content and edges
+ if bCol:
+ TableCluster.tabulate(ndPage, lCluster, bVerbose=bVerbose)
+ else:
+ TableCluster.tabulate_rows(ndPage, lCluster, bVerbose=bVerbose)
+ TableCluster.use_cut_columns(ndPage)
+
+ cnt += cntPage
+ traceln("%d inter-cluster edges" %(cnt))
+
+
+ doc.write(sFullFilename,
+ xml_declaration=True,
+ encoding="utf-8",
+ pretty_print=True
+ #compression=0, #0 to 9
+ )
+
+ del doc
+
+ traceln(" done (%d files)" % len(lsFilename))
+
+
+
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+
+ version = "v.01"
+ sUsage="""
+Tabulate the clusters from given @algo and compute the row, col, row_span, col_span attributes of each cluster
+
+Usage: %s
+
+""" % (sys.argv[0])
+
+ parser = OptionParser(usage=sUsage)
+ parser.add_option("--scale_h", dest='fScaleH', action="store", type="float"
+ , help="objects are horizontally scaled by this factor")
+ parser.add_option("--scale_v", dest='fScaleV', action="store", type="float"
+ , help="objects are vertically scaled by this factor")
+ parser.add_option("--col", dest='bCol', action="store_true"
+ , help="Columns also tabulated instead of derived from 'cut' clusters")
+ parser.add_option("-v", "--verbose", dest='bVerbose', action="store_true"
+ , help="Verbose mode")
+ (options, args) = parser.parse_args()
+
+ try:
+ sInputDir, sA = args
+ except ValueError:
+ sys.stderr.write(sUsage)
+ sys.exit(1)
+
+ # ... checking folders
+ if not os.path.normpath(sInputDir).endswith("col") : sInputDir = os.path.join(sInputDir, "col")
+
+ if not os.path.isdir(sInputDir):
+ sys.stderr.write("Not a directory: %s\n"%sInputDir)
+ sys.exit(2)
+
+ # ok, go!
+ traceln("Input is : ", os.path.abspath(sInputDir))
+ traceln("algo is : ", sA)
+ if options.bCol:
+ traceln("columns also tabulated")
+ else:
+ traceln("columns are those of projection profile")
+
+ main(sInputDir, sA, bCol=options.bCol
+ , scale_H=options.fScaleH, scale_V=options.fScaleV
+ , bVerbose=options.bVerbose)
+
+ traceln("Done.")
\ No newline at end of file
diff --git a/TranskribusDU/tasks/tabulate_final.py b/TranskribusDU/tasks/tabulate_final.py
new file mode 100644
index 0000000..c5aef3a
--- /dev/null
+++ b/TranskribusDU/tasks/tabulate_final.py
@@ -0,0 +1,248 @@
+# -*- coding: utf-8 -*-
+
+"""
+We expect XML file with TextLine having the row, col, rowSpan, colSpan attributes
+
+For each Page:
+ We delete any empty table (or complain if not empty)
+ We select TextLine with rowSPan=1 and colSpan=1
+ We create one cell for each pair of row and col number
+We inject the TextLine into its cell
+ We create a TableRegion to contain the cells
+ We delete empty regions
+ We resize non-empty regions
+
+We compute the cell and table geometries and store them.
+
+Created on 21/10/2019
+
+Copyright NAVER LABS Europe 2019
+
+@author: JL Meunier
+"""
+
+import sys, os
+from optparse import OptionParser
+from collections import defaultdict
+from lxml import etree
+
+from shapely.ops import cascaded_union
+
+try: #to ease the use without proper Python installation
+ import TranskribusDU_version
+except ImportError:
+ sys.path.append( os.path.dirname(os.path.dirname( os.path.abspath(sys.argv[0]) )) )
+ import TranskribusDU_version
+TranskribusDU_version
+
+from common.trace import traceln
+from xml_formats.PageXml import PageXml
+
+from util.Shape import ShapeLoader
+
+# ----------------------------------------------------------------------------
+xpPage = ".//pg:Page"
+dNS = {"pg":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
+# ----------------------------------------------------------------------------
+
+
+def processRegions(ndPage,bVerbose=False):
+ """
+ Delete empty regions
+    resize non-empty regions
+ """
+ lDel=[]
+ lndRegions = ndPage.xpath(".//pg:TextRegion", namespaces=dNS)
+ for ndRegion in lndRegions:
+ lTL= ndRegion.xpath(".//pg:TextLine", namespaces=dNS)
+ if lTL == []:
+ # to be deleted
+ lDel.append(ndRegion)
+ else:
+ #resize it
+ oHull = ShapeLoader.convex_hull(lTL, bShapelyObject=True)
+ PageXml.getChildByName(ndRegion,'Coords')[0].set("points", ShapeLoader.getCoordsString(oHull, bFailSafe=True))
+# contour = cascaded_union([p if p.is_valid else p.convex_hull for p in lTL ])
+# o = contour.minimum_rotated_rectangle
+# ndRegion.getChildByName('Coords').set("points", ShapeLoader.getCoordsString(o, bFailSafe=True))
+
+ # delete empty regions
+ [ ndRegion.getparent().remove(ndRegion) for ndRegion in lDel]
+
+ if bVerbose:
+ traceln(" - %d regions deleted"%(len(lDel)))
+ traceln(" - %d regions updated"%(len(lndRegions) - len(lDel)))
+
+class TableRegion:
+
+ def __init__(self, pagenum, tablenum):
+ self.pagenum = pagenum
+ self.tablenum = tablenum
+ # (row, col) -> list of nodes
+ self._dCellNd = defaultdict(list)
+
+ def addToCell(self, row, col, nd):
+ self._dCellNd[(row, col)].append(nd)
+
+ def makeTableNode(self):
+ """
+ Make a DOM tree for this table
+ """
+ lK = self._dCellNd.keys()
+ lRow = list(set(_row for _row, _col in lK))
+ lRow.sort()
+ lCol = list(set(_col for _row, _col in lK))
+ lCol.sort()
+
+ ndTable = PageXml.createPageXmlNode("TableRegion")
+ ndTable.set("id", "p%s_%s" % (self.pagenum, self.tablenum))
+ ndTable.tail = "\n"
+ lCellShape = []
+ lNdCell = []
+ for row in lRow:
+ for col in lCol:
+ lNdText = self._dCellNd[(row, col)]
+ #
+ #
+
+ if lNdText:
+ ndCell = PageXml.createPageXmlNode("TableCell")
+ ndCell.set("id", "p%s_t%s_r%s_c%s"%(self.pagenum, self.tablenum, row, col))
+
+ # shape of the cell
+ oHull = ShapeLoader.convex_hull(lNdText, bShapelyObject=True)
+ lCellShape.append(oHull) # keep those to compute table contour
+
+ # Coords sub-element
+ ndCoords = PageXml.createPageXmlNode("Coords")
+ ndCoords.set("points", ShapeLoader.getCoordsString(oHull, bFailSafe=True))
+ ndCoords.tail = "\n"
+ ndCell.append(ndCoords)
+
+ # row="0" col="0" rowSpan="1" colSpan="1" leftBorderVisible="false" rightBorderVisible="false" topBorderVisible="false" bottomBorderVisible="false"
+ ndCell.set("row" , str(row))
+ ndCell.set("rowSpan", "1")
+ ndCell.set("col" , str(col))
+ ndCell.set("colSpan", "1")
+ ndCell.tail = "\n"
+
+ for nd in lNdText: ndCell.append(nd)
+
+ lNdCell.append(ndCell)
+
+ # Table geometry
+ ndCoords = PageXml.createPageXmlNode("Coords")
+ contour = cascaded_union([p if p.is_valid else p.convex_hull for p in lCellShape ])
+ o = contour.minimum_rotated_rectangle
+ ndCoords.set("points", ShapeLoader.getCoordsString(o, bFailSafe=True))
+ ndCoords.tail = "\n"
+ ndTable.append(ndCoords)
+
+ for nd in lNdCell:
+ ndTable.append(nd)
+
+ return ndTable
+
+
+def main(sInputDir, bForce=False, bVerbose=False):
+
+ # filenames without the path
+ lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir) if name.endswith("_du.pxml") or name.endswith("_du.mpxml")]
+ traceln(" - %d files to process, to tabulate clusters" % (
+ len(lsFilename)))
+ lsFilename.sort()
+ for sFilename in lsFilename:
+ sFullFilename = os.path.join(sInputDir, sFilename)
+ traceln(" -------- FILE : ", sFullFilename)
+ cnt = 0
+ doc = etree.parse(sFullFilename)
+
+ for iPage, ndPage in enumerate(doc.getroot().xpath(xpPage, namespaces=dNS)):
+
+ # find and delete any pre-existing table
+            # if bForce, then move any TextLine under Page before table deletion
+ lNdTable = ndPage.xpath(".//pg:TableRegion", namespaces=dNS)
+ if bVerbose:
+ if bForce:
+ traceln(" - %d pre-existing table to be deleted, preserving its contents by moving it under Page node" % len(lNdTable))
+ else:
+ traceln(" - %d pre-existing table to be deleted IF EMPTY" % len(lNdTable))
+ for ndTable in lNdTable:
+ lNd = ndTable.xpath(".//pg:TextLine", namespaces=dNS)
+ if lNd:
+ if bForce:
+ for nd in lNd:
+ nd.getparent().remove(nd)
+ ndPage.append(nd)
+ else:
+ raise ValueError("Pre-existing Table not empty")
+ ndTable.getparent().remove(ndTable)
+
+ # enumerate text, and add to cell
+ # ignore any text in col|row-spanning cells
+ table = TableRegion(iPage+1, 1) # only one table for now!
+ lNdText = ndPage.xpath('.//pg:TextLine[@rowSpan="1" and @colSpan="1"]', namespaces=dNS)
+ for ndText in lNdText:
+ ndText.getparent().remove(ndText)
+ table.addToCell( int(ndText.get("row"))
+ , int(ndText.get("col"))
+ , ndText)
+
+            # make the table node!
+ ndTable = table.makeTableNode()
+ # add it to the page
+ ndPage.append(ndTable)
+
+ processRegions(ndPage,bVerbose)
+
+ doc.write(sFullFilename,
+ xml_declaration=True,
+ encoding="utf-8",
+ pretty_print=True
+ #compression=0, #0 to 9
+ )
+
+ del doc
+
+ traceln(" done (%d files)" % len(lsFilename))
+
+
+
+# ----------------------------------------------------------------------------
+if __name__ == "__main__":
+
+ version = "v.01"
+ sUsage="""
+Create a TableRegion for non-spanning cells.
+Rely on row, col, rowSpan, colSpan attributes of the TextLine
+
+Usage: %s
+
+""" % (sys.argv[0])
+
+ parser = OptionParser(usage=sUsage)
+ parser.add_option("-v", "--verbose", dest='bVerbose', action="store_true"
+ , help="Verbose mode")
+ parser.add_option("-f", "--force", dest='bForce', action="store_true"
+ , help="Force deletion of pre-existing tables, if not empty keeps its contents")
+ (options, args) = parser.parse_args()
+
+ try:
+ [sInputDir] = args
+ except ValueError:
+ sys.stderr.write(sUsage)
+ sys.exit(1)
+
+ # ... checking folders
+ if not os.path.normpath(sInputDir).endswith("col") : sInputDir = os.path.join(sInputDir, "col")
+
+ if not os.path.isdir(sInputDir):
+ sys.stderr.write("Not a directory: %s\n"%sInputDir)
+ sys.exit(2)
+
+ # ok, go!
+ traceln("Input is : ", os.path.abspath(sInputDir))
+
+ main(sInputDir, bForce=options.bForce, bVerbose=options.bVerbose)
+
+ traceln("Done.")
\ No newline at end of file
diff --git a/TranskribusDU/test_install/test_install.py b/TranskribusDU/test_install/test_install.py
index bd6d21e..3a35216 100644
--- a/TranskribusDU/test_install/test_install.py
+++ b/TranskribusDU/test_install/test_install.py
@@ -5,18 +5,7 @@
Copyright Xerox(C) 2016 JL. Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/tests/test_DU_ABPTABLE.py b/TranskribusDU/tests/test_DU_ABPTABLE.py
index 99ed3af..82802a4 100644
--- a/TranskribusDU/tests/test_DU_ABPTABLE.py
+++ b/TranskribusDU/tests/test_DU_ABPTABLE.py
@@ -8,8 +8,6 @@
@author: meunier
'''
-from __future__ import absolute_import, print_function
-
import sys
import os.path
@@ -19,7 +17,7 @@
sDATA_DIR = os.path.join(sTESTS_DIR, "data")
sys.path.append(os.path.dirname(sTESTS_DIR))
-import crf.Graph
+import graph.Graph
import tasks.DU_ABPTable
@@ -37,7 +35,7 @@ def __init__(self):
self.pkl = False
self.rm = False
self.crf_njobs = 2
- self.crf_max_iter = 2
+ self.max_iter = 2
self.crf_C = None
self.crf_tol = None
self.crf_inference_cache = None
@@ -47,7 +45,7 @@ def __init__(self):
self.applyY = None
def test_ABPTable_train():
- crf.Graph.Graph.resetNodeTypes()
+ graph.Graph.Graph.resetNodeTypes()
sModelDir = os.path.join(sTESTS_DIR, "models")
sModelName = "test_ABPTable_train"
diff --git a/TranskribusDU/util/CollectionSplitter.py b/TranskribusDU/util/CollectionSplitter.py
index 24cee52..356851c 100644
--- a/TranskribusDU/util/CollectionSplitter.py
+++ b/TranskribusDU/util/CollectionSplitter.py
@@ -7,18 +7,7 @@
Copyright NAVER(C) 2019 Jean-Luc Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
from the European Union's Horizon 2020 research and innovation programme
diff --git a/TranskribusDU/util/Polygon.py b/TranskribusDU/util/Polygon.py
index 7bb5395..6f512d0 100644
--- a/TranskribusDU/util/Polygon.py
+++ b/TranskribusDU/util/Polygon.py
@@ -6,18 +6,7 @@
Copyright Xerox(C) 2016 H. Déjean, JL. Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/util/Shape.py b/TranskribusDU/util/Shape.py
index 12047d9..e4395a2 100644
--- a/TranskribusDU/util/Shape.py
+++ b/TranskribusDU/util/Shape.py
@@ -4,20 +4,9 @@
Utilities to deal with the PageXMl 2D objects using shapely
- Copyright NAVER(C) 2018 JL. Meunier
+ Copyright Xerox(C) 2018 JL. Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -28,6 +17,7 @@
import shapely.geometry as geom
from shapely.prepared import prep
+from shapely.ops import cascaded_union
from rtree import index
import numpy as np
@@ -36,6 +26,77 @@
class ShapeLoader:
+ @classmethod
+ def getCoordsString(cls, o, bFailSafe=False):
+ """
+ Produce the usual content of the "Coords" attribute, e.g.:
+ "3162,1205 3162,1410 126,1410 126,1205 3162,1205"
+ may raise an exception
+ """
+ try:
+ lt2 = o.exterior.coords # e.g. [(0.0, 0.0), (1.0, 1.0), (1.0, 0.0)]
+ except:
+ if bFailSafe:
+ try:
+ lt2 = o.coords
+ except:
+ return ""
+ else:
+ lt2 = o.coords
+ return " ".join("%d,%d" % (a,b) for a,b in lt2)
+
+
+ @classmethod
+ def contourObject(cls, lNd):
+ """
+        Return the contour of the given list of PageXml nodes, as a
+        shapely object: the union of the polygons parsed from the nodes
+        (invalid polygons are replaced by their convex hull).
+        Nodes whose points cannot be parsed are silently skipped.
+
+        Returns a shapely geometry (possibly empty if no node could be
+        parsed).
+ """
+ lp = []
+ for nd in lNd:
+ try:
+ lp.append(ShapeLoader.node_to_Polygon(nd))
+ except:
+ pass
+
+ o = cascaded_union([p if p.is_valid else p.convex_hull for p in lp ])
+ return o
+
+ @classmethod
+ def minimum_rotated_rectangle(cls, lNd, bShapelyObject=False):
+ """
+ return the stringified list of coordinates of the minimum rotated
+ rectangle for the list of PageXml node.
+ e.g. "3162,1205 3162,1410 126,1410 126,1205 3162,1205"
+ return "" upon error
+
+        if bShapelyObject is True, then return the Shapely object
+ raise an Exception upon error
+ """
+ contour = cls.contourObject(lNd)
+ o = contour.minimum_rotated_rectangle
+ return o if bShapelyObject else cls.getCoordsString(o, bFailSafe=True)
+
+ @classmethod
+ def convex_hull(cls, lNd, bShapelyObject):
+ """
+        return the stringified list of coordinates of the convex hull
+        for the list of PageXml node.
+        e.g. "3162,1205 3162,1410 126,1410 126,1205 3162,1205"
+        return "" upon error
+
+        if bShapelyObject is True, then return the Shapely object
+        raise an Exception upon error
+ """
+ contour = cls.contourObject(lNd)
+ o = contour.convex_hull
+ return o if bShapelyObject else cls.getCoordsString(o, bFailSafe=True)
+
@classmethod
def node_to_Point(cls, nd):
"""
@@ -73,14 +134,18 @@ def node_to_SingleLine(cls, nd):
return cls.LinearRegression(o)
@classmethod
- def node_to_Polygon(cls, nd):
+ def node_to_Polygon(cls, nd, bValid=True):
"""
Find the points attribute (either in the DOM node itself or in a
children Coord node)
Parse the points series
Return a Polygon shapely object
"""
- return cls._shapeFromNodePoints(nd, geom.Polygon)
+ p = cls._shapeFromNodePoints(nd, geom.Polygon)
+ if bValid and not p.is_valid:
+ # making sure it is a valid shape
+ p = p.buffer(0)
+ return p
@classmethod
def children_to_LineString(cls, node, name, fun=None):
@@ -393,7 +458,12 @@ def test_ShapeLoader():
o = ShapeLoader._shapeFromPoints("0,0 0,9", geom.LineString)
assert o.length == 9
assert o.area == 0.0
-
+
+def test_ShapeLoader_Coords():
+ s = "3162,1205 3162,1410 126,1410 3162,1205"
+ o = ShapeLoader._shapeFromPoints(s, geom.Polygon)
+ assert ShapeLoader.getCoordsString(o) == s
+
# -----------------------------------------------------------------------
def test_ShapePartition_object_above(capsys):
with capsys.disabled():
diff --git a/TranskribusDU/util/dtw.py b/TranskribusDU/util/dtw.py
index 6e25989..6e13a23 100644
--- a/TranskribusDU/util/dtw.py
+++ b/TranskribusDU/util/dtw.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2019 H. Déjean, JL. Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/util/hungarian.py b/TranskribusDU/util/hungarian.py
index fac52ca..42b54f2 100644
--- a/TranskribusDU/util/hungarian.py
+++ b/TranskribusDU/util/hungarian.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2019 H. Déjean, JL. Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -84,4 +73,18 @@ def test_simple():
assert evalHungarian([(1,)], lref, 0.6) == (0, 1, 3)
+def test_simple_unordered():
+
+ lref = [ (3,4), (1,2), (99,6) ]
+
+ l1 = [ (2,1), (4,3), ( 5,6) ]
+
+ assert evalHungarian(l1, l1, 0.4) == (3, 0, 0)
+ assert evalHungarian(l1, lref, 0.3) == (3, 0, 0)
+ assert evalHungarian(l1, lref, 0.6) == (2, 1, 1)
+
+ l2 = [ (3,4), (1,2), (66,6), (99, 999)]
+ assert evalHungarian(l2, lref, 0.6) == (2, 2, 1)
+
+ assert evalHungarian([(1,)], lref, 0.6) == (0, 1, 3)
\ No newline at end of file
diff --git a/TranskribusDU/util/iou.py b/TranskribusDU/util/iou.py
index b394f79..ae91d09 100644
--- a/TranskribusDU/util/iou.py
+++ b/TranskribusDU/util/iou.py
@@ -7,18 +7,7 @@
Copyright Naver Labs Europe(C) 2019 H. Déjean, JL. Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/util/jaccard.py b/TranskribusDU/util/jaccard.py
index 5d2b833..3103dce 100644
--- a/TranskribusDU/util/jaccard.py
+++ b/TranskribusDU/util/jaccard.py
@@ -7,18 +7,7 @@
Copyright Naver Labs Europe(C) 2019 H. Déjean, JL. Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/util/lcs.py b/TranskribusDU/util/lcs.py
index c913176..baf9c13 100644
--- a/TranskribusDU/util/lcs.py
+++ b/TranskribusDU/util/lcs.py
@@ -37,53 +37,7 @@ def matchLCS(perc, t1, t2):
return ((val >= perc), val)
-def testlcs(self,X,Y,m,n):
- L = [[0 for x in range(n+1)] for x in range(m+1)]
-
- # Following steps build L[m+1][n+1] in bottom up fashion. Note
- # that L[i][j] contains length of LCS of X[0..i-1] and Y[0..j-1]
- for i in range(m+1):
- for j in range(n+1):
- if i == 0 or j == 0:
- L[i][j] = 0
- elif X[i-1] == Y[j-1]:
- L[i][j] = L[i-1][j-1] + 1
- else:
- L[i][j] = max(L[i-1][j], L[i][j-1])
-
- # Following code is used to print LCS
- index = L[m][n]
-
- # Create a character array to store the lcs string
- lcs = [""] * (index+1)
- lcs[index] = ""
- lmapping = []
- # Start from the right-most-bottom-most corner and
- # one by one store characters in lcs[]
- i = m
- j = n
- while i > 0 and j > 0:
-
- # If current character in X[] and Y are same, then
- # current character is part of LCS
- if X[i-1] == Y[j-1]:
- lcs[index-1] = X[i-1]
- lmapping.append((i-1,j-1))
- i-=1
- j-=1
- index-=1
-
- # If not same, then find the larger of two and
- # go in the direction of larger value
- elif L[i-1][j] > L[i][j-1]:
- i-=1
- else:
- j-=1
-
- lmapping.reverse()
- xx =[(X[x],Y[y]) for x,y in lmapping]
- return xx
-
+
#--------- LCS code
# Return the length of the longest common string of a and b.
def lcs(a, b):
@@ -104,7 +58,6 @@ def lcs(a, b):
else:
curLcs = max(prevRow[j+1], curRow[j])
curRow[j+1] = curLcs
- print (curRow)
return curRow[na]
def fastlcs(a,b,Dmax=None):
diff --git a/TranskribusDU/util/masking.py b/TranskribusDU/util/masking.py
index a489caa..bca7825 100644
--- a/TranskribusDU/util/masking.py
+++ b/TranskribusDU/util/masking.py
@@ -5,18 +5,7 @@
Copyright Naver Labs Europe(C) 2019 JL. Meunier
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -74,7 +63,7 @@ def applyMask2(lView, lViewMask):
Assumes the input views do not overlap each other
Garanties that the output view do not overlap each other
"""
- for a,b in lView: assert a < b, "invalid view: %s, %s" %(a,b)
+ for a,b in lView: assert a <= b, "invalid view: %s, %s" %(a,b)
ovrl = 0 # total overlap with the masks
# apply each mask in turn
@@ -94,6 +83,8 @@ def applyMask2(lView, lViewMask):
ovrl += (_right - _left)
else:
# keep it as it is
+                # filter out when a == b
+ #if a != b:
lNewView.append( (a,b) )
lView = lNewView
if not lView: break # stop if the view is empty!
diff --git a/TranskribusDU/util/partitionEvaluation.py b/TranskribusDU/util/partitionEvaluation.py
index aabfa62..079f349 100644
--- a/TranskribusDU/util/partitionEvaluation.py
+++ b/TranskribusDU/util/partitionEvaluation.py
@@ -9,18 +9,7 @@
copyright Naver Labs 2018
READ project
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/TranskribusDU/util/statSeparator.py b/TranskribusDU/util/statSeparator.py
new file mode 100644
index 0000000..5d79d6b
--- /dev/null
+++ b/TranskribusDU/util/statSeparator.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+"""
+Currently provides only the computation of a linear separator
+
+H. Déjean, JL Meunier, Copyright Naver Labs Europe 2019
+"""
+
+from sklearn import svm
+
+
+def getLinearSeparator(X, Y):
+ """
+ Linear separator
+
+ return a,b so that the linear separator has the form Y = a X + b
+ """
+
+ #C = 1.0 # SVM regularization parameter
+ # clf = svm.SVC(kernel = 'linear', gamma=0.7, C=C )
+ clf = svm.SVC(kernel = 'linear')
+ clf.fit(X, Y)
+ w = clf.coef_[0]
+ a = -w[0] / w[1]
+ b = - (clf.intercept_[0]) / w[1]
+ return a, b
+
+
+def test_getLinearSeparator():
+ import numpy as np
+
+ lP = [(i, 10) for i in range(10)]
+ lV = [(i, -2) for i in range(10)]
+ X = np.array(lP+lV)
+ Y = np.array([1]*10 + [0]*10)
+
+ a,b = getLinearSeparator(X, Y)
+ assert abs(a) < 0.001
+ assert abs(b-4) < 0.001
+ #print(a,b)
+
+ lP = [(i, 10+i) for i in range(10)]
+ lV = [(i, -2+i) for i in range(10)]
+ X = np.array(lP+lV)
+ Y = np.array([1]*10 + [0]*10)
+
+ a,b = getLinearSeparator(X, Y)
+ assert abs(a-1) < 0.001
+ assert abs(b-4) < 0.001
+ # print(a,b)
+
diff --git a/TranskribusDU/visu/MyFrame.py b/TranskribusDU/visu/MyFrame.py
index 571f78f..9aa785b 100644
--- a/TranskribusDU/visu/MyFrame.py
+++ b/TranskribusDU/visu/MyFrame.py
@@ -9,6 +9,7 @@
from document import Document
from config import Config
+from deco import DecoImage
# begin wxGlade: dependencies
# end wxGlade
@@ -97,10 +98,12 @@ def __init__(self, *args, **kwds):
id_saveas = wx.NewId()
id_close = wx.NewId()
id_quit = wx.NewId()
+ id_imgfolder = wx.NewId()
wxglade_tmp_menu.Append(id_load, "&Load Xml File", "", wx.ITEM_NORMAL)
wxglade_tmp_menu.Append(id_reload, "&Re-load the Xml File", "", wx.ITEM_NORMAL)
wxglade_tmp_menu.Append(id_save, "&Save Xml File", "", wx.ITEM_NORMAL)
wxglade_tmp_menu.Append(id_saveas, "Save &As Xml File", "", wx.ITEM_NORMAL)
+ wxglade_tmp_menu.Append(id_imgfolder, "Select image folder", "", wx.ITEM_NORMAL)
#MARCHE PAS wxglade_tmp_menu.Append(id_reloadini, "&Reload INI File", "", wx.ITEM_NORMAL)
#wxglade_tmp_menu.Append(id_close, "&Close", "", wx.ITEM_NORMAL)
@@ -129,6 +132,7 @@ def __init__(self, *args, **kwds):
self.Bind(wx.EVT_MENU, self.OnMenu_ReloadINI, id=id_reloadini)
self.Bind(wx.EVT_MENU, self.OnMenu_SaveXML, id=id_save)
self.Bind(wx.EVT_MENU, self.OnMenu_SaveAsXML, id=id_saveas)
+ self.Bind(wx.EVT_MENU, self.OnMenu_ImgFolder, id=id_imgfolder)
self.Bind(wx.EVT_MENU, self.OnMenu_Quit, id=id_quit)
self.Bind(wx.EVT_MENU, self.OnMenu_Help, id=id_help)
@@ -330,6 +334,18 @@ def OnMenu_SaveAsXML(self, event):
dlg.Destroy()
if ret: self.bModified = False
+ def OnMenu_ImgFolder(self, event):
+ curdir = os.path.dirname(self.doc.getFilename())
+ if not curdir: curdir = os.getcwd()
+ dlg = wx.DirDialog (None, "Select the image folder", "",
+ wx.DD_DEFAULT_STYLE | wx.DD_DIR_MUST_EXIST)
+ dlg.CenterOnScreen()
+ val = dlg.ShowModal()
+ if val == wx.ID_OK:
+ DecoImage.sImageFolder = dlg.GetPath()
+ dlg.Destroy()
+ self.bModified = True
+ self.display_page()
def OnMenu_Quit(self, event):
if self.bModified:
@@ -433,22 +449,37 @@ def OnToolbar_ChangePage(self, evt):
def OnCanvas_RightMouse(self, obj):
"""Click on a widget in the canvas with the right mouse"""
- menu = wx.Menu()
- # get the id of the corresponding node
- self.n = self.doc.obj_n[obj]
- tree_id = wx.NewId()
- self.Bind(wx.EVT_MENU, self.OnPopup_RightCanvas, id=tree_id)
- menu.Append(tree_id, "XPath lab")
- c = self.wysi.Canvas
- pos = (c.PixelToWorld(obj.XY[0]),
- c.PixelToWorld(c.GetSize()[1]-obj.XY[1]))
- self.PopupMenu(menu, pos)
- menu.Destroy()
+# menu = wx.Menu()
+# # get the id of the corresponding node
+# self.n = self.doc.obj_n[obj]
+# tree_id = wx.NewId()
+# self.Bind(wx.EVT_MENU, self.OnPopup_RightCanvas, id=tree_id)
+# menu.Append(tree_id, "XPath lab")
+# c = self.wysi.Canvas
+# pos = (c.PixelToWorld(obj.XY[0]),
+# c.PixelToWorld(c.GetSize()[1]-obj.XY[1]))
+# self.PopupMenu(menu, pos)
+# menu.Destroy()
+ print("Clicked: ", obj)
+ try:
+ txt = etree.tostring(self.doc.obj_n[obj].getparent())
+ except KeyError:
+ print("No deco associated")
+ return
+ txt = unicode(txt, sEncoding)
+ tip = wx.TipWindow(self, txt, maxLength=1200)
+ wx.FutureCall(30000, tip.Close)
+
def OnCanvas_LeftMouse(self, obj):
- txt = etree.tostring(self.doc.obj_n[obj])
+ print("Clicked: ", obj)
+ try:
+ txt = etree.tostring(self.doc.obj_n[obj])
+ except KeyError:
+ print("No deco associated")
+ return
txt = unicode(txt, sEncoding)
- tip = wx.TipWindow(self, txt)
+ tip = wx.TipWindow(self, txt, maxLength=1200)
wx.FutureCall(30000, tip.Close)
def OnCanvas_LeftMouseDecoAction(self, obj):
@@ -493,10 +524,18 @@ def OnPopup_RightCanvas(self, event):
def cbkDecoCheckBox(self, event): # wxGlade: MyFrame.
"""enable or disbale a decoration type"""
deco = self.dChekBox2Deco[ event.GetEventObject() ]
- deco.setEnabled(event.IsChecked())
+
+ # try to only update the page, when the user adds a decoration, for display speedup
+ # of course, the deco will be on top of all others, even if not at bottom of list of deco
+ b = event.IsChecked()
+ deco.setEnabled(b)
if self.doc:
d = self.doc.displayed
- self.display_page(d)
+ if b:
+ # deco was enabled
+ self.update_page(d, deco)
+ else:
+ self.display_page(d)
def cbkDecoNext(self, event):
"""jump on the next page that has such a decoration"""
@@ -606,7 +645,7 @@ def display_page(self, i=0, (x,y,w,h)=(None, None, None, None)):
LineColor=self.config.page_border_color,
FillColor=self.config.page_background_color,
FillStyle="Solid")
- self.doc.obj_n[page_rect] = self.current_page_node
+ # useless self.doc.obj_n[page_rect] = self.current_page_node
page_rect.Bind(FloatCanvas.EVT_FC_RIGHT_DOWN, self.OnCanvas_RightMouse)
page_rect.Bind(FloatCanvas.EVT_FC_LEFT_DOWN, self.OnCanvas_LeftMouse)
# page_rect.Bind(FloatCanvas.EVT_FC_ENTER_OBJECT, self.OnCanvas_Enter)
@@ -629,14 +668,14 @@ def display_page(self, i=0, (x,y,w,h)=(None, None, None, None)):
#let's bind on the first object of the list
if lo:
- obj = lo[0]
- self.doc.obj_n[obj] = n
- obj.Bind(FloatCanvas.EVT_FC_RIGHT_DOWN, self.OnCanvas_RightMouse)
+ for obj in lo: self.doc.obj_n[obj] = n
+ obj0 = lo[0]
+ obj0.Bind(FloatCanvas.EVT_FC_RIGHT_DOWN, self.OnCanvas_RightMouse)
if deco.isActionable():
- self.doc.obj_deco[obj] = deco
- obj.Bind(FloatCanvas.EVT_FC_LEFT_DOWN, self.OnCanvas_LeftMouseDecoAction)
+ self.doc.obj_deco[obj0] = deco
+ obj0.Bind(FloatCanvas.EVT_FC_LEFT_DOWN, self.OnCanvas_LeftMouseDecoAction)
else:
- obj.Bind(FloatCanvas.EVT_FC_LEFT_DOWN, self.OnCanvas_LeftMouse)
+ obj0.Bind(FloatCanvas.EVT_FC_LEFT_DOWN, self.OnCanvas_LeftMouse)
# obj.Bind(FloatCanvas.EVT_FC_ENTER_OBJECT, self.OnCanvas_Enter)
deco.endPage(self.current_page_node)
@@ -654,3 +693,53 @@ def display_page(self, i=0, (x,y,w,h)=(None, None, None, None)):
c.ZoomToBB()
+ def update_page(self, i, deco):
+ """Update the page in the interface by drawing a single deco (just enabled)
+ """
+ assert deco.isEnabled()
+
+ c = self.wysi.Canvas
+ # self.doc.new_page(i)
+
+ try:
+ self.current_page_node = self.doc.getPageByIndex(i)
+ except IndexError:
+ dlg = wx.MessageDialog(self, message="This XML file has no such page (%dth '%s' element) Try passing another .ini file as application parameter."%(i+1, self.config.page_tag),
+ caption="Error",
+ style=wx.ICON_ERROR)
+ dlg.CenterOnScreen()
+ val = dlg.ShowModal()
+ dlg.Destroy()
+ return
+
+ #Now let's decorate the page according to the configuration
+ ln = self.doc.xpathEval( deco.getMainXPath(), self.current_page_node )
+ deco.beginPage(self.current_page_node)
+
+ for n in ln:
+ #TODO: deal with that!!!
+ inc = 1
+ try:
+ lo = deco.draw(c, n)
+ except:
+ lo = None
+ traceback.print_exc()
+
+ #let's bind on the first object of the list
+ if lo:
+ # bind all objects... (since the click fails from time to time...)
+# obj = lo[0]
+# self.doc.obj_n[obj] = n
+ for obj in lo: self.doc.obj_n[obj] = n
+ obj0 = lo[0]
+ obj0.Bind(FloatCanvas.EVT_FC_RIGHT_DOWN, self.OnCanvas_RightMouse)
+ if deco.isActionable():
+ self.doc.obj_deco[obj0] = deco
+ obj0.Bind(FloatCanvas.EVT_FC_LEFT_DOWN, self.OnCanvas_LeftMouseDecoAction)
+ else:
+ obj0.Bind(FloatCanvas.EVT_FC_LEFT_DOWN, self.OnCanvas_LeftMouse)
+# obj.Bind(FloatCanvas.EVT_FC_ENTER_OBJECT, self.OnCanvas_Enter)
+ deco.endPage(self.current_page_node)
+
+ c.ZoomToBB()
+
diff --git a/TranskribusDU/visu/deco.py b/TranskribusDU/visu/deco.py
index 08c1f1d..8d2bddc 100644
--- a/TranskribusDU/visu/deco.py
+++ b/TranskribusDU/visu/deco.py
@@ -6,6 +6,8 @@
import types, os
from collections import defaultdict
import glob
+import logging
+import random
from lxml import etree
#import cStringIO
import wx
@@ -90,19 +92,49 @@ def setXPathContext(self, xpCtxt):
def xpathError(self, node, xpExpr, eExcpt, sMsg=""):
"""report an xpath error"""
+ try:
+ Deco._s_prev_xpath_error
+ except AttributeError:
+ Deco._s_prev_xpath_error = ""
+ Deco._prev_xpath_error_count = 0
+
iMaxLen = 200 # to truncate the node serialization
- print "-"*60
- print "--- XPath ERROR on class %s"%self.__class__
- print "--- xpath=%s" % xpExpr
- print "--- Python Exception=%s" % str(eExcpt)
- if sMsg: print "--- Info: %s" % sMsg
+ s = "-"*60
+ s += "\n--- XPath ERROR on class %s"%self.__class__
+ s += "\n--- xpath=%s" % xpExpr
+ s += "\n--- Python Exception=%s" % str(eExcpt)
+ if sMsg: s += "\n--- Info: %s" % sMsg
+
+ if s == Deco._s_prev_xpath_error:
+ # let's not overload the console.
+ return
+ Deco._s_prev_xpath_error = s
+
+ Deco._prev_xpath_error_count += 1
+ if Deco._prev_xpath_error_count > 10:
+ return
+
try:
sNode = etree.tostring(node)
except:
sNode = str(node)
if len(sNode) > iMaxLen: sNode = sNode[:iMaxLen] + "..."
- print "--- XML node = %s" % sNode
- print "-"*60
+ s += "\n--- XML node = %s" % sNode
+ s += "\n" + "-"*60 + "\n"
+ logging.warning(s)
+
+ def warning(self, sMsg):
+ """report an xpath error"""
+ try:
+ Deco._s_prev_warning
+ except AttributeError:
+ Deco._s_prev_warning = ""
+ Deco._warning_count = 0
+ # if sMsg != Deco._s_prev_warning and Deco._warning_count < 1000:
+ if sMsg != Deco._s_prev_warning:
+ logging.warning(sMsg)
+ Deco._warning_count += 1
+ Deco._s_prev_warning = sMsg
def toInt(cls, s):
try:
@@ -338,7 +370,7 @@ def getText(self, wxh, node):
try:
return eval('u"\\u%04x"' % int(sEncodedText, self.base))
except ValueError:
- print "DecoUnicodeChar: ERROR: base=%d code=%s"%(self.base, sEncodedText)
+ logging.error("DecoUnicodeChar: ERROR: base=%d code=%s"%(self.base, sEncodedText))
return ""
@@ -370,7 +402,7 @@ def draw(self, wxh, node):
obj = wxh.AddScaledBitmap(img, (x,-y), h)
lo.append(obj)
except Exception, e:
- print "DecoImageBox ERROR: File %s: %s"%(sFilePath, str(e))
+ self.warning("DecoImageBox ERROR: File %s: %s"%(sFilePath, str(e)))
lo.append( DecoRectangle.draw(self, wxh, node) )
return lo
@@ -379,6 +411,8 @@ def draw(self, wxh, node):
class DecoImage(DecoBBXYWH):
"""An image
"""
+ # in case the use wants to specify it via the menu
+ sImageFolder = None
def __init__(self, cfg, sSurname, xpCtxt):
DecoBBXYWH.__init__(self, cfg, sSurname, xpCtxt)
@@ -399,6 +433,20 @@ def draw(self, wxh, node):
x,y,w,h,inc = self.runXYWHI(node)
sFilePath = self.xpathToStr(node, self.xpHRef, "")
if sFilePath:
+ if self.sImageFolder:
+ sCandidate = os.path.join(self.sImageFolder, sFilePath)
+ if os.path.exists(sCandidate):
+ sFilePath = sCandidate
+ else:
+ # maybe the file is in a subfolder ?
+ # e.g. "S_Aicha_an_der_Donau_004-03_0005.jpg" is in folder "S_Aicha_an_der_Donau_004-03"
+ try:
+ sDir = sFilePath[:sFilePath.rindex("_")]
+ sCandidate = os.path.join(self.sImageFolder, sDir, sFilePath)
+ if os.path.exists(sCandidate):
+ sFilePath = sCandidate
+ except ValueError:
+ pass
if not os.path.exists(sFilePath):
#maybe the image is in a folder with same name as XML file? (Transkribus style)
sUrl = node.getroottree().docinfo.URL.decode('utf-8') # py2 ...
@@ -421,7 +469,7 @@ def draw(self, wxh, node):
bKO = False
break
if bKO:
- print "WARNING: deco Image: file does not exists: '%s'"%sFilePath
+ self.warning("WARNING: deco Image: file does not exists: '%s'"%sFilePath)
sFilePath = None
if bool(sFilePath):
img = wx.Image(sFilePath, wx.BITMAP_TYPE_ANY)
@@ -432,7 +480,7 @@ def draw(self, wxh, node):
obj = wxh.AddScaledBitmap(img, (x,-y), img.GetHeight())
lo.append(obj)
except Exception, e:
- print "DecoImage ERROR: File %s: %s"%(sFilePath, str(e))
+ self.warning("DecoImage ERROR: File %s: %s"%(sFilePath, str(e)))
return lo
@@ -537,13 +585,20 @@ def __init__(self, cfg, sSurname, xpCtxt):
def _getCoordList(self, node):
sCoords = self.xpathToStr(node, self.xpCoords, "")
+ if not sCoords:
+ if node.get("id") is None:
+ self.warning("No coordinates: node = %s" % etree.tostring(node))
+ else:
+ self.warning("No coordinates: node id = %s" % node.get("id"))
+ return [(0,0)]
try:
ltXY = []
for _sPair in sCoords.split(' '):
(sx, sy) = _sPair.split(',')
ltXY.append((Deco.toInt(sx), Deco.toInt(sy)))
- except Exception, e:
- print "ERROR: polyline coords are bad: '%s'"%sCoords
+ except Exception as e:
+ logging.error("ERROR: polyline coords are bad: '%s' -> '%s'" % (
+ self.xpCoords, sCoords))
raise e
return ltXY
@@ -584,27 +639,31 @@ def _getFontSize(self, node, ltXY, txt, Family=wx.FONTFAMILY_TELETYPE):
return iFontSize, ExtentX, ExtentY
"""
(x1, y1), (x2, y2) = self._coordList_to_BB(ltXY)
-
- dc = wx.ScreenDC()
- # compute for font size of 24 and do proportional
- dc.SetFont(wx.Font(24, Family, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL))
- Ex, Ey = dc.GetTextExtent("x")
- try:
- iFontSizeX = 24 * abs(x2-x1) / Ex / len(txt)
- except:
- print "absence of text: cannot compute font size along X axis"
- iFontSizeX = 8
- iFontSizeY = 24 * abs(y2-y1) / Ey
sFit = self.xpathToStr(node, self.xpFit, 'xy', bShowError=False)
- if sFit == "x":
- iFontSize = iFontSizeX
- elif sFit == "y":
- iFontSize = iFontSizeY
- else:
- iFontSize = min(iFontSizeX, iFontSizeY)
- dc.SetFont(wx.Font(iFontSize, Family, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL))
- Ex, Ey = dc.GetTextExtent("x")
- del dc
+
+ try:
+ iFontSize = int(sFit)
+ Ex, Ey = None, None
+ except ValueError:
+ dc = wx.ScreenDC()
+ # compute for font size of 24 and do proportional
+ dc.SetFont(wx.Font(24, Family, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL))
+ Ex, Ey = dc.GetTextExtent("x")
+ try:
+ iFontSizeX = 24 * abs(x2-x1) / Ex / len(txt)
+ except:
+ self.warning("absence of text: cannot compute font size along X axis")
+ iFontSizeX = 8
+ iFontSizeY = 24 * abs(y2-y1) / Ey
+ if sFit == "x":
+ iFontSize = iFontSizeX
+ elif sFit == "y":
+ iFontSize = iFontSizeY
+ else:
+ iFontSize = min(iFontSizeX, iFontSizeY)
+ dc.SetFont(wx.Font(iFontSize, Family, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL))
+ Ex, Ey = dc.GetTextExtent("x")
+ del dc
return iFontSize, Ex, Ey
@@ -624,7 +683,9 @@ def draw(self, wxh, node):
iFontSize, Ex, Ey = self._getFontSize(node, ltXY, txt, Family=wx.FONTFAMILY_TELETYPE)
- x, y = ltXY[0]
+ # x, y = ltXY[0]
+ (x, _y1), (_x2, y) = self._coordList_to_BB(ltXY)
+
obj = wxh.AddScaledText(txt, (x, -y+iFontSize/6), Size=iFontSize
, Family=wx.FONTFAMILY_TELETYPE
, Position='tl'
@@ -845,7 +906,141 @@ def draw(self, wxh, node):
lo.append(obj)
return lo
-
+
+class DecoClusterCircle(DecoREAD):
+ """
+ [Cluster]
+ type=DecoClusterCircle
+ xpath=.//Cluster
+ xpath_content=@content
+ xpath_radius=40
+ xpath_item_lxy=./pg:Coords/@points
+ xpath_LineWidth="1"
+ xpath_FillStyle="Transparent"
+ LineColors="BLUE SIENNA YELLOW ORANGE RED GREEN"
+ FillColors="BLUE SIENNA YELLOW ORANGE RED GREEN"
+ enabled=1
+ """
+ count = 0
+ def __init__(self, cfg, sSurname, xpCtxt):
+ DecoREAD.__init__(self, cfg, sSurname, xpCtxt)
+ self.xpCluster = cfg.get(sSurname, "xpath")
+ self.xpContent = cfg.get(sSurname, "xpath_content")
+ self.xpRadius = cfg.get(sSurname, "xpath_radius")
+ self.xpLineWidth = cfg.get(sSurname, "xpath_LineWidth")
+ self.xpFillStyle = cfg.get(sSurname, "xpath_FillStyle")
+ self.lsLineColor = cfg.get(sSurname, "LineColors").split()
+ self.lsFillColor = cfg.get(sSurname, "FillColors").split()
+ #cached values
+ self._node = None
+ self._laxyr = None
+
+ print "DecoClusterCircle lsLineColor = ", self.lsLineColor
+ print "DecoClusterCircle lsFillColor = ", self.lsFillColor
+
+ def __str__(self):
+ s = "%s="%self.__class__
+ s += "+(coords=%s)" % (self.xpCoords)
+ return s
+
+ def getArea_and_CenterOfMass(self, lXY):
+ """
+ https://fr.wikipedia.org/wiki/Aire_et_centre_de_masse_d'un_polygone
+
+ return A, (Xg, Yg) which are the area and the coordinates (float) of the center of mass of the polygon
+ """
+ if len(lXY) < 2: raise ValueError("Only one point: polygon area is undefined.")
+
+ fA = 0.0
+ xSum, ySum = 0, 0
+
+
+ xprev, yprev = lXY[-1]
+ for x, y in lXY:
+ iTerm = xprev*y - yprev*x
+ fA += iTerm
+ xSum += iTerm * (xprev+x)
+ ySum += iTerm * (yprev+y)
+ xprev, yprev = x, y
+ if fA == 0.0: raise ValueError("surface == 0.0")
+ fA = fA / 2
+ xg, yg = xSum/6/fA, ySum/6/fA
+
+ if fA <0:
+ return -fA, (xg, yg)
+ else:
+ return fA, (xg, yg)
+        assert fA >0 and xg >0 and yg >0, "%s\t%s"%(lXY, (fA, (xg, yg)))
+ return fA, (xg, yg)
+
+ def draw(self, wxh, node):
+ """draw itself using the wx handle
+ return a list of created WX objects"""
+
+ DecoClusterCircle.count = DecoClusterCircle.count + 1
+
+ lo = DecoREAD.draw(self, wxh, node)
+ if self._node != node:
+ self._laxyr = []
+ #need to go thru each item
+ ndPage = node.xpath("ancestor::*[local-name()='Page']")[0]
+ sIds = self.xpathEval(node, self.xpContent)[0]
+ for sId in sIds.split():
+ l = self.xpathEval(ndPage, './/*[@id="%s"]'%sId)
+ ndItem = l[0]
+ lxy = self._getCoordList(ndItem)
+ fA, (xg, yg) = self.getArea_and_CenterOfMass(lxy)
+ r = self.xpathToInt(ndItem, self.xpRadius, 1)
+ self._laxyr.append( (fA, xg, yg, r) )
+ self._node = node
+
+ if self._laxyr:
+ iMaxFC = len(self.lsFillColor)
+ iMaxLC = len(self.lsLineColor)
+ if False:
+ Nf = DecoClusterCircle.count
+ Nl = Nf
+ else:
+ Nf = random.randrange(iMaxFC)
+            Nl = random.randrange(iMaxLC) if iMaxLC else 0
+
+ iLineWidth = self.xpathToInt(node, self.xpLineWidth, 1)
+ sFillStyle = self.xpathToStr(node, self.xpFillStyle, "Solid")
+ for (_a, x, y, r) in self._laxyr:
+ #draw a circle
+ sFillColor = self.lsFillColor[Nf % iMaxFC]
+ if self.lsLineColor:
+ sLineColor = self.lsLineColor[Nl % iMaxLC]
+ else:
+ sLineColor = sFillColor
+ obj = wxh.AddCircle((x, -y), r,
+ LineWidth=iLineWidth,
+ LineColor=sLineColor,
+ FillColor=sFillColor,
+ FillStyle=sFillStyle)
+# obj = wxh.AddRectangle((x, -y), (20, 20),
+# LineWidth=iLineWidth,
+# LineColor=sLineColor,
+# FillColor=sFillColor,
+# FillStyle=sFillStyle)
+
+ lo.append(obj)
+
+ """
+ lo = DecoBBXYWH.draw(self, wxh, node)
+ x,y,w,h,inc = self.runXYWHI(node)
+ sLineColor = self.xpathToStr(node, self.xpLineColor, "#000000")
+ iLineWidth = self.xpathToInt(node, self.xpLineWidth, 1)
+ sFillColor = self.xpathToStr(node, self.xpFillColor, "#000000")
+ sFillStyle = self.xpathToStr(node, self.xpFillStyle, "Solid")
+ obj = wxh.AddRectangle((x, -y), (w, -h),
+ LineWidth=iLineWidth,
+ LineColor=sLineColor,
+ FillColor=sFillColor,
+ FillStyle=sFillStyle)
+ """
+ return lo
+
class DecoLink(Deco):
"""A link from x1,y1 to x2,y2
"""
diff --git a/TranskribusDU/visu/mpxml_viewer.bat b/TranskribusDU/visu/mpxml_viewer.bat
index c431345..5846fb7 100644
--- a/TranskribusDU/visu/mpxml_viewer.bat
+++ b/TranskribusDU/visu/mpxml_viewer.bat
@@ -1,2 +1,8 @@
-C:\python27\python.exe %0.py %0.ini %1
-rem set /p temp="Hit enter to continue"
\ No newline at end of file
+rem --- install python 2.7
+rem --- install wxpython version 2.9
+rem --- > pip install lxml numpy
+rem --- to use: > C:\python27\python.exe mpxml_viewer.bat.py mpxml_viewer.bat.ini
+
+C:\Python27\python.exe %0.py %0.ini %1
+
+rem set /p temp="Hit enter to continue"
diff --git a/TranskribusDU/visu/mpxml_viewer.bat.ini b/TranskribusDU/visu/mpxml_viewer.bat.ini
index ab27a82..592a2bd 100644
--- a/TranskribusDU/visu/mpxml_viewer.bat.ini
+++ b/TranskribusDU/visu/mpxml_viewer.bat.ini
@@ -16,9 +16,9 @@
decos=Image sprtr TextRegionRectangle TextLineRectangle Baseline
TextLine_Unicode READ_Unicode READ_x_Unicode
- sprtr MENU_section MENU_section_heading MENU_item MENU_other
- sprtr MENU_Item_name MENU_Item_description MENU_Item_price MENU_Item_quantity MENU_Item_number
- sprtr MENU_Rest_name MENU_Rest_address MENU_Rest_phone_number MENU_Rest_url MENU_Rest_hours
+# sprtr MENU_section MENU_section_heading MENU_item MENU_other
+# sprtr MENU_Item_name MENU_Item_description MENU_Item_price MENU_Item_quantity MENU_Item_number
+# sprtr MENU_Rest_name MENU_Rest_address MENU_Rest_phone_number MENU_Rest_url MENU_Rest_hours
# sprtr Word_Unicode Word_Plain TextLine_Unicode TextLine_Plain TextRegion_Unicode TextRegion_Plain
# sprtr type:heading type:page-number type:marginalia type:header type:catch-word type:UNKNOWN
@@ -28,7 +28,8 @@ decos=Image sprtr TextRegionRectangle TextLineRectangle Baseline
row:number_as_text
col:number_as_text
sprtr TableRectangle CellRectangle PredictedCellRectangle
- sprtr Separator
+ sprtr cut Separator Separator_rows
+# Separator_S Separator_I
# sprtr Bsln:S Bsln:I Bsln:O
# sprtr Grid Grid+
# sprtr cut cut:S cut:B cut:I cut:E cut:other
@@ -37,9 +38,13 @@ decos=Image sprtr TextRegionRectangle TextLineRectangle Baseline
# sprtr TableRectangle CellRectangle
# sprtr Separator
#decos=Image ImageRectangle sprtr TextRegionRectangle TextLineRectangle RegionText LineText
- sprtr Edge EdgeCon Cluster
+ sprtr Edge EdgeCon ClusterEdge ClusterEdge_H ClusterEdge_V
+ sprtr Cluster ClusterColor Cluster_cut ClusterColor_cut Cluster_agglo ClusterColor_agglo Cluster_I ClusterColor_I
+# Cluster_edge ClusterColor_edge
# HorizontalEdge VerticalEdge
# Edge_BL Edge_LL
+ dbgTableRow dbgTableCol
+ dbgTableRow_agglo dbgTableCol_agglo
#------------------
# Where the files are situated by default
@@ -166,7 +171,7 @@ xpath_LineColor="SIENNA"
xpath_FillStyle="Transparent"
xpath_incr="0"
enabled=0
-xpath_LineWidth=1
+xpath_LineWidth=2
[Baseline]
type=DecoPolyLine
@@ -287,7 +292,7 @@ xpath_lxy=./pg:Baseline/@points
xpath_content=./pg:TextEquiv/pg:Unicode
xpath_font_color="BLUE"
xpath_LineColor="RED"
-enabled=1
+enabled=0
[MENU_Item_description]
xpath_label="Item-description"
@@ -299,7 +304,7 @@ xpath_content=./pg:TextEquiv/pg:Unicode
xpath_font_color="BLUE"
xpath_LineColor="GREEN"
xpath_incr="0"
-enabled=1
+enabled=0
[MENU_Item_price]
xpath_label="Item-price"
@@ -311,7 +316,7 @@ xpath_content=./pg:TextEquiv/pg:Unicode
xpath_font_color="BLUE"
xpath_LineColor="BLUE"
xpath_incr="0"
-enabled=1
+enabled=0
[MENU_Item_quantity]
xpath_label="Item-quantity"
@@ -323,7 +328,7 @@ xpath_content=./pg:TextEquiv/pg:Unicode
xpath_font_color="BLUE"
xpath_LineColor="VIOLET"
xpath_incr="0"
-enabled=1
+enabled=0
[MENU_Item_number]
xpath_label="Item-number"
@@ -335,7 +340,7 @@ xpath_content=./pg:TextEquiv/pg:Unicode
xpath_font_color="BLUE"
xpath_LineColor="BLACK"
xpath_incr="0"
-enabled=1
+enabled=0
# - - - - - - - - - - -
[MENU_Rest_name]
@@ -349,7 +354,7 @@ xpath_font_color="BLUE"
xpath_LineColor="RED"
xpath_background_color="LIGHT GREY"
xpath_incr="0"
-enabled=1
+enabled=0
[MENU_Rest_address]
xpath_label="Restaurant-address"
@@ -362,7 +367,7 @@ xpath_font_color="BLUE"
xpath_LineColor="GREEN"
xpath_background_color="LIGHT GREY"
xpath_incr="0"
-enabled=1
+enabled=0
[MENU_Rest_phone_number]
xpath_label="Restaurant-phone-number"
@@ -375,7 +380,7 @@ xpath_font_color="BLUE"
xpath_LineColor="BLUE"
xpath_background_color="LIGHT GREY"
xpath_incr="0"
-enabled=1
+enabled=0
[MENU_Rest_hours]
xpath_label="Restaurant-hours"
@@ -388,7 +393,7 @@ xpath_font_color="BLUE"
xpath_LineColor="VIOLET"
xpath_background_color="LIGHT GREY"
xpath_incr="0"
-enabled=1
+enabled=0
[MENU_Rest_url]
xpath_label="Restaurant-url"
@@ -401,7 +406,7 @@ xpath_font_color="BLUE"
xpath_LineColor="BLACK"
xpath_background_color="LIGHT GREY"
xpath_incr="0"
-enabled=1
+enabled=0
# -----------------------------------------------------------------------------
@@ -417,7 +422,7 @@ xpath_w="0"
xpath_h="0"
xpath_font_color="BLUE"
xpath_incr="0"
-enabled=1
+enabled=0
# Here we try to separate the row from the col for a better display
[row:number_as_text]
@@ -431,7 +436,7 @@ xpath_w="0"
xpath_h="0"
xpath_font_color="BLUE"
xpath_incr="0"
-enabled=1
+enabled=0
[col:number_as_text]
type=DecoText
@@ -444,7 +449,7 @@ xpath_w="0"
xpath_h="0"
xpath_font_color="BLUE"
xpath_incr="0"
-enabled=1
+enabled=0
# [row_col:number_as_text]
# type=DecoREADTextLine
@@ -454,7 +459,7 @@ enabled=1
# xpath_lxy=./pg:Coords/@points
# xpath_font_color="BLUE"
# xpath_incr="0"
-# enabled=1
+# enabled=0
# -----------------------------------------------------------------------------
@@ -502,6 +507,33 @@ xpath_FillStyle="Transparent"
xpath_incr="0"
enabled=0
+[Separator_S]
+type=DecoPolyLine
+xpath=.//pg:SeparatorRegion[@DU_Sep="S"]
+xpath_lxy=./pg:Coords/@points
+xpath_LineColor="BLUE"
+xpath_FillStyle="Transparent"
+xpath_incr="0"
+enabled=0
+
+[Separator_I]
+type=DecoPolyLine
+xpath=.//pg:SeparatorRegion[@DU_Sep="I"]
+xpath_lxy=./pg:Coords/@points
+xpath_LineColor="RED"
+xpath_FillStyle="Transparent"
+xpath_incr="0"
+enabled=0
+
+[Separator_rows]
+type=DecoPolyLine
+xpath=.//pg:SeparatorRegion[@algo]
+xpath_lxy=./pg:Coords/@points
+xpath_LineColor="RED"
+xpath_FillStyle="Transparent"
+xpath_incr="0"
+enabled=0
+
#--------------------------------------------------------------------
[row:S]
type=DecoClosedPolyLine
@@ -510,7 +542,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="#FFFF00"
xpath_LineWidth=2
xpath_incr="-2"
-enabled=1
+enabled=0
[row:B]
type=DecoClosedPolyLine
@@ -519,7 +551,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="#FF0000"
xpath_LineWidth=2
xpath_incr="-2"
-enabled=1
+enabled=0
[row:I]
type=DecoClosedPolyLine
@@ -528,7 +560,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="#99ff33"
xpath_LineWidth=2
xpath_incr="-2"
-enabled=1
+enabled=0
[row:E]
type=DecoClosedPolyLine
@@ -537,7 +569,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="LIGHT BLUE"
xpath_LineWidth=2
xpath_incr="-2"
-enabled=1
+enabled=0
[row:T]
type=DecoClosedPolyLine
@@ -546,7 +578,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="#0000FF"
xpath_LineWidth=2
xpath_incr="-2"
-enabled=1
+enabled=0
[row:M]
type=DecoClosedPolyLine
@@ -555,7 +587,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="#FFFFFF"
xpath_LineWidth=2
xpath_incr="-2"
-enabled=1
+enabled=0
[col:S]
@@ -601,7 +633,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="#ffcc00"
xpath_LineWidth=3
xpath_incr="0"
-enabled=1
+enabled=0
[header:D]
type=DecoClosedPolyLine
@@ -621,7 +653,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="purple"
xpath_LineWidth=2
xpath_incr="2"
-enabled=1
+enabled=0
[sep:o]
type=DecoClosedPolyLine
@@ -630,7 +662,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="LIGHT GREY"
xpath_LineWidth=2
xpath_incr="2"
-enabled=1
+enabled=0
[type:heading]
type=DecoClosedPolyLine
@@ -639,7 +671,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="#FFFF00"
xpath_LineWidth=2
xpath_incr="-2"
-enabled=1
+enabled=0
[type:page-number]
type=DecoClosedPolyLine
@@ -648,7 +680,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="#ffcc00"
xpath_LineWidth=4
xpath_incr="-2"
-enabled=1
+enabled=0
[type:marginalia]
type=DecoClosedPolyLine
@@ -657,7 +689,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="#99ff33"
xpath_LineWidth=2
xpath_incr="-2"
-enabled=1
+enabled=0
[type:header]
type=DecoClosedPolyLine
@@ -666,7 +698,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="#ffcc99"
xpath_LineWidth=4
xpath_incr="-2"
-enabled=1
+enabled=0
[type:catch-word]
type=DecoClosedPolyLine
@@ -675,7 +707,7 @@ xpath_lxy=./pg:Coords/@points
xpath_LineColor="#ff99cc"
xpath_LineWidth=2
xpath_incr="-2"
-enabled=1
+enabled=0
[type:UNKNOWN]
type=DecoClosedPolyLine
@@ -787,7 +819,7 @@ xpath=.//pg:Edge
xpath_lxy=./@points
xpath_LineColor="PINK"
xpath_FillStyle="Transparent"
-xpath_LineWidth=1
+xpath_LineWidth=2
xpath_incr="0"
enabled=0
@@ -797,10 +829,11 @@ xpath=.//pg:Edge[@label="continue"]
xpath_lxy=./@points
xpath_LineColor="PURPLE"
xpath_FillStyle="Transparent"
-xpath_LineWidth=1
+xpath_LineWidth=2
xpath_incr="0"
enabled=0
+# -----------------------------------------------------------
[Cluster]
type=DecoClosedPolyLine
xpath=.//pg:Cluster
@@ -809,8 +842,138 @@ xpath_LineColor="Orange"
xpath_FillStyle="Transparent"
xpath_LineWidth=2
xpath_incr="0"
-enabled=1
+enabled=0
+
+[ClusterColor]
+type=DecoClusterCircle
+xpath=.//pg:Cluster
+xpath_content=@content
+xpath_radius=100
+xpath_LineWidth="2"
+xpath_FillStyle="Solid"
+# REM no line color list => same line and fill color
+# REM LineColors=""
+LineColors=BLUE RED PINK TURQUOISE ORANGE PURPLE YELLOW FIREBRICK GREEN MAROON
+FillColors=BLUE RED PINK TURQUOISE ORANGE PURPLE YELLOW FIREBRICK GREEN MAROON
+enabled=0
+
+[ClusterEdge]
+type=DecoPolyLine
+xpath=.//pg:ClusterEdge
+xpath_lxy=./@points
+xpath_LineColor="VIOLET"
+xpath_FillStyle="Transparent"
+xpath_LineWidth=2
+xpath_incr="0"
+enabled=0
+[ClusterEdge_H]
+type=DecoPolyLine
+xpath=.//pg:ClusterEdge[@type="HorizontalEdge"]
+xpath_lxy=./@points
+xpath_LineColor="VIOLET"
+xpath_FillStyle="Transparent"
+xpath_LineWidth=2
+xpath_incr="0"
+enabled=0
+[ClusterEdge_V]
+type=DecoPolyLine
+xpath=.//pg:ClusterEdge[@type="VerticalEdge"]
+xpath_lxy=./@points
+xpath_LineColor="VIOLET"
+xpath_FillStyle="Transparent"
+xpath_LineWidth=2
+xpath_incr="0"
+enabled=0
+
+[Cluster_cut]
+type=DecoClosedPolyLine
+xpath=.//pg:Cluster[@algo="cut"]
+xpath_lxy=./pg:Coords/@points
+xpath_LineColor="RED"
+xpath_FillStyle="Transparent"
+xpath_LineWidth=2
+xpath_incr="0"
+enabled=0
+
+[ClusterColor_cut]
+type=DecoClusterCircle
+xpath=.//pg:Cluster[@algo="cut"]
+xpath_content=@content
+xpath_radius=120
+xpath_LineWidth="2"
+xpath_FillStyle="Solid"
+LineColors=BLUE RED PINK TURQUOISE ORANGE PURPLE YELLOW FIREBRICK GREEN MAROON
+FillColors=BLUE RED PINK TURQUOISE ORANGE PURPLE YELLOW FIREBRICK GREEN MAROON
+enabled=0
+
+
+[Cluster_edge]
+type=DecoClosedPolyLine
+xpath=.//pg:Cluster[@algo!="cut"]
+xpath_lxy=./pg:Coords/@points
+xpath_LineColor="GREEN"
+xpath_FillStyle="Transparent"
+xpath_LineWidth=2
+xpath_incr="0"
+enabled=0
+
+[ClusterColor_edge]
+type=DecoClusterCircle
+xpath=.//pg:Cluster[@algo!="cut"]
+xpath_content=@content
+xpath_radius=70
+xpath_LineWidth="2"
+xpath_FillStyle="Solid"
+LineColors=BLUE RED PINK TURQUOISE ORANGE PURPLE YELLOW FIREBRICK GREEN MAROON
+FillColors=BLUE RED PINK TURQUOISE ORANGE PURPLE YELLOW FIREBRICK GREEN MAROON
+enabled=0
+
+
+[Cluster_I]
+type=DecoClosedPolyLine
+xpath=.//pg:Cluster[contains(@algo,"_I_")]
+xpath_lxy=./pg:Coords/@points
+xpath_LineColor="#2E8B57"
+xpath_FillStyle="Transparent"
+xpath_LineWidth=2
+xpath_incr="0"
+enabled=0
+
+[ClusterColor_I]
+type=DecoClusterCircle
+xpath=.//pg:Cluster[contains(@algo, "_I_")]
+xpath_content=@content
+xpath_radius=70
+xpath_LineWidth="2"
+xpath_FillStyle="Solid"
+LineColors=BLUE RED PINK TURQUOISE ORANGE PURPLE YELLOW FIREBRICK GREEN MAROON
+# no line color list => same line and fill color
+FillColors=BLUE RED PINK TURQUOISE ORANGE PURPLE YELLOW FIREBRICK GREEN MAROON
+enabled=0
+
+[Cluster_agglo]
+type=DecoClosedPolyLine
+xpath=.//pg:Cluster[@algo="agglo"]
+xpath_lxy=./pg:Coords/@points
+xpath_LineColor="#808000"
+xpath_FillStyle="Transparent"
+xpath_LineWidth=2
+xpath_incr="0"
+enabled=0
+
+[ClusterColor_agglo]
+type=DecoClusterCircle
+xpath=.//pg:Cluster[@algo="agglo"]
+xpath_content=@content
+xpath_radius=80
+xpath_LineWidth="3"
+xpath_FillStyle="Solid"
+LineColors=BLUE RED PINK TURQUOISE ORANGE PURPLE YELLOW FIREBRICK GREEN MAROON
+FillColors=BLUE RED PINK TURQUOISE ORANGE PURPLE YELLOW FIREBRICK GREEN MAROON
+enabled=0
+
+# -------------------------------------------------------------
[HorizontalEdge]
type=DecoPolyLine
xpath=.//pg:Edge[@DU_type="HorizontalEdge"]
@@ -860,7 +1023,7 @@ xpath_LineColor="#DCDCDC"
xpath_FillStyle="Transparent"
xpath_LineWidth=2
xpath_incr="-6"
-enabled=1
+enabled=0
[cut:S]
type=DecoClosedPolyLine
@@ -910,4 +1073,51 @@ xpath_LineColor="purple"
xpath_FillStyle="Transparent"
xpath_LineWidth=2
xpath_incr="-6"
-enabled=0
\ No newline at end of file
+enabled=0
+
+
+
+[dbgTableRow]
+type=DecoREADTextLine
+#xpath=.//pg:Cluster
+xpath=.//pg:Cluster[@algo="(cut_I_agglo)"]
+# xpath_fit_text_size indicate how to fit the text to the polygon. It is one of: x y xy
+xpath_fit_text_size=36
+xpath_lxy=./pg:Coords/@points
+xpath_content=concat(@row, " (", @rowSpan, ")")
+xpath_font_color="BLUE"
+xpath_incr="0"
+enabled=0
+
+[dbgTableCol]
+type=DecoREADTextLine
+# xpath=.//pg:Cluster
+xpath=.//pg:Cluster[@algo="(cut_I_agglo)"]
+# xpath_fit_text_size indicate how to fit the text to the polygon. It is one of: x y xy
+xpath_fit_text_size=36
+xpath_lxy=./pg:Coords/@points
+xpath_content=concat(@col, " (", @colSpan, ")")
+xpath_font_color="RED"
+xpath_incr="0"
+enabled=0
+
+[dbgTableRow_agglo]
+type=DecoREADTextLine
+xpath=.//pg:Cluster[@algo="agglo"]
+# xpath_fit_text_size indicate how to fit the text to the polygon. It is one of: x y xy
+xpath_fit_text_size=36
+xpath_lxy=./pg:Coords/@points
+xpath_content=concat(@row, " (", @rowSpan, ")")
+xpath_font_color="BLUE"
+xpath_incr="0"
+enabled=1
+
+[dbgTableCol_agglo]
+type=DecoREADTextLine
+xpath=.//pg:Cluster[@algo="cut"]
+# xpath_fit_text_size indicate how to fit the text to the polygon. It is one of: x y xy
+xpath_fit_text_size=36
+xpath_lxy=./pg:Coords/@points
+xpath_content=concat(@col, " (", @colSpan, ")")
+xpath_font_color="RED"
+xpath_incr="0"
diff --git a/TranskribusDU/xml_formats/DS2PageXml.py b/TranskribusDU/xml_formats/DS2PageXml.py
index f47dfb3..0f16b95 100644
--- a/TranskribusDU/xml_formats/DS2PageXml.py
+++ b/TranskribusDU/xml_formats/DS2PageXml.py
@@ -107,7 +107,8 @@ def DSPoint2PagePoints(self,sPoints):
451,246 451,1094 781,1094 781,246
"""
- lPoints = [x for xx in sPoints.split(' ') for x in xx.split(',')]
+ #print (sPoints)
+        lPoints = [x for xx in sPoints.split(' ') for x in xx.split(',')]
lx= list(map(lambda x:1.0*float(x)*self.dpi/72.0, lPoints))
# order left right
xx = list(zip(lx[0::2], lx[1::2]))
@@ -163,7 +164,7 @@ def convertDSObject(self,DSObject,pageXmlParentNode):
if DSObject.hasAttribute('points'):
coordsNode.set('points',self.DSPoint2PagePoints(DSObject.getAttribute('points')))
else:
- coordsNode.set('points', self.BB2Polylines(DSObject.getX(),DSObject.getY(), DSObject.getHeight(),DSObject.getWidth()))
+ coordsNode.set('points', self.BB2Polylines(DSObject.getX(),DSObject.getY(), DSObject.getHeight(),DSObject.getWidth()))
domNode.append(coordsNode)
for attr in ['custom', 'structure','col','type','DU_row','DU_header','DU_col']:
@@ -288,7 +289,7 @@ def run(self,domDoc):
conversion
"""
ODoc =XMLDSDocument()
-# ODoc.lastPage=1
+ # ODoc.lastPage=1
ODoc.loadFromDom(domDoc)
lPageXmlDoc=[]
lPages= ODoc.getPages()
diff --git a/TranskribusDU/xml_formats/Page2DS.py b/TranskribusDU/xml_formats/Page2DS.py
index 2a7c6ab..e85f93e 100644
--- a/TranskribusDU/xml_formats/Page2DS.py
+++ b/TranskribusDU/xml_formats/Page2DS.py
@@ -104,10 +104,10 @@ def regionBoundingBox(self,sList):
lList = sList.split(' ')
for x,y in [x.split(',') for x in lList]:
- minx = min(minx,float(x))
- maxx = max(maxx,float(x))
- miny = min(miny,float(y))
- maxy = max(maxy,float(y))
+ minx = min(minx,int(x))
+ maxx = max(maxx,int(x))
+ miny = min(miny,int(y))
+ maxy = max(maxy,int(y))
return [minx,miny,maxy-miny,maxx-minx]
def regionBoundingBox2010(self,lList):
@@ -350,24 +350,7 @@ def getTable(self,tableNode):
return dstable
- def copyEdge(self,child):
- """
-
- """
- node = etree.Element('EDGE')
- node.set('src',child.get('src'))
- node.set('tgt',child.get('tgt'))
- node.set('type',child.get('type'))
- node.set('w',child.get('proba'))
- node.set('label',child.get('label'))
- lPoints = child.get('points')
- lP = lPoints.split(' ')
- if lP != []:
- scaledP= [ list(map(lambda x: 72.0* float(x) / self.dpi , xy.split(','))) for xy in lP]
- scaledP = " ".join([ "%.2f,%.2f"% (x,y) for (x,y) in scaledP])
- node.set('points',scaledP)
- return node
-
+
def createRegion(self,pnode):
"""
create REGION
@@ -498,8 +481,6 @@ def convert2DS(self,mprimedoc,sDocID):
imageHeight = 72 * (float(ipage.get("imageHeight")) / self.dpi)
page.set("width",str(imageWidth))
page.set("height",str(imageHeight))
- page.set("imageWidth",str(imageWidth))
- page.set("imageHeight",str(imageHeight))
self.convertPage(ipage, page)
self.addTagProcessToMetadata(dsdom)
@@ -539,8 +520,6 @@ def run(self):
imageHeight = 72 * (float(ipage.get("imageHeight")) / self.dpi)
page.set("width",str(imageWidth))
page.set("height",str(imageHeight))
- page.set("imageWidth",str(imageWidth))
- page.set("imageHeight",str(imageHeight))
imgNode = etree.Element("IMAGE")
imgNode.set("href",ipage.get("imageFilename"))
imgNode.set("x","0")
@@ -549,8 +528,8 @@ def run(self):
imgNode.set("width",str(imageWidth))
page.append(imgNode)
self.convertPage(ipage, page)
-# except StopIteration, e:
-# traceln("=== done.")
+
+
self.addTagProcessToMetadata(dsdom)
return dsdom
diff --git a/TranskribusDU/xml_formats/PageXml.py b/TranskribusDU/xml_formats/PageXml.py
index a806db8..ca9d115 100644
--- a/TranskribusDU/xml_formats/PageXml.py
+++ b/TranskribusDU/xml_formats/PageXml.py
@@ -9,6 +9,10 @@
@author: meunier
'''
+
+
+
+
import os
import datetime
from copy import deepcopy
@@ -128,18 +132,25 @@ def setMetadata(cls, doc, domNd, Creator, Comments=None):
return the Metadata DOM node
"""
ndMetadata, ndCreator, _ndCreated, ndLastChange, ndComments = cls._getMetadataNodes(doc, domNd)
- ndCreator.text = Creator
+ if bool(Creator):
+ if ndCreator.text:
+ ndCreator.text = ndCreator.text + "\n" + Creator
+ else:
+ ndCreator.text = Creator
#The schema seems to call for GMT date&time (IMU)
#ISO 8601 says: "If the time is in UTC, add a Z directly after the time without a space. Z is the zone designator for the zero UTC offset."
#Python seems to break the standard unless one specifies properly a timezone by sub-classing tzinfo. But too complex stuff
#So, I simply add a 'Z'
ndLastChange.text = datetime.datetime.utcnow().isoformat()+"Z"
- if Comments != None:
+ if bool(Comments):
## if not ndComments: #we need to add one!
## FutureWarning: The behavior of this method will change in future versions. Use specific 'len(elem)' or 'elem is not None' test instead.
if ndComments is None : #we need to add one!
ndComments = etree.SubElement(ndMetadata, cls.sCOMMENTS_ELT)
- ndComments.text = Comments
+ if bool(ndComments.text):
+ ndComments.text = ndComments.text + "\n" + Comments
+ else:
+ ndComments.text = Comments
return ndMetadata
setMetadata = classmethod(setMetadata)
@@ -346,7 +357,12 @@ def _getMetadataNodes(cls, doc=None, domNd=None):
nd4 = nd3.getnext()
if nd4 is not None:
- if etree.QName(nd4.tag).localname not in [cls.sCOMMENTS_ELT,cls.sTranskribusMetadata_ELT] : raise ValueError("PageXMl mal-formed Metadata: LastChange element must be 3rd element")
+ if etree.QName(nd4.tag).localname not in [cls.sCOMMENTS_ELT,cls.sTranskribusMetadata_ELT] : raise ValueError("PageXml mal-formed Metadata: expected a Transkribus metadata or some comment as 4th element")
+ if etree.QName(nd4.tag).localname == cls.sTranskribusMetadata_ELT:
+ nd4 = nd4.getnext()
+ if nd4 is not None:
+ if etree.QName(nd4.tag).localname != cls.sCOMMENTS_ELT : raise ValueError("PageXMl mal-formed Metadata: expected a comment element")
+
return domNd, nd1, nd2, nd3, nd4
_getMetadataNodes = classmethod(_getMetadataNodes)
@@ -651,29 +667,44 @@ def _iter_splitMultiPageXml(cls, doc, bInPlace=True):
#to jump to the PAGE sibling node (we do it now, defore possibly unlink...)
node = metadataNd.getnext()
+ xmlPAGERoot.append(metadataNd)
+# node = metadataNd.getnext()
+ xmlPAGERoot.append(node)
+
+ """
+ Hervé 28/05/2019: I comment since I don't understand
+ """
# #Add a copy of the METADATA node and sub-tree
- if bInPlace:
- metadataNd.getparent().remove(metadataNd)
- xmlPAGERoot.append(metadataNd)
- else:
- newMetadataNd=deepcopy(metadataNd)
- xmlPAGERoot.append(newMetadataNd)
+# if bInPlace:
+# # metadataNd.unlinkNode()
+# metadataNd.getparent().remove(metadataNd)
+# newRootNd.append(metadataNd)
+# else:
+# # newMetadataNd = metadataNd.copyNode(1)
+# newMetadataNd=deepcopy(metadataNd)
+# metadataNd.getparent().remove(metadataNd)
+# newRootNd.append(newMetadataNd)
# #jump to the PAGE sibling node
# node = metadataNd.next
-
+
while node is not None:
# if node.type == "element": break
# node = node.next
if node.tag != etree.Comment: break
node = node.getnext()
if etree.QName(node.tag).localname != "Page": raise ValueError("Input multi-page PageXml for page %d should have a PAGE node after the METADATA node."%pnum)
+
#Add a copy of the PAGE node and sub-tree
if bInPlace:
- xmlPAGERoot.append(node)
+# node.unlinkNode()
+# newNode = newRootNd.addChild(node)
+ newRootNd.append(node)
newNode= node
else:
+# newPageNd = node.copyNode(1)
+# newNode = newRootNd.addChild(newPageNd)
newNode = deepcopy(node)
newRootNd.append(newNode)
#Remove the prefix on the "id" attributes
@@ -691,10 +722,8 @@ def _iter_splitMultiPageXml(cls, doc, bInPlace=True):
# ctxt.xpathFreeContext()
# for doc in lDocToBeFreed: doc.freeDoc()
- raise StopIteration
+ return
_iter_splitMultiPageXml = classmethod(_iter_splitMultiPageXml)
-
-
# --- Metadata of PageXml --------------------------------
class Metadata:
@@ -785,4 +814,4 @@ def __init__(self, Creator, Created, LastChange, Comments=None):
print ("\t done: %s"%filename)
print ("DONE")
-
\ No newline at end of file
+
diff --git a/TranskribusDU/xml_formats/PageXmlExtractor.py b/TranskribusDU/xml_formats/PageXmlExtractor.py
index 63b7909..fb3a222 100755
--- a/TranskribusDU/xml_formats/PageXmlExtractor.py
+++ b/TranskribusDU/xml_formats/PageXmlExtractor.py
@@ -9,6 +9,10 @@
@author: meunier
'''
+
+
+
+
import os
from io import open
import json
@@ -70,7 +74,7 @@ def iterPageNumber(self):
for a,b in self._ltiRange:
for n in range(a,b+1):
yield n
- raise StopIteration
+ return
# -----
def __str__(self):
diff --git a/TranskribusDU/xml_formats/mpxml2pxml.py b/TranskribusDU/xml_formats/mpxml2pxml.py
index fe69bb5..492df9e 100644
--- a/TranskribusDU/xml_formats/mpxml2pxml.py
+++ b/TranskribusDU/xml_formats/mpxml2pxml.py
@@ -31,18 +31,22 @@
(options, args) = parser.parse_args()
try:
- dir = args[0]
- docid= args[1]
+ _dir = args[0]
+ _docid= args[1]
except:
parser.print_help()
parser.exit(1, "")
- sDocFilename = "%s%scol%s%s.mpxml" % (dir,os.sep,os.sep,docid)
+ sDocFilename = "%s%scol%s%s.mpxml" % (_dir,os.sep,os.sep,_docid)
doc = etree.parse(sDocFilename)
- for pnum, pageDoc in PageXml.MultiPageXml._iter_splitMultiPageXml(doc, bInPlace=False):
- outfilename = "%s%s%s%s%s_%03d.pxml" % (dir,os.sep,options.destdir,os.sep,docid,pnum)
+ ## sDocFilename = "%s%scol%s%s.bar_mpxml" % (_dir,os.sep,os.sep,_docid)
+ ## doc = etree.parse(sDocFilename)
+ ## for pnum, pageDoc in PageXml.MultiPageXml._iter_splitMultiPageXml(doc, bInPlace=False):
+ for pnum, pageDoc in PageXml.MultiPageXml._iter_splitMultiPageXml(doc, bInPlace=True):
+ outfilename = "%s%s%s%s%s_%03d.pxml" % (_dir,os.sep,options.destdir,os.sep,_docid,pnum)
print(outfilename)
pageDoc.write(outfilename, xml_declaration ='UTF-8',encoding="utf-8", pretty_print = bool(options.bIndent))
+ # doc.freeDoc()  # removed: lxml documents have no freeDoc() (libxml2-python API); memory is reclaimed by the GC
print ("DONE")
\ No newline at end of file
diff --git a/TranskribusDU/xml_formats/tests/test_PageXml.py b/TranskribusDU/xml_formats/tests/test_PageXml.py
index 6356f17..46413e8 100644
--- a/TranskribusDU/xml_formats/tests/test_PageXml.py
+++ b/TranskribusDU/xml_formats/tests/test_PageXml.py
@@ -149,6 +149,48 @@ def test_CreationPageXmlDocument():
doc= PageXml.createPageXmlDocument(creatorName='HerveforTest', filename='hervefortest.jpg', imgW=100, imgH=100)
print(doc)
+def test_countTextLineWithText():
+ sXml = b"""
+
+
+ Tilla
+ 2016-08-18T13:35:08.252+07:00
+ 2016-12-01T09:53:39.610+01:00
+
+
+
+
+
+
+
+
+ 52.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ """
+ doc = etree.parse(BytesIO(sXml))
+
+ assert (1, 2) == PageXml.countTextLineWithText(doc)
+ return doc
+
if __name__ == "__main__":
test_setMetadata()
test_CreationPageXmlDocument()
\ No newline at end of file
diff --git a/usecases/ABP/src/ABPCSV2XML.py b/usecases/ABP/src/ABPCSV2XML.py
index 069b5ca..db2aa31 100644
--- a/usecases/ABP/src/ABPCSV2XML.py
+++ b/usecases/ABP/src/ABPCSV2XML.py
@@ -13,18 +13,7 @@
copyright Xerox 2017
READ project
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/usecases/ABP/src/ABPIEOntology.py b/usecases/ABP/src/ABPIEOntology.py
index 35e7fed..3565e6c 100644
--- a/usecases/ABP/src/ABPIEOntology.py
+++ b/usecases/ABP/src/ABPIEOntology.py
@@ -1,28 +1,15 @@
# -*- coding: utf-8 -*-
"""
- ABP records IEOntology
-
+ ABP Death records IEOntology
Hervé Déjean
cpy Xerox 2017, NLE 2017
death record
- wedding record (for test)
READ project
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -40,68 +27,6 @@
from lxml import etree
-
-
-class BaptismRecord(recordClass):
- """
- name (firstname only possible) +death date sometimes
- hebamme , was the birth easy , date?
- vater + info
- mutter + info
- location ; instead date of the uxor!
- birth date
- baptism date
- priester + info
- """
-
-class weddingRecord(recordClass):
- sName = 'weddingrecord'
- def __init__(self,sModelName,sModelDir):
- recordClass.__init__(self,deathRecord.sName)
-
- myTagger = ABPTagger()
- myTagger.loadResources(sModelName ,sModelDir )
-
- #bride
- bfnField = firstNameField()
- bfnField.setLabelMapping( ['firstNameGenerator'])
- bfnField.addTagger(myTagger)
- bfnField.setMandatory()
- self.addField(bfnField)
-
- gnfield = lastNameField()
- gnfield.addTagger(myTagger)
- gnfield.setLabelMapping(['lastNameGenerator'])
- gnfield.setMandatory()
- self.addField(gnfield)
-
- #groom
- gfnField = firstNameField()
- gfnField.setLabelMapping( ['firstNameGenerator'])
- gfnField.addTagger(myTagger)
- gfnField.setMandatory()
- self.addField(gfnField)
-
- gnfield = lastNameField()
- gnfield.addTagger(myTagger)
- gnfield.setLabelMapping(['lastNameGenerator'])
- gnfield.setMandatory()
- self.addField(gnfield)
-
- lfield= locationField()
- lfield.addTagger(myTagger)
- lfield.setLabelMapping(['locationGenerator'])
- self.addField(lfield)
-
- wDate= weddingDate()
- wDate.addTagger(myTagger)
-# dDate.setLabelMapping(['weekDayDateGenerator','MonthDayDateGenerator','MonthDateGenerator'])
- xDate.setLabelMapping(['MonthDateGenerator'])
- self.addField(dDate)
-
-
-
-
class deathRecord(recordClass):
sName = 'deathrecord'
def __init__(self,sModelName,sModelDir):
@@ -130,7 +55,7 @@ def __init__(self,sModelName,sModelDir):
lfield= locationField()
lfield.addTagger(myTagger)
- lfield.setLabelMapping(['location2Generator'])
+ lfield.setLabelMapping(['locationGenerator'])
self.addField(lfield)
ofield= occupationField()
@@ -142,45 +67,28 @@ def __init__(self,sModelName,sModelDir):
sfield.addTagger(myTagger)
sfield.setLabelMapping(['familyStatus'])
self.addField(sfield)
-#
-
- # specific tagger for dates ?
+#
dDate= deathDate()
dDate.addTagger(myTagger)
# dDate.setLabelMapping(['weekDayDateGenerator','MonthDayDateGenerator','MonthDateGenerator'])
dDate.setLabelMapping(['MonthDateGenerator'])
self.addField(dDate)
- ddDate= deathDateDay()
- ddDate.addTagger(myTagger)
-# dDate.setLabelMapping(['weekDayDateGenerator','MonthDayDateGenerator','MonthDateGenerator'])
- ddDate.setLabelMapping(['MonthDayDateGenerator'])
- self.addField(ddDate)
-
bDate= burialDate()
bDate.addTagger(myTagger)
# bDate.setLabelMapping(['weekDayDateGenerator','MonthDayDateGenerator','MonthDateGenerator'])
bDate.setLabelMapping(['MonthDateGenerator'])
self.addField(bDate)
- year=deathYear()
- year.addTagger(myTagger)
- year.setLabelMapping(['yearGenerator'])
- self.addField(year)
agefield=age()
agefield.addTagger(myTagger)
agefield.setLabelMapping(['ageValueGenerator'])
self.addField(agefield)
-
- ageUnitfield=ageUnit()
- ageUnitfield.addTagger(myTagger)
- ageUnitfield.setLabelMapping(['AgeUnitGenerator'])
- self.addField(ageUnitfield)
blfield= burialLocation()
blfield.addTagger(myTagger)
- blfield.setLabelMapping(['location2Generator'])
+ blfield.setLabelMapping(['locationGenerator'])
self.addField(blfield)
reasonField = deathreasonField()
@@ -193,29 +101,6 @@ def __init__(self,sModelName,sModelDir):
drField.setLabelMapping(['lastNameGenerator']) #lastNameGenerator
self.addField(drField)
-# def decoratePageXml(self):
-# """
-# ONGOING....
-# add in @custom the field name
-#
-#
-#
-# currenlty
-# """
-# lPages={}
-# for cand in self.getCandidates():
-# try:lPages[cand.getPage()].append(cand)
-# except:lPages[cand.getPage()]=[cand]
-#
-# for page in sorted(lPages):
-# sortedRows = lPages[page]
-# sortedRows.sort(key=lambda x:int(x.getIndex()))
-# for cand in sortedRows:
-# for field in cand.getAllFields():
-# if field.getName() is not None and field.getBestValue() is not None:
-# print (field, field.getOffset()
-
def generateOutput(self,outDom):
"""
generateOutput
@@ -252,10 +137,7 @@ def generateOutput(self,outDom):
key=key[2:]
domp.set('pagenum',key)
- ## -> page has now a year attribute (X-X)
- if page.getAttribute('computedyear') is None:
- page.addAttribute('computedyear','')
- domp.set('years',str(page.getAttribute('computedyear')))
+ domp.set('years','NA')
root.append(domp)
sortedRows = lPages[page]
sortedRows.sort(key=lambda x:int(x.getIndex()))
@@ -264,17 +146,13 @@ def generateOutput(self,outDom):
record = etree.Element('RECORD')
# record fields
nbRecords = 0
- lSeenField=[]
for field in cand.getAllFields():
- # take the best one
if field.getName() is not None and field.getBestValue() is not None:
- record.set(field.getName().lower(),field.getBestValue())
- lSeenField.append(field.getName().lower())
- nbRecords=1
- elif field.getName().lower() not in lSeenField:record.set(field.getName().lower(),"")
+ record.set(field.getName(),field.getBestValue())
+ nbRecords+=1
if nbRecords > 0:
domp.append(record)
- domp.set('nbrecords',str(len(domp)))
+ domp.set('nbrecords',str(nbRecords))
return outDom
@@ -287,27 +165,12 @@ class locationField(fieldClass):
sName='location'
def __init__(self):
fieldClass.__init__(self, locationField.sName)
-
-
-class weddingDate(fieldClass):
- sName='weddingDate'
- def __init__(self):
- fieldClass.__init__(self, weddingDate.sName)
-
-class deathYear(fieldClass):
- sName='deathYear'
- def __init__(self):
- fieldClass.__init__(self, deathYear.sName)
+
class deathDate(fieldClass):
sName='deathDate'
def __init__(self):
fieldClass.__init__(self, deathDate.sName)
-
-class deathDateDay(fieldClass):
- sName='MonthDayDateGenerator'
- def __init__(self):
- fieldClass.__init__(self, deathDateDay.sName)
-
+
class burialDate(fieldClass):
sName='burialDate'
def __init__(self):
@@ -322,12 +185,6 @@ class age(fieldClass):
sName='age'
def __init__(self):
fieldClass.__init__(self, age.sName)
-
-class ageUnit(fieldClass):
- sName='ageUnit'
- def __init__(self):
- fieldClass.__init__(self, ageUnit.sName)
-
class firstNameField(fieldClass):
sName = 'firstname'
diff --git a/usecases/ABP/src/ABPRefFromBigCVS.py b/usecases/ABP/src/ABPRefFromBigCVS.py
index de25e9e..f35b59f 100644
--- a/usecases/ABP/src/ABPRefFromBigCVS.py
+++ b/usecases/ABP/src/ABPRefFromBigCVS.py
@@ -11,18 +11,7 @@
copyright NLE 2017
READ project
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/usecases/ABP/src/ABPResourceGeneration.py b/usecases/ABP/src/ABPResourceGeneration.py
index bc43c40..438595a 100644
--- a/usecases/ABP/src/ABPResourceGeneration.py
+++ b/usecases/ABP/src/ABPResourceGeneration.py
@@ -10,18 +10,7 @@
copyright Naverlabs 2017
READ project
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/usecases/ABP/src/ABPWorkflow.py b/usecases/ABP/src/ABPWorkflow.py
index d2ae22a..57348ff 100644
--- a/usecases/ABP/src/ABPWorkflow.py
+++ b/usecases/ABP/src/ABPWorkflow.py
@@ -12,18 +12,7 @@
copyright Naver LAbs Europe 2017
READ project
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
diff --git a/usecases/ABP/src/ABP_IE.py b/usecases/ABP/src/ABP_IE.py
index 77034aa..8c9a654 100644
--- a/usecases/ABP/src/ABP_IE.py
+++ b/usecases/ABP/src/ABP_IE.py
@@ -11,18 +11,7 @@
READ project
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
+
Developed for the EU project READ. The READ project has received funding
@@ -134,36 +123,6 @@ def labelTable(self,table):
- def findNameColumn(self,table):
- """
- find the column which corresponds to the people names c
- """
- self.bDebug=False
- #tag fields with template
- lColPos = {}
- lColInvName = {}
- for cell in table.getCells():
- try:lColPos[cell.getIndex()[1]]
- except: lColPos[cell.getIndex()[1]]=[]
- if cell.getIndex()[1] < 5:
- res = field.applyTaggers(cell)
- for field in cell.getFields():
- if field is not None:
- # res [ (token,label,score) ...]
- extractedValues = field.extractLabel(res)
- if extractedValues != []:
- # extractedValues = map(lambda offset,value,label,score:(value,score),extractedValues)
- extractedValues = list(map(lambda x:(x[1],x[3]),extractedValues))
- field.setOffset(res[0])
- field.setValue(extractedValues)
- # field.addValue(extractedValues)
- lColPos[cell.getIndex()[1]].append(field.getName())
- try:lColInvName[field.getName()].append(cell.getIndex()[1])
- except: lColInvName[field.getName()] = [cell.getIndex()[1]]
- if self.bDebug: print ('foundXX:',field.getName(), field.getValue())
- cell.resetField()
- return max(lColInvName['firstname'],key=lColInvName['firstname'].count)
-
def extractData(self,table,myRecord, lTemplate):
"""
layout
@@ -178,28 +137,26 @@ def extractData(self,table,myRecord, lTemplate):
find layout level for record completion
extract data/record
-inference if IEOnto
+
+
"""
-# self.bDebug = False
-# table.buildNDARRAY()
+ self.bDebug = False
+ table.buildNDARRAY()
if lTemplate is not None:
# convert string to tableTemplateObject
template = tableTemplateClass()
template.buildFromPattern(lTemplate)
template.labelTable(table)
else: return None
-# firstNameColIndex =self.findNameColumn(table)
-
- # create a batch for the full page
#tag fields with template
for cell in table.getCells():
if cell.getFields() != []:
if self.bDebug:print(table.getPage(),cell.getIndex(), cell.getFields(), cell.getContent())
- res = myRecord.applyTaggers(cell)
for field in cell.getFields():
if field is not None:
- #res = field.applyTaggers(cell)
+ res = field.applyTaggers(cell)
# res [ (token,label,score) ...]
extractedValues = field.extractLabel(res)
if extractedValues != []:
@@ -207,8 +164,8 @@ def extractData(self,table,myRecord, lTemplate):
extractedValues = list(map(lambda x:(x[1],x[3]),extractedValues))
field.setOffset(res[0])
field.setValue(extractedValues)
-# field.addValue(extractedValues)
if self.bDebug: print ('found:',field, field.getValue())
+
### now at record level ?
### scope = propagation using only docObject (hardcoded ?)
@@ -277,6 +234,19 @@ def testGTText(self,page):
+ def htrWithTemplate(self,table,template,htrModelId):
+ """
+ perform an HTR with dictionaries specific to each column
+
+ need: docid, pageid
+ """
+
+ # for the current column: need to get tablecells ids
+ # done at column level rather than per cell (presumably more efficient — TODO: verify), so it can run right after the table-template tool
+ for col in table.getColumns():
+ lCellsID = map(lambda x:x.getID(),col.getCells())
+ for id in lCellsID: print(id)
+
def mineTable(self,tabel,dr):
@@ -298,66 +268,46 @@ def processWithTemplate(self,table,dr):
"""
# selection of the dictionaries per columns
# template 5,10: first col = numbering
+ lTemplateIE2 = [
+ ((slice(1,None),slice(0,1)) ,[ 'numbering'],[ dr.getFieldByName('numbering') ])
+ , ((slice(1,None),slice(1,2)) ,[ 'abp_names', 'names_aux','numbering','religion'],[ dr.getFieldByName('lastname'), dr.getFieldByName('firstname'),dr.getFieldByName('religion') ])
+ , ((slice(1,None),slice(2,3)) ,[ 'abp_profession','religion' ] ,[ dr.getFieldByName('occupation'), dr.getFieldByName('religion') ])
+ , ((slice(1,None),slice(3,4)) ,[ 'abp_location' ] ,[ dr.getFieldByName('location') ])
+ , ((slice(1,None),slice(4,5)) ,[ 'abp_family' ] ,[ dr.getFieldByName('situation') ])
+ ,((slice(1,None),slice(5,6)) ,[ 'deathreason','artz'] ,[ dr.getFieldByName('deathreason'),dr.getFieldByName('doktor')])
+ , ((slice(1,None),slice(6,7)) ,[] , [ ]) #binding
+ , ((slice(1,None),slice(7,8)) ,[ 'abp_dates' ] ,[ dr.getFieldByName('deathDate') ])
+ , ((slice(1,None),slice(8,9)) ,[ 'abp_dates','abp_location' ] ,[ dr.getFieldByName('burialDate'),dr.getFieldByName('burialLocation') ])
+ , ((slice(1,None),slice(9,10)) ,[ 'abp_age'] ,[ dr.getFieldByName('age')])
+# , ((slice(1,None),slice(9,10)) ,[ dr.getFieldByName('priester')])
+# , ((slice(1,None),slice(10,11)),[ dr.getFieldByName('notes')])
+ ]
- # find calibration column: abp_names
- table.buildNDARRAY()
-# print (self.findNameColumn(table))
-# lTemplateIE2 = [
-# ((slice(1,None),slice(0,1)) ,[ 'numbering'],[ dr.getFieldByName('numbering') ])
-# , ((slice(1,None),slice(1,2)) ,[ 'abp_names', 'names_aux','numbering','religion'],[ dr.getFieldByName('lastname'), dr.getFieldByName('firstname'),dr.getFieldByName('religion') ])
-# , ((slice(1,None),slice(2,3)) ,[ 'abp_profession','religion' ] ,[ dr.getFieldByName('occupation'), dr.getFieldByName('religion') ])
-# , ((slice(1,None),slice(3,4)) ,[ 'abp_location' ] ,[ dr.getFieldByName('location') ])
-# , ((slice(1,None),slice(4,5)) ,[ 'abp_family' ] ,[ dr.getFieldByName('situation') ])
-# ,((slice(1,None),slice(5,6)) ,[ 'deathreason','artz'] ,[ dr.getFieldByName('deathreason'),dr.getFieldByName('doktor')])
-# , ((slice(1,None),slice(6,7)) ,[] , [ ]) #binding
-# , ((slice(1,None),slice(7,8)) ,['abp_dates', 'abp_dates' ,'abp_year'] ,[,dr.getFieldByName('deathDate'),dr.getFieldByName('deathYear') ])
-# , ((slice(1,None),slice(8,9)) ,[ 'abp_dates','abp_location' ] ,[ dr.getFieldByName('burialDate'),dr.getFieldByName('burialLocation') ])
-# , ((slice(1,None),slice(9,10)) ,[ 'abp_age','abp_ageunit'] ,[ dr.getFieldByName('age'), dr.getFieldByName('ageUnit')])
-# # , ((slice(1,None),slice(9,10)) ,[ dr.getFieldByName('priester')])
-# # , ((slice(1,None),slice(10,11)),[ dr.getFieldByName('notes')])
-# ]
-
-
-
-
- #fuzzy
- lTemplateIECAL = [
- ((slice(1,None),slice(0,4)) ,[ 'abp_names', 'names_aux','numbering','religion'],[ dr.getFieldByName('lastname'), dr.getFieldByName('firstname') ,dr.getFieldByName('religion')])
- , ((slice(1,None),slice(1,4)) ,[ 'abp_profession','religion' ] ,[ dr.getFieldByName('occupation'), dr.getFieldByName('religion') ])
- ]
-
- #detect empty left columns ?
- template = tableTemplateClass()
- template.buildFromPattern(lTemplateIECAL)
- template.labelTable(table)
-
- iRef = self.findNameColumn(table)
- print ("=============",iRef)
lTemplateIE = [
- ((slice(1,None),slice(iRef,iRef+1)) ,[ 'abp_names', 'names_aux','numbering','religion'],[ dr.getFieldByName('lastname'), dr.getFieldByName('firstname') ,dr.getFieldByName('religion')])
- , ((slice(1,None),slice(iRef+1,iRef+2)) ,[ 'abp_profession','religion' ] ,[ dr.getFieldByName('occupation'), dr.getFieldByName('religion') ])
- , ((slice(1,None),slice(iRef+2,iRef+3)) ,[ 'abp_location' ] ,[ dr.getFieldByName('location') ])
- , ((slice(1,None),slice(iRef+3,iRef+4)) ,[ 'abp_family' ] ,[ dr.getFieldByName('situation') ])
+ ((slice(1,None),slice(0,1)) ,[ 'abp_names', 'names_aux','numbering','religion'],[ dr.getFieldByName('lastname'), dr.getFieldByName('firstname') ,dr.getFieldByName('religion')])
+ , ((slice(1,None),slice(1,2)) ,[ 'abp_profession','religion' ] ,[ dr.getFieldByName('occupation'), dr.getFieldByName('religion') ])
+ , ((slice(1,None),slice(2,3)) ,[ 'abp_location' ] ,[ dr.getFieldByName('location') ])
+ , ((slice(1,None),slice(3,4)) ,[ 'abp_family' ] ,[ dr.getFieldByName('situation') ])
#[] binding
- # 4 6
- # 5 7
- # 6 8
- , ((slice(1,None),slice(iRef+4,iRef+6)) ,[ 'abp_deathreason','artz'] ,[ dr.getFieldByName('deathreason'),dr.getFieldByName('doktor')])
- , ((slice(1,None),slice(iRef+5,iRef+9)) ,[ 'abp_dates','abp_year' ] ,[ dr.getFieldByName('MonthDayDateGenerator'), dr.getFieldByName('deathDate') ,dr.getFieldByName('deathYear')])
- , ((slice(1,None),slice(iRef+6,iRef+9)) ,[ 'abp_dates','abp_year','abp_location' ] ,[ dr.getFieldByName('burialDate'),dr.getFieldByName('deathYear'),dr.getFieldByName('burialLocation') ])
- , ((slice(1,None),slice(iRef+8,iRef+10)) ,[ 'abp_age','abp_ageunit'] ,[ dr.getFieldByName('age'), dr.getFieldByName('ageUnit')])
+ , ((slice(1,None),slice(4,6)) ,[ 'deathreason','artz'] ,[ dr.getFieldByName('deathreason'),dr.getFieldByName('doktor')])
+ , ((slice(1,None),slice(6,7)) ,[ 'abp_dates' ] ,[ dr.getFieldByName('deathDate') ])
+ , ((slice(1,None),slice(7,8)) ,[ 'abp_dates','abp_location' ] ,[ dr.getFieldByName('burialDate'),dr.getFieldByName('burialLocation') ])
+ , ((slice(1,None),slice(8,9)) ,[ 'abp_age'] ,[ dr.getFieldByName('age')])
# , ((slice(1,None),slice(9,10)) ,[ dr.getFieldByName('priester')])
# , ((slice(1,None),slice(10,11)),[ dr.getFieldByName('notes')])
- ]
- # recalibrate template
+ ]
-# # lTemplate = lTemplateIE
-# if table.getNbColumns() >= 12:
-# lTemplate = lTemplateIE2
-# else:
-# lTemplate = lTemplateIE
- self.extractData(table,dr,lTemplateIE)
+
+# lTemplate = lTemplateIE
+ if table.getNbColumns() == 12:
+ lTemplate = lTemplateIE2
+ else:
+ lTemplate = lTemplateIE
+
+# if self.htrModelID is not None: self.htrWithTemplate(table, lTemplate, self.htrModelID)
+
+ self.extractData(table,dr,lTemplate)
# select best solutions
# store inthe proper final format
@@ -390,7 +340,7 @@ def run(self,doc):
###
for page in self.lPages:
- print("page: ", page.getNumber())
+# print("page: ", page.getNumber())
# self.testGTText(page)
# continue
lTables = page.getAllNamedObjects(XMLDSTABLEClass)
@@ -401,11 +351,9 @@ def run(self,doc):
continue
if self.BuseStoredTemplate:
self.processWithTemplate(table, dr)
- #try:self.processWithTemplate(table, dr)
- #except: print('issue with page %s'%page)
else:
self.mineTable(table,dr)
-
+
self.evalData = dr.generateOutput(self.evalData)
# print self.evalData.serialize('utf-8',True)
@@ -536,7 +484,7 @@ def testFirstNameLastNameRecord(self,srefData,srunData, bVisual):
lCovered=[]
for a,i in enumerate(r2):
# print (key,a,r1[a],i,rows[r1[a]][2],cols[i][2], 1/cost_matrix[r1[a],i])
- if 1 / cost_matrix[r1[a],i] > lcsTH:
+ if 1 / cost_matrix[r1[a],i] > lcsTH:
cntOk += 1
if bT:
ltisRefsRunbErrbMiss.append( (runElt[1],int(runElt[0]), cols[i], rows[r1[a]],False, False) )
@@ -622,8 +570,8 @@ def testRecordField(self,lfieldName,lfieldInRef,srefData,srunData, bVisual):
key=page.get('pagenum')
xpath = "./%s" % ("RECORD")
lrecord = page.xpath(xpath)
- if len(lrecord)==0:
- lRef.append([])
+ if len(lrecord) == 0:
+ pass
else:
for record in lrecord:
lf =[]
@@ -835,8 +783,6 @@ def testCompare(self, srefData, srunData, bVisual=False):
dicTestByTask['location']= self.testRecordField(['location'],[None],srefData, srunData,bVisual)
dicTestByTask['deathreason']= self.testRecordField(['deathreason'],[None],srefData, srunData,bVisual)
dicTestByTask['names']= self.testRecordField(['firstname','lastname'],[None,None],srefData, srunData,bVisual)
- dicTestByTask['doktor']= self.testRecordField(['doktor'],['helfer_name'],srefData, srunData,bVisual)
-
# dicTestByTask['namedeathlocationoccupation']= self.testRecordField(['firstname','lastname','deathreason','location','occupation'],[None,None,None,None,None],srefData, srunData,bVisual)
dicTestByTask['situation']= self.testRecordField(['situation'],['family'],srefData, srunData,bVisual)
# dicTestByTask['Year']= self.testYear(srefData, srunData,bVisual)
diff --git a/usecases/ABP/src/PageCellToRegion.py b/usecases/ABP/src/PageCellToRegion.py
index deabdaa..f622ace 100644
--- a/usecases/ABP/src/PageCellToRegion.py
+++ b/usecases/ABP/src/PageCellToRegion.py
@@ -41,8 +41,8 @@ def __init__(self):
self.sPttrn = None
self.dpi = 300
-# self.xmlns='http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
- self.ns={'a':PageXml.NS_PAGE_XML}
+ self.xmlns='http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
+ self.ns={'a':self.xmlns}
self.id=1
self.HeightTH=0.5
@@ -63,7 +63,7 @@ def resizeCell(self,cell,ns):
replace the cell region by a BB for textlines: better for transcriber
"""
xpath = "./a:%s" % ("TextLine")
- lTextLines = cell.xpath(xpath, namespaces={'a':PageXml.NS_PAGE_XML})
+ lTextLines = cell.xpath(xpath, namespaces={'a':self.xmlns})
if lTextLines == []:
return True
@@ -149,16 +149,16 @@ def convertTableCells(self,document):
xpath = "//a:%s" % ("ReadingOrder")
lRO = document.getroot().xpath(xpath,namespaces = self.ns)
if lRO == []:
- ro = PageXml.createPageXmlNode('ReadingOrder')
+ ro = PageXml.createPageXmlNode('ReadingOrder', self.xmlns)
#addPrevSibling
else:
ro =lRO[0]
for table in lTables:
- orderGroup = PageXml.createPageXmlNode('OrderedGroup')
+ orderGroup = PageXml.createPageXmlNode('OrderedGroup',self.xmlns)
ro.append(orderGroup)
- orderGroup.set('{%s}id'%PageXml.NS_PAGE_XML,table.get('id'))
- orderGroup.set('{%s}caption'%PageXml.NS_PAGE_XML,'Cell2TextRegion')
+ orderGroup.set('{%s}id'%self.xmlns,table.get('id'))
+ orderGroup.set('{%s}caption'%self.xmlns,'Cell2TextRegion')
xpath = "./a:%s" % ("TableCell")
lCells = table.xpath(xpath,namespaces = self.ns)
@@ -169,7 +169,7 @@ def convertTableCells(self,document):
# cell.unlinkNode()
# print cell
table.getparent().append(cell)
- cell.tag = '{%s}TextRegion'%(PageXml.NS_PAGE_XML)
+ cell.tag = '{%s}TextRegion'%(self.xmlns)
cell.set('custom',"readingOrder {index:%d;}"%i)
# delete cell props
for propname in ['row','col','rowSpan','colSpan']:
@@ -181,10 +181,10 @@ def convertTableCells(self,document):
lCorner = cell.xpath(xpath,namespaces = self.ns)
for c in lCorner:
c.getparent().remove(c)
- reind = PageXml.createPageXmlNode('RegionRefIndexed')
+ reind = PageXml.createPageXmlNode('RegionRefIndexed', self.xmlns)
orderGroup.append(reind)
- reind.set('{%s}index'%PageXml.NS_PAGE_XML,str(i))
- reind.set('{%s}regionRef'%PageXml.NS_PAGE_XML,cell.get('id'))
+ reind.set('{%s}index'%self.xmlns,str(i))
+ reind.set('{%s}regionRef'%self.xmlns,cell.get('id'))
## resize cell/region:
if self.resizeCell(cell,self.ns):
diff --git a/usecases/ABP/src/contentGenerator.py b/usecases/ABP/src/contentGenerator.py
index 78ccabe..67a4a4a 100644
--- a/usecases/ABP/src/contentGenerator.py
+++ b/usecases/ABP/src/contentGenerator.py
@@ -11,18 +11,7 @@
copyright NLE 2017
READ project
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
Developed for the EU project READ. The READ project has received funding
@@ -68,7 +57,7 @@ def generate(self):
class AgeUnitGenerator(textGenerator):
def __init__(self):
textGenerator.__init__(self,lang=None)
- self.loadResourcesFromList( [[('Jahre',50),('Ja',10),('Monate',10),('M.',10),('W',5),('Wochen',5),('T',3),('Tag',6),('Stunde',10)]])
+ self.loadResourcesFromList( [[('Jahre',50),('Ja',10),('Monate',20),('Wochen',10),('Tag',10),('Stunde',10)]])
class ageValueGenerator(integerGenerator):
"""
@@ -108,44 +97,18 @@ def generate(self):
return Generator.generate(self)
class legitimGenerator(textGenerator):
- """
- ID name namelabel kuerzel
- 1_1 legitim leg. l
- 1_2 illegitim ill. i
- 1_3 adoptiert adopt. a
- 1_4 durch nachfolge p.m.s.l. vor
-
- """
def __init__(self):
textGenerator.__init__(self,lang=None)
# self._value = ['leg','legitim','illeg','illegitim']
self.loadResourcesFromList( [[('leg',60),('legitim',20),('illeg',10),('illegitim',20)]])
class religionGenerator(textGenerator):
- """
- 2_1 katholisch kath. rk
-2_2 evangelisch ev. ev
-2_3 orthodox orth. or
-2_4 sonstige sonst. ss
-2_5 altkatholisch altkath. alt
-2_6 christlich christlich ch
-2_7 Konvertit Konvertit kon
-2_8 protestantisch prot. pr
-
- """
def __init__(self):
textGenerator.__init__(self,lang=None)
self.loadResourcesFromList( [[('K',30),('kath',40),('katholic',5),('katho',5),('K. R.',5),("evangelist",5),('evang.',5),("evg.",5)]])
# self._value = ['k','kath','katholic','katho','k. R.','evangelist','evang.','evg.']
class familyStatus(textGenerator):
- """
- 3_1 ledig ledig ld
-3_2 verheiratet verh. vh
-3_3 verwitwet verw. vw
-
- children not covered
- """
def __init__(self):
textGenerator.__init__(self,lang=None)
self.loadResourcesFromList( [[('knabe',5),('mädchen',5),('kind',30),('Säugling',5),('ledig',20), ('verehelichet.',10),('erehelicht',10),('wittwe',20),('wittwer',10),('verwitwet',5),('verw.',5),('verheirathet',10),('verhei',10)]])
@@ -166,7 +129,7 @@ class deathreasonGenerator(textGenerator):
def __init__(self):
textGenerator.__init__(self,lang=None)
self._name = 'deathreason'
- self._lpath=[os.path.abspath('../resources/old/deathreason.pkl')]
+ self._lpath=[os.path.abspath('../resources/deathreason.pkl')]
self._value = list(map(lambda x:x[0],self.loadResources(self._lpath)))
self._lenRes= len(self._lresources)
@@ -182,22 +145,7 @@ def __init__(self):
two locations
"""
-class NGenerator(textGenerator):
- def __init__(self):
- textGenerator.__init__(self,lang=None)
- self.loadResourcesFromList( [[('N',50),('Num',10)]])
-
-class HausnumberGenerator(textGenerator):
- def __init__(self,mean,std):
- textGenerator.__init__(self,lang=None)
- self._structure = [ ( (NGenerator(),1,80) ,(positiveIntegerGenerator(mean,std),1,100),100 ) ]
- def generate(self):
- return Generator.generate(self)
-
class location2Generator(textGenerator):
- """
- missing Rothsmansdorf Nr̳ 12
- """
def __init__(self):
textGenerator.__init__(self,lang=None)
self._name = 'location'
@@ -205,7 +153,7 @@ def __init__(self):
self.location2 = locationGenerator()
self.prep = locationPrepositionGenerator()
self._structure = [
- ( (self.location2,1,20),(self.prep,1,10), (self.location,1,100),(HausnumberGenerator(50,10),1,20),(legitimGenerator(),1,10),100)
+ ( (self.location2,1,20),(self.prep,1,10), (self.location,1,100),(legitimGenerator(),1,10),100)
]
def generate(self):
return Generator.generate(self)
@@ -214,7 +162,7 @@ class locationGenerator(textGenerator):
def __init__(self):
textGenerator.__init__(self,lang=None)
self._name = 'location'
- self._lpath=[os.path.abspath('../resources/old/location.pkl')]
+ self._lpath=[os.path.abspath('../resources/location.pkl')]
self._value = list(map(lambda x:x[0],self.loadResources(self._lpath)))
self._lenRes= len(self._lresources)
@@ -232,7 +180,7 @@ class professionGenerator(textGenerator):
def __init__(self):
textGenerator.__init__(self,lang=None)
self._name = 'profession'
- self._lpath=[os.path.abspath('../resources/old/profession.pkl')]
+ self._lpath=[os.path.abspath('../resources/profession.pkl')]
self._value = list(map(lambda x:x[0],self.loadResources(self._lpath)))
self._lenRes= len(self._lresources)
@@ -241,7 +189,7 @@ class firstNameGenerator(textGenerator):
def __init__(self):
textGenerator.__init__(self,lang=None)
self._name = 'firstName'
- self._lpath=[os.path.abspath('../resources/old/firstname.pkl')]
+ self._lpath=[os.path.abspath('../resources/firstname.pkl')]
self._value = list(map(lambda x:x[0],self.loadResources(self._lpath)))
self._lenRes= len(self._lresources)
@@ -250,7 +198,7 @@ class lastNameGenerator(textGenerator):
def __init__(self):
textGenerator.__init__(self,lang=None)
self._name = 'firstName'
- self._lpath=[os.path.abspath('../resources/old/lastname.pkl')]
+ self._lpath=[os.path.abspath('../resources/lastname.pkl')]
self._value = list(map(lambda x:x[0],self.loadResources(self._lpath)))
self._lenRes= len(self._lresources)
@@ -344,12 +292,6 @@ def generate(self):
return self
class MonthDayDateGenerator(textGenerator):
- """
- '16. Nov' -> [((0, 0), '16.', 'numberedItems', [0.9996762]), ((1, 1), 'Nov', 'MonthDateGenerator', [0.9997758])]
-
-
- add . after number ?
- """
def __init__(self,lang,value=None):
textGenerator.__init__(self,lang)
self._value = [value]
@@ -392,7 +334,7 @@ class HourDateGenerator(textGenerator):
def __init__(self,lang,value=None):
self._fulldate = None
textGenerator.__init__(self,lang)
- #self._value = [value]
+ self._value = [value]
self.realization=['H','I']
def setValue(self,d):
@@ -400,8 +342,8 @@ def setValue(self,d):
self._value = [d.hour]
def generate(self):
- try:self._generation = u""+str(int(self._fulldate.strftime('%'+ '%s'%self.getRandomElt(self.realization))))
- except UnicodeDecodeError: self._generation = u""+self._fulldate.strftime('%'+ '%d'%self.getRandomElt(self.realization)).decode('latin-1')
+ try:self._generation = u""+self._fulldate.strftime('%'+ '%s'%self.getRandomElt(self.realization))
+ except UnicodeDecodeError: self._generation = u""+self._fulldate.strftime('%'+ '%s'%self.getRandomElt(self.realization)).decode('latin-1')
return self
class yearGenerator(textGenerator):
@@ -429,7 +371,7 @@ def generate(self):
class DayPartsGenerator(textGenerator):
def __init__(self,lang,value=None):
textGenerator.__init__(self,lang)
- self._value=['abends','morgens','vormittags','nachmittags','mittags','nacht','fruh','früh']
+ self._value=['abends','morgens','nachmittags','mittags','nacht','fruh']
class FullHourDateGenerator(textGenerator):
@@ -458,7 +400,7 @@ def __init__(self,lang):
self.hourGen = FullHourDateGenerator(lang)
self.yearGen = yearGenerator(lang)
self._structure = [
- ((self.yearGen,1,90),(self.weekdayGen,1,90),(self.monthdayGen,1,90),(self.monthGen,1,90),(self.hourGen,1,100), 100)
+ ((self.yearGen,1,90),(self.weekdayGen,1,90),(self.monthdayGen,1,90),(self.monthGen,1,90),(self.hourGen,1,100), 75)
]
def setValue(self,v):
"""
@@ -513,11 +455,8 @@ def __init__(self):
self._structure = [
- ( (self.monthdayGen,1,90),(self.monthGen,1,100), 100)
- , ( (self.weekdayGen,1,90),(self.monthdayGen,1,90),(self.monthGen,1,90),(self.yearGen,1,40),(self.hourGen,1,100), 100)
- , ( (DENGenerator(self.lang),1,100),(self.monthdayGen,1,100),(self.monthGen,1,90), (self.hourGen,1,10) ,100)
- # ??
- ,( (self.yearGen,1,100),50)
+ ( (self.weekdayGen,1,90),(self.monthdayGen,1,90),(self.monthGen,1,90),(self.yearGen,1,40),(self.hourGen,1,100), 100)
+ ,( (DENGenerator(self.lang),1,100),(self.monthdayGen,1,100),(self.monthGen,1,90), (self.hourGen,1,10) ,100)
]
@@ -550,46 +489,6 @@ class ABPRecordGenerator(textGenerator):
else:
lang='de-DE'
- # per type as wel!!
- lClassesToBeLearnt = [[],[]]
- lClassesToBeLearnt[1] = [
- 'deathreasonGenerator'
- ,'doktorGenerator'
- ,'legitemGenerator'
- ,'doktorTitleGenerator'
- ,'lastNameGenerator'
- ,'firstNameGenerator'
- ,'professionGenerator'
- ,'religionGenerator'
- ,'familyStatus'
- ,'textletterRandomGenerator'
- ,'numberedItems'
- ,'location2Generator'
- ,'ageValueGenerator'
- ,'AgeUnitGenerator'
- ,'DENGeneratornum'
- ,'MonthDayDateGenerator'
- ,'weekDayDateGenerator'
- ,'MonthDateGenerator'
- ,'UMGenerator'
- ,'HourDateGenerator'
- ,'UHRGenerator'
- ,'yearGenerator'
- ,'numericalGenerator'
- ,'textRandomGenerator'
- ,'integerGenerator'
- ,'textletterRandomGenerator'
- ,'legitimGenerator'
- ]
-
- lClassesToBeLearnt[0]= [
- 'deathreasonGenerator'
- ,'doktorGenerator'
- ,'PersonName2'
- ,'AgeGenerator'
- ,'ABPGermanDateGenerator'
- ]
-
# method level otherwise loadresources for each sample!!
person= PersonName2(lang)
date= ABPGermanDateGenerator()
@@ -610,6 +509,31 @@ class ABPRecordGenerator(textGenerator):
noise2 = textletterRandomGenerator(10,5)
+ # per type as wel!!
+ lClassesToBeLearnt =['deathreasonGenerator'
+ ,'doktorGenerator'
+ ,'doktorTitleGenerator'
+ ,'PersonName2'
+ ,'lastNameGenerator'
+ ,'firstNameGenerator'
+ ,'professionGenerator'
+ ,'religionGenerator'
+ ,'familyStatus'
+ ,'textletterRandomGenerator'
+ ,'locationGenerator'
+ ,'AgeGenerator'
+ ,'ageValueGenerator'
+ ,'AgeUnitGenerator'
+ ,'ABPGermanDateGenerator'
+ ,'DENGeneratornum'
+ ,'MonthDayDateGenerator'
+ ,'weekDayDateGenerator'
+ ,'MonthDateGenerator'
+ ,'UMGenerator'
+ ,'HourDateGenerator'
+ ,'UHRGenerator'
+ ,'yearGenerator'
+ ]
def __init__(self):
textGenerator.__init__(self,self.lang)
@@ -620,8 +544,7 @@ def __init__(self):
self.noise2,self.person,
self.date,self.deathreasons,self.doktor,self.location,self.profession,self.status, self.age, self.misc]
-# myList=[self.person]
- for g in myList: g.setClassesToBeLearnt(self.lClassesToBeLearnt)
+
self._structure = []
@@ -674,7 +597,7 @@ class ABPRecordGeneratorTOK(textGenerator):
# method level otherwise loadresources for each sample!!
person= PersonName2(lang)
date= ABPGermanDateGenerator()
- date.defineRange(1700, 1900)
+ date.defineRange(1700, 2000)
deathreasons = deathReasonColumnGenerator(lang)
doktor= doktorGenerator(lang)
location= location2Generator()
@@ -732,55 +655,37 @@ def ABP(options,args):
g.GTForTokenization()
else:
if options.bLoad:
- with gzip.open(os.path.join(options.dirname,options.name+".pkl"), "rb") as fd:
- g = pickle.load(fd)
- print('generator loaded:%s'%(os.path.join(options.dirname,options.name+".pkl")))
- print (g.__class__.__name__)
- print (g.getNoiseLevel())
- else:
- g = ABPRecordGenerator()
- g.setNoiseType(options.noiseType)
- g.setNoiseLevel(options.noiseLevel)
-
- if options.bconll:
- lReport={}
-
- lvlrange = [0,10]
- lfd=[None for i in range(len(lvlrange))]
- for i,lvl in enumerate(lvlrange):
- lfd[i] = open(os.path.join(options.dirname,options.name+"_%s_%s.txt"%(lvl,g.getNoiseType())), "w",encoding='utf-8')
-
+ pass
+
+ g = ABPRecordGenerator()
+ g.setNoiseType(options.noiseType)
+ lReport={}
+ fd= open(os.path.join(options.dirname,options.name+".txt"), "w",encoding='utf-8')
for i in range(options.nbX):
g.instantiate()
- # store the history?
g.generate()
try:lReport[tuple(g._instance)] +=1
except KeyError: lReport[tuple(g._instance)] = 1
-
if options.bFairseq:
sS,sT =g.formatFairSeqWord(g.exportAnnotatedData([]))
if len(sS.strip()) > 0:
iosource.write("%s\n"%sS)
iotarget.write("%s\n"%sT)
-
- elif options.bconll:
- for i,lvl in enumerate(lvlrange):
- g.setNoiseLevel(lvl)
- sGen = g.formatAnnotatedData(g.exportAnnotatedData([ "None","None" ,"None"]),mode=2)
- lfd[i].write(sGen)
-
- if options.bconll:
- [lfd[i].write("# %s %s\n"%(lReport[inst],inst)) for i in range(len(lvlrange)) for inst in lReport]
- [fd.close() for fd in lfd]
+ else:
+ sGen = g.formatAnnotatedData(g.exportAnnotatedData([]),mode=2)
+ fd.write(sGen)
+ for inst in lReport:
+ fd.write("# %s %s\n"%(lReport[inst],inst))
+ fd.close()
if options.bFairseq:
iosource.close()
iotarget.close()
-# elif options.bconll:
-# if g is not None and not options.bLoad:
-# with gzip.open(os.path.join(options.dirname,options.name+".pkl"), "wb") as fd:
-# pickle.dump(g, fd, protocol=2)
+ elif options.bconll:
+ if g is not None:
+ with gzip.open(os.path.join(options.dirname,options.name+".pkl"), "wb") as fd:
+ pickle.dump(g, fd, protocol=2)
if __name__ == "__main__":
@@ -794,13 +699,11 @@ def ABP(options,args):
parser.add_option("--model", dest="name", action="store", type="string",default="test.pkl", help="model name")
parser.add_option("--dir", dest="dirname", action="store", type="string", default=".",help="directory to store model")
parser.add_option("--noise", dest="noiseType", action="store", type=int, default=0, help="add noise of type N")
- parser.add_option("--noiselvl", dest="noiseLevel", action="store", type=int, default=10, help="noise level (percentage) NN")
-
- parser.add_option("--load", dest="bLoad", action="store_true", default=False, help="load model")
+ parser.add_option("--load", dest="bLoad", action="store_true", default=False, help="load model")
parser.add_option("--number", dest="nbX", action="store", type=int, default=10,help="number of samples")
- parser.add_option("--tok", dest="bTok", action="store", type=int,default=False, help="correct tokenisation GT")
+ parser.add_option("--tok", dest="bTok", action="store", type=int,default=False, help="correct tokenisation GT")
parser.add_option("--fairseq", dest="bFairseq", action="store", type=int, default=False,help="seq2seq GT")
- parser.add_option("--conll", dest="bconll", action="store_true", default=True,help="conll like GT")
+ parser.add_option("--conll", dest="bconll", action="store", type=int, default=True,help="conll like GT")
(options, args) = parser.parse_args()
diff --git a/usecases/ABP/src/processDatesPerPage.py b/usecases/ABP/src/processDatesPerPage.py
index 40fdbe2..0a8ea9e 100644
--- a/usecases/ABP/src/processDatesPerPage.py
+++ b/usecases/ABP/src/processDatesPerPage.py
@@ -1,29 +1,12 @@
# -*- coding: utf-8 -*-
"""
-
-
Build a table grid from cells
H. Déjean
-
copyright Naver 2019
READ project
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see .
-
-
Developed for the EU project READ. The READ project has received funding
from the European Union's Horizon 2020 research and innovation programme
under grant agreement No 674943.
diff --git a/usecases/BAR/mpxml_viewer.bat b/usecases/BAR/mpxml_viewer.bat
index 862bde1..0515c08 100644
--- a/usecases/BAR/mpxml_viewer.bat
+++ b/usecases/BAR/mpxml_viewer.bat
@@ -1 +1 @@
-C:\Anaconda2\python.exe c:\Local\meunier\git\TranskribusDU\TranskribusDU\visu\mpxml_viewer.bat.py %0.ini
+C:\Anaconda\python.exe c:\Local\TranskribusDU\src\visu\mpxml_viewer.bat.py C:\Local\TranskribusDU\usecases\BAR\mpxml_viewer.bat.ini
diff --git a/usecases/StAZH/DU_StAZH.py b/usecases/StAZH/DU_StAZH.py
new file mode 100644
index 0000000..33bf279
--- /dev/null
+++ b/usecases/StAZH/DU_StAZH.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+"""
+ First DU task for StAZH
+
+ Copyright Xerox(C) 2016 JL. Meunier
+ Copyright Naver (C) 2019 H. Déjean
+"""
+import sys, os
+
+import TranskribusDU_version # if import error, updade the PYTHONPATH environment variable
+
+from common.trace import traceln
+from tasks.DU_Task_Factory import DU_Task_Factory
+from graph.Graph_Multi_SinglePageXml import Graph_MultiSinglePageXml
+from graph.NodeType_PageXml import NodeType_PageXml_type
+from graph.FeatureDefinition_PageXml_std import FeatureDefinition_PageXml_StandardOnes
+from graph.NodeType_PageXml import NodeType_PageXml_type_woText, NodeType_PageXml_type
+from graph.FeatureDefinition_PageXml_std_noText import FeatureDefinition_PageXml_StandardOnes_noText
+
+
+def getConfiguredGraphClass(doer):
+ """
+ In this class method, we must return a configured graph class
+ """
+ #DU_GRAPH = ConjugateSegmenterGraph_MultiSinglePageXml # consider each age as if indep from each other
+ DU_GRAPH = Graph_MultiSinglePageXml
+
+ ntClass = NodeType_PageXml_type_woText #NodeType_PageXml_type
+
+ #lIgnoredLabels = ['menu-section-heading','Item-number']
+
+ lLabels = ['catch-word', 'header', 'heading', 'marginalia', 'page-number']
+
+ nt = ntClass("TR" #some short prefix because labels below are prefixed with it
+ , lLabels # in conjugate, we accept all labels, andNone becomes "none"
+ , []
+ , True # unused
+ , BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v/3)) #we reduce overlap in this way
+ )
+ nt.setLabelAttribute("type")
+ #DU_GRAPH.sEdgeLabelAttribute="TR"
+ nt.setXpathExpr((".//pc:TextRegion"
+ , ".//pc:TextEquiv") #how to get their text
+ )
+ DU_GRAPH.addNodeType(nt)
+
+ return DU_GRAPH
+
+
+if __name__ == "__main__":
+ # import better_exceptions
+ # better_exceptions.MAX_LENGTH = None
+
+ # standard command line options for CRF- ECN- GAT-based methods
+ usage, parser = DU_Task_Factory.getStandardOptionsParser(sys.argv[0])
+
+ traceln("VERSION: %s" % DU_Task_Factory.getVersion())
+
+ # ---
+ #parse the command line
+ (options, args) = parser.parse_args()
+
+ cFeatureDefinition = FeatureDefinition_PageXml_StandardOnes_noText
+# dFeatureConfig = {
+# 'n_tfidf_node':400, 't_ngrams_node':(1,3), 'b_tfidf_node_lc':False
+# , 'n_tfidf_edge':400, 't_ngrams_edge':(1,3), 'b_tfidf_edge_lc':False }
+ try:
+ sModelDir, sModelName = args
+ except Exception as e:
+ traceln("Specify a model folder and a model name!")
+ DU_Task_Factory.exit(usage, 1, e)
+
+ doer = DU_Task_Factory.getDoer(sModelDir, sModelName
+ , options = options
+ , fun_getConfiguredGraphClass= getConfiguredGraphClass
+ , cFeatureDefinition = cFeatureDefinition
+ , dFeatureConfig = {}
+ )
+
+ # setting the learner configuration, in a standard way
+ # (from command line options, or from a JSON configuration file)
+ dLearnerConfig = doer.getStandardLearnerConfig(options)
+
+
+ # of course, you can put yours here instead.
+ doer.setLearnerConfiguration(dLearnerConfig)
+
+ #doer.setConjugateMode()
+
+ # act as per specified in the command line (--trn , --fold-run, ...)
+ doer.standardDo(options)
+
+ del doer
+
+
+
+
+
+
+
diff --git a/usecases/__init__.py b/usecases/__init__.py
new file mode 100644
index 0000000..d52e42e
--- /dev/null
+++ b/usecases/__init__.py
@@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+
+#REMOVE THIS annoying warning saying:
+# /usr/lib/python2.7/site-packages/requests-2.12.1-py2.7.egg/requests/packages/urllib3/connectionpool.py:843: InsecureRequestWarning: Unverified HTTPS request is being made.
+# Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings InsecureRequestWarning)
+
+import sys, os
+
+DEBUG=0
+
+sCOL = "col"
+
+def _exit(usage, status, exc=None):
+ if usage:
+ sys.stderr.write("ERROR: usage : %s\n"%usage)
+ if exc != None:
+ sys.stderr.write(str(exc)) #any exception?
+ sys.exit(status)
+
+def _checkFindColDir(lsDir, sColName=sCOL, bAbsolute=True):
+ """
+ For each directory in the input list, check if it is a "col" directory, or look for a 'col' sub-directory
+ If a string is given instead of a list, make of it a list
+ If None is given, just return an empty list
+ return the list of "col" directory absolute path
+ or raise an exception
+ """
+ if lsDir == None: return list()
+ if type(lsDir) != list: lsDir = [lsDir]
+ lsColDir = list()
+ for sDir in lsDir:
+ if not(sDir.endswith(sColName) or sDir.endswith(sColName+os.path.sep)):
+ sColDir = os.path.join(sDir, sColName)
+ else:
+ sColDir = sDir
+ if bAbsolute:
+ sColDir = os.path.abspath(sColDir)
+ if not( os.path.exists(sColDir) and os.path.isdir(sColDir) ):
+ raise ValueError("Non-existing folder: %s"%sColDir)
+ lsColDir.append(sColDir)
+ return lsColDir