diff --git a/.gitignore b/.gitignore
index 7a3c805..095c030 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 env/
-*.swp
 .cache/
 .pytest_cache/
 *__pycache__*
 *.egg-info/
+# vim swap files
+*.swp
+# pyc files
+*.pyc
diff --git a/gramophone/apps/__init__.py b/gramophone/apps/__init__.py
index f50b57a..195a260 100644
--- a/gramophone/apps/__init__.py
+++ b/gramophone/apps/__init__.py
@@ -1 +1,2 @@
 from .gp_app import create_gp_app
+from .hy_app import create_hy_app
diff --git a/gramophone/apps/gp_app.py b/gramophone/apps/gp_app.py
index e362d43..0856426 100644
--- a/gramophone/apps/gp_app.py
+++ b/gramophone/apps/gp_app.py
@@ -1,12 +1,46 @@
 from __future__ import absolute_import
 
 from flask import Flask
+from flask import request
 
-def create_gp_app(mapping):
+def create_gp_app(aligner,transcriber,rater,formatter):
+    """
+    Build the Flask application that serves grapheme-to-phoneme lookups.
+
+    GET /gp/?w=<word>&f=<format> transcribes every ``w`` argument and
+    returns the results encoded by ``formatter``; the first ``f`` argument
+    selects the output format (empty string if absent). POST requests
+    echo the submitted form.
+    """
     app = Flask(__name__)
 
-    @app.route('/gp')
+    @app.route('/gp/', methods=['GET', 'POST'])
     def index():
-        return mapping
+        if request.method == 'POST':
+            return str(request.form)
+
+        # everything else is a lookup; Flask also routes HEAD requests
+        # to GET views, so do not test for 'GET' explicitly
+        strings = request.args.getlist('w')
+        formats = request.args.getlist('f')
+        oformat = formats[0] if formats else ""
+
+        results = []
+        for string in strings:
+            segmentations = aligner.scan(string.lower())
+            best_transcription = []
+            best_prob = 0.0
+            for segmentation in segmentations:
+                transcriptions = transcriber.transcribe(segmentation)
+                for transcription in transcriptions:
+                    prob = rater.rate([segmentation,transcription])
+                    if prob >= best_prob:
+                        best_prob = prob
+                        best_transcription = transcription
+            # report the probability of the winning transcription, not
+            # of whichever candidate happened to be rated last
+            results.append((string,u",".join(best_transcription),best_prob))
+
+        return formatter.encode(results,oformat)
 
     return app
diff --git a/gramophone/apps/hy_app.py b/gramophone/apps/hy_app.py
new file mode 100644
index 0000000..d9bc147
--- /dev/null
+++ b/gramophone/apps/hy_app.py
@@ -0,0 +1,40 @@
+from __future__ import absolute_import
+
+from flask import Flask
+from flask import request
+
+def create_hy_app(coder,labeller,formatter):
+    """
+    Build the Flask application that serves hyphenation lookups.
+
+    GET /hy/?w=<word>&f=<format> hyphenates every ``w`` argument and
+    returns the results encoded by ``formatter``; the first ``f`` argument
+    selects the output format (empty string if absent). POST requests
+    echo the submitted form.
+    """
+    app = Flask(__name__)
+
+    @app.route('/hy/', methods=['GET', 'POST'])
+    def index():
+        if request.method == 'POST':
+            return str(request.form)
+
+        # everything else is a lookup; Flask also routes HEAD requests
+        # to GET views, so do not test for 'GET' explicitly
+        strings = request.args.getlist('w')
+        formats = request.args.getlist('f')
+        oformat = formats[0] if formats else ""
+
+        results = []
+        for string in strings:
+            encodement = coder.encode(string,mode="scan")
+            labellings = labeller.label(encodement)
+            combination = []
+            for labelling in labellings:
+                for i in range(len(encodement)):
+                    combination.append(u"%s\t%s" % (encodement[i],labelling[i]))
+            results.append((string,coder.decode(combination)))
+
+        return formatter.encode(results, oformat)
+
+    return app
diff --git a/gramophone/gp/__init__.py b/gramophone/gp/__init__.py
index 24c99dc..45d21ef 100644
--- a/gramophone/gp/__init__.py
+++ b/gramophone/gp/__init__.py
@@ -1,3 +1,4 @@
 from .alignment import Aligner
 from .transcription import Transcriber
 from .rating import Rater
+from .formatting import Formatter
diff --git a/gramophone/gp/alignment.py b/gramophone/gp/alignment.py
index 96a6451..91fb08b 100644
--- a/gramophone/gp/alignment.py
+++ b/gramophone/gp/alignment.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 
-import sys
 import pywrapfst as fst
 import regex as re
 
@@ -41,9 +40,15 @@ def chain(self,g):
         t.set_output_symbols(self.syms)
         src = t.add_state()
         t.set_start(src)
+        dest = src
         for c in g:
+            # skip unknown symbols
+            try:
+                s = self.syms.find(c)
+            except Exception:
+                continue
             dest = t.add_state()
-            t.add_arc(src,fst.Arc(self.syms.find(c), self.syms.find(c), "0", dest))
+            t.add_arc(src,fst.Arc(s, s, "0", dest))
             src = dest
         t.set_final(dest)
         return t
@@ -89,7 +94,6 @@ def __align_fst(self,g,p):
         t4.project(project_output=True)
 
         if t4.num_arcs(t4.start()) == 0:
-            sys.stderr.write(u"Empty expansion: %s %s\n" % (g, p))
             return fst.Fst()
 
         t5 = fst.compose(t3,self.E)
diff --git a/gramophone/gp/formatting.py b/gramophone/gp/formatting.py
new file mode 100644
index 0000000..45a105c
--- /dev/null
+++ b/gramophone/gp/formatting.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+import json
+
+class Formatter:
+
+    def __init__(self):
+        """
+        The constructor.
+        """
+        pass
+
+    def encode(self, io_tuples, fmt):
+        """
+        Serialize (word, phonology, probability) triples.
+
+        fmt is matched case-insensitively after stripping whitespace:
+        "txt" gives one tab-separated word/phonology pair per line,
+        "json" a JSON list of objects; anything else falls back to
+        str(io_tuples).
+        """
+
+        fmt = fmt.strip().lower()
+
+        if fmt == "txt":
+            return "\n".join("%s\t%s" % (triple[0], triple[1]) for triple in io_tuples)
+
+        elif fmt == "json":
+            result = []
+            for triple in io_tuples:
+                result.append({"word" : triple[0], "phonology" : triple[1], "probability" : "%.5f" % triple[2]})
+            return json.dumps(result)
+        else:
+            return str(io_tuples)
diff --git a/gramophone/hy/__init__.py b/gramophone/hy/__init__.py
index 491d118..912cbc5 100644
--- a/gramophone/hy/__init__.py
+++ b/gramophone/hy/__init__.py
@@ -1,2 +1,3 @@
 from .coding import Coder
 from .labelling import Labeller
+from .formatting import Formatter
diff --git a/gramophone/hy/formatting.py b/gramophone/hy/formatting.py
new file mode 100644
index 0000000..cb98e05
--- /dev/null
+++ b/gramophone/hy/formatting.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+
+import json
+
+class Formatter:
+
+    def __init__(self):
+        """
+        The constructor.
+        """
+        pass
+
+    def encode(self, io_tuples, fmt):
+        """
+        Serialize (word, hyphenation) pairs.
+
+        fmt is matched case-insensitively after stripping whitespace:
+        "txt" gives one tab-separated word/hyphenation pair per line,
+        "json" a JSON list of objects; anything else falls back to
+        str(io_tuples).
+        """
+
+        fmt = fmt.strip().lower()
+
+        if fmt == "txt":
+            return "\n".join("%s\t%s" % (pair[0], pair[1]) for pair in io_tuples)
+
+        elif fmt == "json":
+            result = []
+            for pair in io_tuples:
+                result.append({"word" : pair[0], "hyphenation" : pair[1]})
+            return json.dumps(result)
+        else:
+            return str(io_tuples)
diff --git a/gramophone/scripts/gramophone.py b/gramophone/scripts/gramophone.py
index ebf0095..91f7846 100644
--- a/gramophone/scripts/gramophone.py
+++ b/gramophone/scripts/gramophone.py
@@ -27,7 +27,7 @@ def HY(name="hy"):
 @click.option('-m', '--model', default='model', help='prefix of the output model files')
 @click.argument('data')
 def train_gp(mapping,model,data):
-    """Train a model"""
+    """Train a model."""
 
     #
     # stage 1: alignment
@@ -129,7 +129,7 @@ def apply_gp(mapping,crf,lm,strings):
 
     # convert
     for string in in_strings:
-        segmentations = aligner.scan(string)
+        segmentations = aligner.scan(string.lower())
         best_transcription = []
         best_prob = 0.0
         for segmentation in segmentations:
@@ -195,7 +195,7 @@ def train_hy(model,data):
     labeller.save(model + ".hy.crf")
 
 @HY.command(name="apply")
-@click.option('-c', '--crf', required=True, help='transcription CRF model')
+@click.option('-c', '--crf', required=True, help='hyphenation CRF model')
 @click.argument('strings', nargs=-1)
 def apply_hy(crf,strings):
     """Convert strings"""
diff --git a/gramophone/scripts/gramophone_server.py b/gramophone/scripts/gramophone_server.py
index b83047a..37a92b7 100644
--- a/gramophone/scripts/gramophone_server.py
+++ b/gramophone/scripts/gramophone_server.py
@@ -3,12 +3,71 @@
 import click
 
 from gramophone import apps
+from gramophone import gp
+from gramophone import hy
 
-@click.command()
+@click.group()
+def cli():
+    pass
+
+@cli.command(name="gp")
 @click.option('-M', '--mapping', required=True, help='grapheme-phoneme mapping')
-def run(mapping):
+@click.option('-c', '--crf', required=True, help='transcription CRF model')
+@click.option('-l', '--language-model', 'lm', required=True, help='rating language model')
+def run_gp(mapping,crf,lm):
     """
-    Run the application
+    Run the g2p server.
     """
-    app = apps.create_gp_app(mapping)
+
+    #
+    # loading
+    #
+    click.echo(u"Loading...", err=True)
+
+    click.echo(u"...data alignment", err=True)
+    aligner = gp.Aligner(mapping=mapping)
+
+    click.echo(u"...transcription CRF model", err=True)
+    transcriber = gp.Transcriber()
+    transcriber.load(crf)
+
+    click.echo(u"...n-gram language model", err=True)
+    rater = gp.Rater.load(lm)
+
+    click.echo(u"...output formatter", err=True)
+    formatter = gp.Formatter()
+
+
+    #
+    # load app and run
+    #
+    app = apps.create_gp_app(aligner,transcriber,rater,formatter)
+    app.run()
+
+@cli.command(name="hy")
+@click.option('-c', '--crf', required=True, help='hyphenation CRF model')
+def run_hy(crf):
+    """
+    Run the hyphenation server.
+    """
+
+    #
+    # loading
+    #
+    click.echo(u"Loading...", err=True)
+
+    click.echo(u"...coder", err=True)
+    coder = hy.Coder()
+
+    click.echo(u"...hyphenation CRF model", err=True)
+    labeller = hy.Labeller()
+    labeller.load(crf)
+
+    click.echo(u"...output formatter", err=True)
+    formatter = hy.Formatter()
+
+    #
+    # load app and run
+    #
+    app = apps.create_hy_app(coder, labeller, formatter)
     app.run()
diff --git a/setup.py b/setup.py
index 608033e..5a3dbfa 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@
     entry_points={
         'console_scripts': [
             'gramophone=gramophone.scripts.gramophone:cli',
-            'gramophone_server=gramophone.scripts.gramophone_server:run',
+            'gramophone-server=gramophone.scripts.gramophone_server:cli',
         ]
     },
 )