Skip to content
This repository has been archived by the owner on Mar 6, 2019. It is now read-only.

Updates for speed and python 3 compatibility #7

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 12 additions & 11 deletions bin/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
from __future__ import print_function
import sys


Expand Down Expand Up @@ -44,14 +45,14 @@
if total_words_per_sentence == correct_words_per_sentence:
correct_sentences += 1

print
print 'Sentence-Level Stats:'
print '\tcorrect: ', correct_sentences
print '\ttotal: ', total_sentences
print '\t% correct: ', 100 * (correct_sentences / float(total_sentences))

print
print 'Word-Level Stats:'
print '\tcorrect:', correct_words
print '\ttotal:', total_words
print '\t% correct:', 100 * (correct_words / float(total_words))
print()
print('Sentence-Level Stats:')
print('\tcorrect: ', correct_sentences)
print('\ttotal: ', total_sentences)
print('\t% correct: ', 100 * (correct_sentences / float(total_sentences)))

print()
print('Word-Level Stats:')
print('\tcorrect:', correct_words)
print('\ttotal:', total_words)
print('\t% correct:', 100 * (correct_words / float(total_words)))
57 changes: 43 additions & 14 deletions ingredient_phrase_tagger/training/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from __future__ import print_function

import re
import decimal
import optparse
import argparse
import pandas as pd
from multiprocessing import Pool, TimeoutError, Queue, Process, Manager
from Queue import Empty

import utils

Expand All @@ -10,24 +14,37 @@ class Cli(object):
def __init__(self, argv):
self.opts = self._parse_args(argv)
self._upstream_cursor = None
m = Manager()
self.output_queue = m.Queue()

def run(self):
self.generate_data(self.opts.count, self.opts.offset)
self.generate_data(self.opts.count, self.opts.offset, self.opts.threads)

def generate_data(self, count, offset):
def generate_data(self, count, offset, threads):
"""
Generates training data in the CRF++ format for the ingredient
tagging task
"""
df = pd.read_csv(self.opts.data_path)
df = df.fillna("")

start = int(offset)
end = int(offset) + int(count)
start = offset
end = offset + count

df_slice = df.iloc[start: end]

for index, row in df_slice.iterrows():
qr = Process(target=self.start_queue_reader)
qr.start()
worker_pool = Pool(processes=threads or None)
worker_pool.map_async(self._generate_data_worker, df_slice.iterrows())
worker_pool.close()
worker_pool.join()
self.output_queue.put('DONE')
qr.join()

def _generate_data_worker(self, args):
index, row = args
out = []
try:
# extract the display name
display_input = utils.cleanUnicodeFractions(row["input"])
Expand All @@ -38,13 +55,24 @@ def generate_data(self, count, offset):

for i, (token, tags) in enumerate(rowData):
features = utils.getFeatures(token, i+1, tokens)
print utils.joinLine([token] + features + [self.bestTag(tags)])
out.append(utils.joinLine([token] + features + [self.bestTag(tags)]))

# ToDo: deal with this
except UnicodeDecodeError:
pass

print
if out:
self.output_queue.put('\n'.join(out))

def start_queue_reader(self):
    """
    Drain self.output_queue and write each item to stdout followed by a
    blank line, stopping when the 'DONE' sentinel is received.

    The sentinel itself is never printed.
    """
    # Imported locally so this method does not depend on the module's
    # import block.
    import sys

    # get() blocks without a timeout, so it can never raise Empty; the
    # two-argument form of iter() keeps calling it until 'DONE' shows up.
    for item in iter(self.output_queue.get, 'DONE'):
        # print(..., flush=True) is rejected by Python 2 even with
        # print_function enabled, so write and flush explicitly to behave
        # the same on both interpreters.
        sys.stdout.write("%s\n\n" % (item,))
        sys.stdout.flush()

def parseNumbers(self, s):
"""
Expand Down Expand Up @@ -152,11 +180,12 @@ def _parse_args(self, argv):
Parse the command-line arguments into a dict.
"""

opts = optparse.OptionParser()
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)

opts.add_option("--count", default="100", help="(%default)")
opts.add_option("--offset", default="0", help="(%default)")
opts.add_option("--data-path", default="nyt-ingredients-snapshot-2015.csv", help="(%default)")
parser.add_argument("--count", default=100, type=int, help=' ')
parser.add_argument("--offset", default=0, type=int, help=' ')
parser.add_argument("--threads", default=0, type=int, help=' ')
parser.add_argument("--data-path", default="nyt-ingredients-snapshot-2015.csv", help=' ')

(options, args) = opts.parse_args(argv)
return options
return parser.parse_args(argv)
47 changes: 44 additions & 3 deletions roundtrip.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,56 @@
#!/bin/sh
# Default sizes of the training and test slices taken from the input CSV.
COUNT_TRAIN=20000
COUNT_TEST=2000
# Default worker count: the number of online CPUs. getconf works on both
# Linux and macOS/BSD; /proc/cpuinfo is a Linux-only fallback. If neither
# yields a count, fall back to a single worker instead of an empty value.
COUNT_THREAD=$(getconf _NPROCESSORS_ONLN 2>/dev/null) ||
    COUNT_THREAD=$(grep -c ^processor /proc/cpuinfo 2>/dev/null) ||
    COUNT_THREAD=1

# Print the command-line synopsis and exit with status 2.
# The option names listed here match the ones the argument parser accepts
# (--train-count, --test-count, --threads) and use plain ASCII hyphens so
# the text can be copy-pasted into a shell.
usage() {
    printf 'Usage: %s [--train-count COUNT] [--test-count COUNT] [--threads COUNT] <input-file>\n' "$0"
    exit 2
}

# Parse the long options by hand. The script runs under /bin/sh, where the
# bash-only ${!OPTIND} indirection is unavailable and reassigning OPTIND in
# the middle of a getopts loop is unspecified, so a plain case/shift loop is
# used instead. Both "--opt VALUE" and "--opt=VALUE" are accepted.
# Parsing stops at "--" or at the first non-option argument, which is left
# in $1 as the input file. Unknown options, -h, and an option given without
# its value all print the usage text and exit.
while [ $# -gt 0 ]; do
    case "$1" in
        --train-count=*)
            COUNT_TRAIN=${1#*=}; shift
            ;;
        --test-count=*)
            COUNT_TEST=${1#*=}; shift
            ;;
        --threads=*)
            COUNT_THREAD=${1#*=}; shift
            ;;
        --train-count)
            [ $# -ge 2 ] || usage
            COUNT_TRAIN=$2; shift 2
            ;;
        --test-count)
            [ $# -ge 2 ] || usage
            COUNT_TEST=$2; shift 2
            ;;
        --threads)
            [ $# -ge 2 ] || usage
            COUNT_THREAD=$2; shift 2
            ;;
        --)
            shift
            break
            ;;
        -?*)
            usage
            ;;
        *)
            break
            ;;
    esac
done
# The first remaining positional argument is the training-data CSV; it is
# mandatory, so bail out with the usage text when it is missing.
input_file=$1
if [ -z "$input_file" ]; then
    printf "Please provide an input csv file with training data\n"
    usage
fi

# Echo the effective settings before the long-running steps start.
echo "Input file: ${input_file}"
echo "Training count: ${COUNT_TRAIN}"
echo "Testing count: ${COUNT_TEST}"

echo "generating training data..."
bin/generate_data --data-path=nyt-ingredients-snapshot-2015.csv --count=$COUNT_TRAIN --offset=0 > tmp/train_file || exit 1
bin/generate_data --data-path=$input_file --count=$COUNT_TRAIN --offset=0 --threads=$COUNT_THREAD > tmp/train_file || exit 1

echo "generating test data..."
bin/generate_data --data-path=nyt-ingredients-snapshot-2015.csv --count=$COUNT_TEST --offset=$COUNT_TRAIN > tmp/test_file || exit 1
bin/generate_data --data-path=$input_file --count=$COUNT_TEST --offset=$COUNT_TRAIN --threads=$COUNT_THREAD > tmp/test_file || exit 1

echo "training..."
crf_learn template_file tmp/train_file tmp/model_file || exit 1
crf_learn -p$COUNT_THREAD template_file tmp/train_file tmp/model_file || exit 1

echo "testing..."
crf_test -m tmp/model_file tmp/test_file > tmp/test_output || exit 1
Expand Down