Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

crfsuite-stdin: command not found #4

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
texts/results
*.pyc
21 changes: 15 additions & 6 deletions README.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ You need to install the following third-party software first:

In order to run the software, you need to have the following components:
1. Python 2.7.
2. NLTK 2.0b9 (newer versions will not work because of the different interfaces in the tree modules).
2. NLTK 3.0.0 (I made updates to Vanessa's code to work with the newer NLTK).
3. Java
4. gcc
5. Perl
Expand All @@ -43,7 +43,10 @@ SETUP CRFSUITE
~~~~~~~~~~~~~~~~~~~~~~
The $gCRF_ROOT$ symbol in the commands below stands for the root directory of gCRF.

1. Test if the binary file of CRFsuite-stdin is ready for use, by executing the following two commands:
1. Install working version of NLTK:
pip install --user nltk==3.0.0

2. Test if the binary file of CRFsuite-stdin is ready for use, by executing the following two commands:
cd $gCRF_ROOT$/tools/crfsuite/
crfsuite-stdin tag -pi -m ../../model/tree_build_set_CRF/label/intra.crfsuite test.txt

Expand All @@ -53,20 +56,26 @@ LEAF:0.063409
Elaboration[N][S]:0.958434
LEAF:0.060318

2. If the test in the Step 1 fails, you need to build the binary from source codes. To do so, you need to do the following three things:
a. Build LibLBFGS
3. If the test in Step 1 fails, you need to build the binary from source code. To do so, you need to do the following four things:
a. Get LibLBFGS
cd $gCRF_ROOT$/tools/crfsuite
curl -L -O https://github.com/downloads/chokkan/liblbfgs/liblbfgs-1.10.tar.gz
gunzip liblbfgs-1.10.tar.gz
tar xvf liblbfgs-1.10.tar
b. Build LibLBFGS
cd $gCRF_ROOT$/tools/crfsuite/liblbfgs-1.10
./configure --prefix=$HOME/local
make
make install

b. Build CRFsuite-stdin
c. Build CRFsuite-stdin
cd $gCRF_ROOT$/tools/crfsuite/crfsuite-0.12
chmod +x configure
./configure --prefix=$HOME/local --with-liblbfgs=$HOME/local
make
make install

c. Copy the crfsuite binary under $HOME/local/bin to tools/crfsuite and rename it as crfsuite-stdin
d. Copy the crfsuite binary under $HOME/local/bin to tools/crfsuite and rename it as crfsuite-stdin
cp $HOME/local/bin/crfsuite $gCRF_ROOT$/tools/crfsuite/crfsuite-stdin
chmod +x ../crfsuite-stdin

Expand Down
Binary file removed src/classifiers/__init__.pyc
Binary file not shown.
31 changes: 21 additions & 10 deletions src/classifiers/crf_classifier.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## FIXED: Jessy Li 05/28/15
## Change stderr reading for crf output to pipe.communicate();
## adding stdout to Popen();
## output from crf is in fact in stdout not stderr
## seems classifier's Popen has to be called every time a sentence needs to be classified
import subprocess
import paths
import os.path
Expand All @@ -17,28 +22,34 @@ def __init__(self, name, model_type, model_path, model_file, verbose):

self.classifier_cmd = '%s/crfsuite-stdin tag -pi -m %s -' % (paths.CRFSUITE_PATH,
os.path.join(self.model_path, self.model_fname))
self.classifier = None
# print self.classifier_cmd
self.classifier = subprocess.Popen(self.classifier_cmd, shell = True, stdin = subprocess.PIPE, stderr = subprocess.PIPE)
#self.classifier = subprocess.Popen(self.classifier_cmd, shell = True, stdin = subprocess.PIPE, stderr = subprocess.PIPE)

if self.classifier.poll():
raise OSError('Could not create classifier subprocess, with error info:\n%s' % self.classifier.stderr.readline())
#if self.classifier.poll():
# raise OSError('Could not create classifier subprocess, with error info:\n%s' % self.classifier.stderr.readline())

#self.cnt = 0


def classify(self, vectors):
# print '\n'.join(vectors) + "\n\n"

self.classifier.stdin.write('\n'.join(vectors) + "\n\n")
#self.classifier.stdin.write('\n'.join(vectors) + "\n\n")
self.classifier = subprocess.Popen(self.classifier_cmd, shell = True, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
if self.classifier.poll():
raise OSError('Could not create classifier subprocess, with error info:\n%s' % self.classifier.stderr.readline())

lines = []
line = self.classifier.stderr.readline()
while (line.strip() != ''):
#lines = []
#line = self.classifier.stderr.readline()
#while (line.strip() != ''):
# print line
lines.append(line)
line = self.classifier.stderr.readline()

# lines.append(line)
# line = self.classifier.stderr.readline()

stdo,stde = self.classifier.communicate('\n'.join(vectors) +"\n")
lines = stdo.strip().split("\n")

if self.classifier.poll():
raise OSError('crf_classifier subprocess died')

Expand Down
Binary file removed src/classifiers/crf_classifier.pyc
Binary file not shown.
Binary file removed src/document/__init__.pyc
Binary file not shown.
Binary file removed src/document/base_representation.pyc
Binary file not shown.
14 changes: 7 additions & 7 deletions src/document/constituent.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def get_subtree_rel_in_span(self, pos = 'L'):
if not isinstance(t, ParseTree):
return 'NO-REL'
else:
return t.node
return t.label()

def get_num_tokens(self):
if self.get_num_edus() == 1:
Expand All @@ -127,7 +127,7 @@ def get_subtree_rel(self):
if not isinstance(self.parse_subtree, ParseTree):
return 'NO-REL'
else:
return self.parse_subtree.node
return self.parse_subtree.label()


def get_ngram(self, n):
Expand Down Expand Up @@ -186,7 +186,7 @@ def get_POS_ngram(self, n):
ngrams = []
for i in range(start, end):
try:
ngrams.append(t[t.leaf_treeposition(i)[ : -1]].node)
ngrams.append(t[t.leaf_treeposition(i)[ : -1]].label())
except Exception, e:
print self.l_start, self.l_end, self.r_end
print self.parse_subtree
Expand Down Expand Up @@ -220,16 +220,16 @@ def traverse_tree(self, t, start_edu):
L = t[0]
l_num_edus = 1 if not isinstance(L, ParseTree) else len(L.leaves())

if t.node[-5] == 'N':
if t.label()[-5] == 'N':
result.extend(self.traverse_tree(L, start_edu))

R = t[1]
if t.node[-2] == 'N':
if t.label()[-2] == 'N':
result.extend(self.traverse_tree(R, start_edu + l_num_edus))

# if len(result) == 0:
# print t
# print t.node
# print t.label()

return result

Expand All @@ -254,4 +254,4 @@ def make_new_constituent(self, label, c, deepcopy = False):
new_c.left_child = self
new_c.right_child = c

return new_c
return new_c
Binary file removed src/document/constituent.pyc
Binary file not shown.
Binary file removed src/document/dependency.pyc
Binary file not shown.
Binary file removed src/document/doc.pyc
Binary file not shown.
6 changes: 2 additions & 4 deletions src/document/sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

@author: Wei
'''
import utils.rst_lib
from base_representation import BaseRepresentation
from constituent import Constituent

Expand Down Expand Up @@ -62,7 +61,7 @@ def get_POS_ngram(self, token_offset, n):

ngrams = []
for i in range(start, end):
ngrams.append(self.parse_tree[self.parse_tree.leaf_treeposition(i)[ : -1]].node)
ngrams.append(self.parse_tree[self.parse_tree.leaf_treeposition(i)[ : -1]].label())

return ngrams

Expand All @@ -77,11 +76,10 @@ def get_edu(self, token_id):
def get_bottom_level_constituents(self):
constituents = []
(start_edu, end_edu) = self.doc.cuts[self.sent_id]

for i in range(start_edu, end_edu):
c = Constituent(self.doc.edus[i], self.doc,
i, i + 1, i + 1, self.sent_id, self.sent_id)

constituents.append(c)

return constituents
return constituents
Binary file modified src/document/sentence.pyc
Binary file not shown.
8 changes: 3 additions & 5 deletions src/document/token.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,11 @@ def get_PoS_tag(self):
if not self.pos:
if self.sentence.parse_tree is not None:
t = self.sentence.parse_tree
self.pos = t[self.get_treepos()[ : -1]].node
self.pos = t[self.get_treepos()[ : -1]].label()

elif self.sentence.unlexicalized_parse_tree is not None:
t = self.sentence.unlexicalized_parse_tree

self.pos = t[self.get_treepos()[ : -1]].node

t = self.sentence.unlexicalized_parse_tree
self.pos = t[self.get_treepos()[ : -1]].label()
return self.pos


Expand Down
Binary file modified src/document/token.pyc
Binary file not shown.
Binary file modified src/features/__init__.pyc
Binary file not shown.
12 changes: 6 additions & 6 deletions src/features/segmenter_feature_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def write_unit_token_identity_features(self, token, unit, position):
treepos = treepos[ : -1]

ancestor_subtree = tree[treepos]
self.features.add('Largest_Subtree_Top_Tag=%s_Unit=%d@%d' % (ancestor_subtree.node, unit, position))
self.features.add('Largest_Subtree_Top_Tag=%s_Unit=%d@%d' % (ancestor_subtree.label(), unit, position))
self.features.add('Largest_Subtree_Depth=%s_Unit=%d@%d' % (len(treepos), unit, position))

if len(treepos) < len(token.get_treepos()):
Expand All @@ -64,7 +64,7 @@ def write_unit_token_identity_features(self, token, unit, position):
# if ancestor_subtree.head == token.id - 1:
# self.features.add('Is_Head_in_Largest_Subtree_Unit=%d@%d' % (unit, position))

production = '%s->%s' % (ancestor_subtree.node, '#'.join(str(x) for x in nltk.tree._child_names(ancestor_subtree)))
production = '%s->%s' % (ancestor_subtree.label(), '#'.join(str(x) for x in nltk.tree._child_names(ancestor_subtree)))
self.features.add('Largest_Subtree_Production=%s_Unit=%d@%d' % (production, unit, position))
# print 'treepos', treepos
# print 'token_treepos', token.get_treepos()
Expand Down Expand Up @@ -175,7 +175,7 @@ def write_global_features_for_span(self, span, token, unit, position):
self.features.add('Subtrees_to_Neighbouring_Boundary=%d_Unit%d@%d' % (len(subtrees), unit, position))
subtree_top_tags = []
for subtree in subtrees:
subtree_top_tags.append(subtree.node)
subtree_top_tags.append(subtree.label())
# self.features.add('Subtree_Tags_to_Neighbouring_Boundary=%s_Unit%d@%d' % ('#'.join(subtree_top_tags), unit, position))

# print start, end
Expand All @@ -187,8 +187,8 @@ def write_global_features_for_span(self, span, token, unit, position):
ancestor_subtree = tree[ancestor_treepos[ : -1]]
# print ancestor_subtree

self.features.add('Ancestor_Subtree_Tag_Neighbouring_Boundary=%s_Unit%d@%d' % (ancestor_subtree.node, unit, position))
production = '%s->%s' % (ancestor_subtree.node, '#'.join(subtree_top_tags))
self.features.add('Ancestor_Subtree_Tag_Neighbouring_Boundary=%s_Unit%d@%d' % (ancestor_subtree.label(), unit, position))
production = '%s->%s' % (ancestor_subtree.label(), '#'.join(subtree_top_tags))
# production = '%s->%s' % (ancestor_subtree.node, '#'.join(str(x) for x in nltk.tree._child_names(ancestor_subtree)))
self.features.add('Ancestor_Subtree_Production_Neighbouring_Boundary=%s_Unit%d@%d' % (production, unit, position))

Expand All @@ -215,4 +215,4 @@ def write_features(self, tokens, offset2neighbouring_boundaries = None):

self.write_token_pair_features(token1, token2, i)

return self.features
return self.features
Binary file modified src/features/segmenter_feature_writer.pyc
Binary file not shown.
14 changes: 7 additions & 7 deletions src/features/tree_feature_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def write_dominance_set_features(self, L, R, position):

l_ancestor_subtree = t[l_ancestor_pos]

self.features.add('Top_syntactic_tag_Unit1=%s@%d' % (l_ancestor_subtree.node, position))
self.features.add('Top_syntactic_tag_Unit1=%s@%d' % (l_ancestor_subtree.label(), position))

if len(l_ancestor_subtree.leaves()) == l_end_word - l_start_word:
self.features.add('Valid_syntax_subtree_Unit1@%d' % position)
Expand All @@ -114,12 +114,12 @@ def write_dominance_set_features(self, L, R, position):
self.features.add('Num_Syntax_subtrees_Unit1=%d@%d' % (len(l_subtrees), position))

if len(l_subtrees) == 1:
self.features.add('Top_Syntax_tag_Unit1=%s@%d' % (l_subtrees[0].node, position))
self.features.add('Top_Syntax_tag_Unit1=%s@%d' % (l_subtrees[0].label(), position))


l_subtree_top_tags = []
for (i, subtree) in enumerate(l_subtrees):
l_subtree_top_tags.append(subtree.node)
l_subtree_top_tags.append(subtree.label())

l_subtrees_top_tags.append(l_subtree_top_tags)
else:
Expand All @@ -136,7 +136,7 @@ def write_dominance_set_features(self, L, R, position):

r_ancestor_subtree = t[r_ancestor_pos]

self.features.add('Top_syntactic_tag_Unit2=%s@%d' % (r_ancestor_subtree.node, position))
self.features.add('Top_syntactic_tag_Unit2=%s@%d' % (r_ancestor_subtree.label(), position))

if len(r_ancestor_subtree.leaves()) == r_end_word - r_start_word:
self.features.add('Valid_syntax_subtree_Unit2@%d' % position)
Expand All @@ -145,11 +145,11 @@ def write_dominance_set_features(self, L, R, position):
self.features.add('Num_Syntax_subtrees_Unit2=%d@%d' % (len(r_subtrees), position))

if len(r_subtrees) == 1:
self.features.add('Top_Syntax_tag_Unit2=%s@%d' % (r_subtrees[0].node, position))
self.features.add('Top_Syntax_tag_Unit2=%s@%d' % (r_subtrees[0].label(), position))

r_subtree_top_tags = []
for (i, subtree) in enumerate(r_subtrees):
r_subtree_top_tags.append(subtree.node)
r_subtree_top_tags.append(subtree.label())

r_subtrees_top_tags.append(r_subtree_top_tags)
else:
Expand Down Expand Up @@ -343,4 +343,4 @@ def write_features_for_constituents(self, constituents, positions, scope, labeli
if (L.get_num_edus() + R.get_num_edus()) == len(L.doc.edus):
self.features.add('Last_Pair@%d' % position)

return self.features
return self.features
Binary file modified src/features/tree_feature_writer.pyc
Binary file not shown.
Binary file modified src/logs/__init__.pyc
Binary file not shown.
Binary file modified src/logs/log_writer.pyc
Binary file not shown.
20 changes: 10 additions & 10 deletions src/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,16 +210,16 @@ def parse(self, filename):
# print out
print 'Finished tree building in %.2f seconds.' % (treeBuildEnd - treeBuildStart)
self.log_writer.write('Finished tree building in %.2f seconds.' % (treeBuildEnd - treeBuildStart))

for i in range(len(doc.edus)):
pt.__setitem__(pt.leaf_treeposition(i), '_!%s!_' % ' '.join(doc.edus[i]))

out = pt.pprint()
print 'Output tree building result to %s.' % outfname
f_o = open(outfname, "w")
f_o.write(out)
f_o.close()

if len(doc.edus) > 1:
for i in range(len(doc.edus)):
pt.__setitem__(pt.leaf_treeposition(i), '_!%s!_' % ' '.join(doc.edus[i]))
out = pt.pprint()
print 'Output tree building result to %s.' % outfname
f_o = open(outfname, "w")
f_o.write(out)
f_o.close()
else:
print "One edu, can't build tree. Skipping!!!"

if self.save_preprocessed_doc:
print 'Saved fully processed document data to %s.' % serialized_doc_filename
Expand Down
Binary file modified src/parsers/__init__.pyc
Binary file not shown.
Binary file modified src/parsers/base_parser.pyc
Binary file not shown.
6 changes: 3 additions & 3 deletions src/parsers/intra_sentential_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def parse_sequence(self, sentence):
if c.get_num_edus() > 1:
predicted_label = mc_predictions[k]

c.parse_subtree.node = predicted_label
c.parse_subtree.set_label(predicted_label)
if self.verbose:
print 'Relabling'
print 'L', c.left_child
Expand All @@ -87,7 +87,7 @@ def relabel_stumps(self, stumps):
c2 = c.right_child

predicted_label = mc_predictions[k]
stumps[k].node = predicted_label
stumps[k].set_label(predicted_label)
if self.verbose:
print 'Relabling'
print 'L', c1
Expand All @@ -102,4 +102,4 @@ def parse_each_sentence(self, sentence):
sentence.prepare_parsing()

return self.parse_sequence(sentence)

Binary file modified src/parsers/intra_sentential_parser.pyc
Binary file not shown.
5 changes: 2 additions & 3 deletions src/parsers/multi_sentential_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ def parse_sequence(self, doc):


seq_prob = self.connect_stumps(best_one, doc)

doc.discourse_tree = doc.constituents[0].parse_subtree
# print doc.discourse_tree

Expand Down Expand Up @@ -108,7 +107,7 @@ def relabel_stumps(self, doc, i):
c2 = c.right_child

predicted_label = max_prob_predictions[k]
c.parse_subtree.node = predicted_label
c.parse_subtree.set_label(predicted_label)

if self.verbose:
print 'Relabling'
Expand Down Expand Up @@ -159,4 +158,4 @@ def connect_stumps(self, i, doc):
# print

return seq_prob

Binary file modified src/parsers/multi_sentential_parser.pyc
Binary file not shown.
Binary file removed src/paths.pyc
Binary file not shown.
Binary file modified src/prep/__init__.pyc
Binary file not shown.
Binary file modified src/prep/prep_utils.pyc
Binary file not shown.
Loading