Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

crfsuite-stdin: command not found #4

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
texts/results
*.pyc
21 changes: 15 additions & 6 deletions README.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ You need to install the following third-party software first:

In order to run the software, you need to have the following components:
1. Python 2.7.
2. NLTK 2.0b9 (newer versions will not work because of the different interfaces in the tree modules).
2. NLTK 3.0.0 (I made updates to Vanessa's code to work with the newer NLTK).
3. Java
4. gcc
5. Perl
Expand All @@ -43,7 +43,10 @@ SETUP CRFSUITE
~~~~~~~~~~~~~~~~~~~~~~
The $gCRF_ROOT$ symbol in the commands below stands for the root directory of gCRF.

1. Test if the binary file of CRFsuite-stdin is ready for use, by executing the following two commands:
1. Install working version of NLTK:
pip install --user nltk==3.0.0

2. Test if the binary file of CRFsuite-stdin is ready for use, by executing the following two commands:
cd $gCRF_ROOT$/tools/crfsuite/
crfsuite-stdin tag -pi -m ../../model/tree_build_set_CRF/label/intra.crfsuite test.txt

Expand All @@ -53,20 +56,26 @@ LEAF:0.063409
Elaboration[N][S]:0.958434
LEAF:0.060318

2. If the test in the Step 1 fails, you need to build the binary from source codes. To do so, you need to do the following three things:
a. Build LibLBFGS
3. If the test in Step 1 fails, you need to build the binary from source code. To do so, you need to do the following four things:
a. Get LibLBFGS
cd $gCRF_ROOT$/tools/crfsuite
curl -L -O https://github.com/downloads/chokkan/liblbfgs/liblbfgs-1.10.tar.gz
gunzip liblbfgs-1.10.tar.gz
tar xvf liblbfgs-1.10.tar
b. Build LibLBFGS
cd $gCRF_ROOT$/tools/crfsuite/liblbfgs-1.10
./configure --prefix=$HOME/local
make
make install

b. Build CRFsuite-stdin
c. Build CRFsuite-stdin
cd $gCRF_ROOT$/tools/crfsuite/crfsuite-0.12
chmod +x configure
./configure --prefix=$HOME/local --with-liblbfgs=$HOME/local
make
make install

c. Copy the crfsuite binary under $HOME/local/bin to tools/crfsuite and rename it as crfsuite-stdin
d. Copy the crfsuite binary under $HOME/local/bin to tools/crfsuite and rename it as crfsuite-stdin
cp $HOME/local/bin/crfsuite $gCRF_ROOT$/tools/crfsuite/crfsuite-stdin
chmod +x ../crfsuite-stdin

Expand Down
Binary file removed src/classifiers/__init__.pyc
Binary file not shown.
31 changes: 21 additions & 10 deletions src/classifiers/crf_classifier.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## FIXED: Jessy Li 05/28/15
## Change stderr reading for crf output to pipe.communicate();
## adding stdout to Popen();
## output from crf is in fact in stdout not stderr
## seems classifier's Popen has to be called every time a sentence needs to be classified
import subprocess
import paths
import os.path
Expand All @@ -17,28 +22,34 @@ def __init__(self, name, model_type, model_path, model_file, verbose):

self.classifier_cmd = '%s/crfsuite-stdin tag -pi -m %s -' % (paths.CRFSUITE_PATH,
os.path.join(self.model_path, self.model_fname))
self.classifier = None
# print self.classifier_cmd
self.classifier = subprocess.Popen(self.classifier_cmd, shell = True, stdin = subprocess.PIPE, stderr = subprocess.PIPE)
#self.classifier = subprocess.Popen(self.classifier_cmd, shell = True, stdin = subprocess.PIPE, stderr = subprocess.PIPE)

if self.classifier.poll():
raise OSError('Could not create classifier subprocess, with error info:\n%s' % self.classifier.stderr.readline())
#if self.classifier.poll():
# raise OSError('Could not create classifier subprocess, with error info:\n%s' % self.classifier.stderr.readline())

#self.cnt = 0


def classify(self, vectors):
# print '\n'.join(vectors) + "\n\n"

self.classifier.stdin.write('\n'.join(vectors) + "\n\n")
#self.classifier.stdin.write('\n'.join(vectors) + "\n\n")
self.classifier = subprocess.Popen(self.classifier_cmd, shell = True, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
if self.classifier.poll():
raise OSError('Could not create classifier subprocess, with error info:\n%s' % self.classifier.stderr.readline())

lines = []
line = self.classifier.stderr.readline()
while (line.strip() != ''):
#lines = []
#line = self.classifier.stderr.readline()
#while (line.strip() != ''):
# print line
lines.append(line)
line = self.classifier.stderr.readline()

# lines.append(line)
# line = self.classifier.stderr.readline()

stdo,stde = self.classifier.communicate('\n'.join(vectors) +"\n")
lines = stdo.strip().split("\n")

if self.classifier.poll():
raise OSError('crf_classifier subprocess died')

Expand Down
Binary file removed src/classifiers/crf_classifier.pyc
Binary file not shown.
Binary file removed src/document/__init__.pyc
Binary file not shown.
Binary file removed src/document/base_representation.pyc
Binary file not shown.
14 changes: 7 additions & 7 deletions src/document/constituent.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def get_subtree_rel_in_span(self, pos = 'L'):
if not isinstance(t, ParseTree):
return 'NO-REL'
else:
return t.node
return t.label()

def get_num_tokens(self):
if self.get_num_edus() == 1:
Expand All @@ -127,7 +127,7 @@ def get_subtree_rel(self):
if not isinstance(self.parse_subtree, ParseTree):
return 'NO-REL'
else:
return self.parse_subtree.node
return self.parse_subtree.label()


def get_ngram(self, n):
Expand Down Expand Up @@ -186,7 +186,7 @@ def get_POS_ngram(self, n):
ngrams = []
for i in range(start, end):
try:
ngrams.append(t[t.leaf_treeposition(i)[ : -1]].node)
ngrams.append(t[t.leaf_treeposition(i)[ : -1]].label())
except Exception, e:
print self.l_start, self.l_end, self.r_end
print self.parse_subtree
Expand Down Expand Up @@ -220,16 +220,16 @@ def traverse_tree(self, t, start_edu):
L = t[0]
l_num_edus = 1 if not isinstance(L, ParseTree) else len(L.leaves())

if t.node[-5] == 'N':
if t.label()[-5] == 'N':
result.extend(self.traverse_tree(L, start_edu))

R = t[1]
if t.node[-2] == 'N':
if t.label()[-2] == 'N':
result.extend(self.traverse_tree(R, start_edu + l_num_edus))

# if len(result) == 0:
# print t
# print t.node
# print t.label()

return result

Expand All @@ -254,4 +254,4 @@ def make_new_constituent(self, label, c, deepcopy = False):
new_c.left_child = self
new_c.right_child = c

return new_c
return new_c
Binary file removed src/document/constituent.pyc
Binary file not shown.
Binary file removed src/document/dependency.pyc
Binary file not shown.
Binary file removed src/document/doc.pyc
Binary file not shown.
6 changes: 2 additions & 4 deletions src/document/sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

@author: Wei
'''
import utils.rst_lib
from base_representation import BaseRepresentation
from constituent import Constituent

Expand Down Expand Up @@ -62,7 +61,7 @@ def get_POS_ngram(self, token_offset, n):

ngrams = []
for i in range(start, end):
ngrams.append(self.parse_tree[self.parse_tree.leaf_treeposition(i)[ : -1]].node)
ngrams.append(self.parse_tree[self.parse_tree.leaf_treeposition(i)[ : -1]].label())

return ngrams

Expand All @@ -77,11 +76,10 @@ def get_edu(self, token_id):
def get_bottom_level_constituents(self):
constituents = []
(start_edu, end_edu) = self.doc.cuts[self.sent_id]

for i in range(start_edu, end_edu):
c = Constituent(self.doc.edus[i], self.doc,
i, i + 1, i + 1, self.sent_id, self.sent_id)

constituents.append(c)

return constituents
return constituents
Binary file modified src/document/sentence.pyc
Binary file not shown.
8 changes: 3 additions & 5 deletions src/document/token.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,11 @@ def get_PoS_tag(self):
if not self.pos:
if self.sentence.parse_tree is not None:
t = self.sentence.parse_tree
self.pos = t[self.get_treepos()[ : -1]].node
self.pos = t[self.get_treepos()[ : -1]].label()

elif self.sentence.unlexicalized_parse_tree is not None:
t = self.sentence.unlexicalized_parse_tree

self.pos = t[self.get_treepos()[ : -1]].node

t = self.sentence.unlexicalized_parse_tree
self.pos = t[self.get_treepos()[ : -1]].label()
return self.pos


Expand Down
Binary file modified src/document/token.pyc
Binary file not shown.
Binary file modified src/features/__init__.pyc
Binary file not shown.
12 changes: 6 additions & 6 deletions src/features/segmenter_feature_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def write_unit_token_identity_features(self, token, unit, position):
treepos = treepos[ : -1]

ancestor_subtree = tree[treepos]
self.features.add('Largest_Subtree_Top_Tag=%s_Unit=%d@%d' % (ancestor_subtree.node, unit, position))
self.features.add('Largest_Subtree_Top_Tag=%s_Unit=%d@%d' % (ancestor_subtree.label(), unit, position))
self.features.add('Largest_Subtree_Depth=%s_Unit=%d@%d' % (len(treepos), unit, position))

if len(treepos) < len(token.get_treepos()):
Expand All @@ -64,7 +64,7 @@ def write_unit_token_identity_features(self, token, unit, position):
# if ancestor_subtree.head == token.id - 1:
# self.features.add('Is_Head_in_Largest_Subtree_Unit=%d@%d' % (unit, position))

production = '%s->%s' % (ancestor_subtree.node, '#'.join(str(x) for x in nltk.tree._child_names(ancestor_subtree)))
production = '%s->%s' % (ancestor_subtree.label(), '#'.join(str(x) for x in nltk.tree._child_names(ancestor_subtree)))
self.features.add('Largest_Subtree_Production=%s_Unit=%d@%d' % (production, unit, position))
# print 'treepos', treepos
# print 'token_treepos', token.get_treepos()
Expand Down Expand Up @@ -175,7 +175,7 @@ def write_global_features_for_span(self, span, token, unit, position):
self.features.add('Subtrees_to_Neighbouring_Boundary=%d_Unit%d@%d' % (len(subtrees), unit, position))
subtree_top_tags = []
for subtree in subtrees:
subtree_top_tags.append(subtree.node)
subtree_top_tags.append(subtree.label())
# self.features.add('Subtree_Tags_to_Neighbouring_Boundary=%s_Unit%d@%d' % ('#'.join(subtree_top_tags), unit, position))

# print start, end
Expand All @@ -187,8 +187,8 @@ def write_global_features_for_span(self, span, token, unit, position):
ancestor_subtree = tree[ancestor_treepos[ : -1]]
# print ancestor_subtree

self.features.add('Ancestor_Subtree_Tag_Neighbouring_Boundary=%s_Unit%d@%d' % (ancestor_subtree.node, unit, position))
production = '%s->%s' % (ancestor_subtree.node, '#'.join(subtree_top_tags))
self.features.add('Ancestor_Subtree_Tag_Neighbouring_Boundary=%s_Unit%d@%d' % (ancestor_subtree.label(), unit, position))
production = '%s->%s' % (ancestor_subtree.label(), '#'.join(subtree_top_tags))
# production = '%s->%s' % (ancestor_subtree.node, '#'.join(str(x) for x in nltk.tree._child_names(ancestor_subtree)))
self.features.add('Ancestor_Subtree_Production_Neighbouring_Boundary=%s_Unit%d@%d' % (production, unit, position))

Expand All @@ -215,4 +215,4 @@ def write_features(self, tokens, offset2neighbouring_boundaries = None):

self.write_token_pair_features(token1, token2, i)

return self.features
return self.features
Binary file modified src/features/segmenter_feature_writer.pyc
Binary file not shown.
14 changes: 7 additions & 7 deletions src/features/tree_feature_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def write_dominance_set_features(self, L, R, position):

l_ancestor_subtree = t[l_ancestor_pos]

self.features.add('Top_syntactic_tag_Unit1=%s@%d' % (l_ancestor_subtree.node, position))
self.features.add('Top_syntactic_tag_Unit1=%s@%d' % (l_ancestor_subtree.label(), position))

if len(l_ancestor_subtree.leaves()) == l_end_word - l_start_word:
self.features.add('Valid_syntax_subtree_Unit1@%d' % position)
Expand All @@ -114,12 +114,12 @@ def write_dominance_set_features(self, L, R, position):
self.features.add('Num_Syntax_subtrees_Unit1=%d@%d' % (len(l_subtrees), position))

if len(l_subtrees) == 1:
self.features.add('Top_Syntax_tag_Unit1=%s@%d' % (l_subtrees[0].node, position))
self.features.add('Top_Syntax_tag_Unit1=%s@%d' % (l_subtrees[0].label(), position))


l_subtree_top_tags = []
for (i, subtree) in enumerate(l_subtrees):
l_subtree_top_tags.append(subtree.node)
l_subtree_top_tags.append(subtree.label())

l_subtrees_top_tags.append(l_subtree_top_tags)
else:
Expand All @@ -136,7 +136,7 @@ def write_dominance_set_features(self, L, R, position):

r_ancestor_subtree = t[r_ancestor_pos]

self.features.add('Top_syntactic_tag_Unit2=%s@%d' % (r_ancestor_subtree.node, position))
self.features.add('Top_syntactic_tag_Unit2=%s@%d' % (r_ancestor_subtree.label(), position))

if len(r_ancestor_subtree.leaves()) == r_end_word - r_start_word:
self.features.add('Valid_syntax_subtree_Unit2@%d' % position)
Expand All @@ -145,11 +145,11 @@ def write_dominance_set_features(self, L, R, position):
self.features.add('Num_Syntax_subtrees_Unit2=%d@%d' % (len(r_subtrees), position))

if len(r_subtrees) == 1:
self.features.add('Top_Syntax_tag_Unit2=%s@%d' % (r_subtrees[0].node, position))
self.features.add('Top_Syntax_tag_Unit2=%s@%d' % (r_subtrees[0].label(), position))

r_subtree_top_tags = []
for (i, subtree) in enumerate(r_subtrees):
r_subtree_top_tags.append(subtree.node)
r_subtree_top_tags.append(subtree.label())

r_subtrees_top_tags.append(r_subtree_top_tags)
else:
Expand Down Expand Up @@ -343,4 +343,4 @@ def write_features_for_constituents(self, constituents, positions, scope, labeli
if (L.get_num_edus() + R.get_num_edus()) == len(L.doc.edus):
self.features.add('Last_Pair@%d' % position)

return self.features
return self.features
Binary file modified src/features/tree_feature_writer.pyc
Binary file not shown.
Binary file modified src/logs/__init__.pyc
Binary file not shown.
Binary file modified src/logs/log_writer.pyc
Binary file not shown.
20 changes: 10 additions & 10 deletions src/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,16 +210,16 @@ def parse(self, filename):
# print out
print 'Finished tree building in %.2f seconds.' % (treeBuildEnd - treeBuildStart)
self.log_writer.write('Finished tree building in %.2f seconds.' % (treeBuildEnd - treeBuildStart))

for i in range(len(doc.edus)):
pt.__setitem__(pt.leaf_treeposition(i), '_!%s!_' % ' '.join(doc.edus[i]))

out = pt.pprint()
print 'Output tree building result to %s.' % outfname
f_o = open(outfname, "w")
f_o.write(out)
f_o.close()

if len(doc.edus) > 1:
for i in range(len(doc.edus)):
pt.__setitem__(pt.leaf_treeposition(i), '_!%s!_' % ' '.join(doc.edus[i]))
out = pt.pprint()
print 'Output tree building result to %s.' % outfname
f_o = open(outfname, "w")
f_o.write(out)
f_o.close()
else:
print "One edu, can't build tree. Skipping!!!"

if self.save_preprocessed_doc:
print 'Saved fully processed document data to %s.' % serialized_doc_filename
Expand Down
Binary file modified src/parsers/__init__.pyc
Binary file not shown.
Binary file modified src/parsers/base_parser.pyc
Binary file not shown.
6 changes: 3 additions & 3 deletions src/parsers/intra_sentential_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def parse_sequence(self, sentence):
if c.get_num_edus() > 1:
predicted_label = mc_predictions[k]

c.parse_subtree.node = predicted_label
c.parse_subtree.set_label(predicted_label)
if self.verbose:
print 'Relabling'
print 'L', c.left_child
Expand All @@ -87,7 +87,7 @@ def relabel_stumps(self, stumps):
c2 = c.right_child

predicted_label = mc_predictions[k]
stumps[k].node = predicted_label
stumps[k].set_label(predicted_label)
if self.verbose:
print 'Relabling'
print 'L', c1
Expand All @@ -102,4 +102,4 @@ def parse_each_sentence(self, sentence):
sentence.prepare_parsing()

return self.parse_sequence(sentence)

Binary file modified src/parsers/intra_sentential_parser.pyc
Binary file not shown.
5 changes: 2 additions & 3 deletions src/parsers/multi_sentential_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ def parse_sequence(self, doc):


seq_prob = self.connect_stumps(best_one, doc)

doc.discourse_tree = doc.constituents[0].parse_subtree
# print doc.discourse_tree

Expand Down Expand Up @@ -108,7 +107,7 @@ def relabel_stumps(self, doc, i):
c2 = c.right_child

predicted_label = max_prob_predictions[k]
c.parse_subtree.node = predicted_label
c.parse_subtree.set_label(predicted_label)

if self.verbose:
print 'Relabling'
Expand Down Expand Up @@ -159,4 +158,4 @@ def connect_stumps(self, i, doc):
# print

return seq_prob

Binary file modified src/parsers/multi_sentential_parser.pyc
Binary file not shown.
Binary file removed src/paths.pyc
Binary file not shown.
Binary file modified src/prep/__init__.pyc
Binary file not shown.
Binary file modified src/prep/prep_utils.pyc
Binary file not shown.
Loading