support for non numeric ids

felipebravom · Oct 4, 2017 · 6fe0726 · 6fe0726
1 parent 244ae23
commit 6fe0726
Show file tree

Hide file tree

Showing 2 changed files with 90 additions and 4 deletions.
diff --git a/evaluate.py b/evaluate.py
@@ -41,20 +41,20 @@ def evaluate(pred,gold):
         for line in gold_lines:
             parts=line.split('\t')
             if len(parts)==4:   
-                data_dic[int(parts[0])]=[float(line.split('\t')[3])]
+                data_dic[parts[0]]=[float(line.split('\t')[3])]
             else:
                 raise ValueError('Format problem.')
 
 
         for line in pred_lines:
             parts=line.split('\t')
             if len(parts)==4:  
-                if int(parts[0]) in data_dic:
+                if parts[0] in data_dic:
                     try:
-                        data_dic[int(parts[0])].append(float(line.split('\t')[3]))
+                        data_dic[parts[0]].append(float(line.split('\t')[3]))
                     except ValueError:
                         # Invalid predictions are replaced by a default value
-                        data_dic[int(parts[0])].append(0.5)
+                        data_dic[parts[0]].append(0.5)
                 else:
                     raise ValueError('Invalid tweet id.')
             else:

diff --git a/tweets_to_arff_disc.py b/tweets_to_arff_disc.py
@@ -0,0 +1,86 @@
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# tweets_to_arff_disc.py
+# felipebravom
+# Running example: python tweets_to_arff_disc.py data/anger-ratings-0to1.test.target.tsv data/anger-ratings-0to1.test.target.arff
+import sys
+
+
+def create_arff(input_file,output_file):
+    """
+    Creates an arff dataset
+    """
+
+
+
+
+    out=open(output_file,"w")  
+    header='@relation '+input_file+'\n\n@attribute id numeric \n@attribute tweet string\n@attribute emotion string\n@attribute intensity {low,medium,high} \n\n@data\n'
+    out.write(header)
+
+
+
+    f=open(input_file, "rb")
+    lines=f.readlines()
+
+
+    for line in lines:
+        parts=line.split("\t")
+        if len(parts)==4:
+
+            id=parts[0]
+            tweet=parts[1]
+            emotion=parts[2]
+            score=parts[3].strip() 
+            if score == "NONE":
+                score = "?"
+            else:
+                score_val=float(score)
+                if score_val <= 1.0/3:
+                    score="low"
+                elif score_val <=2.0/3:
+                    score="medium"
+                else:
+                    score="high"
+
+            out_line=id+',\"'+tweet+'\",'+'\"'+emotion+'\",'+score+'\n'
+            out.write(out_line)
+        else:
+            print "Wrong format"
+
+
+    f.close()  
+    out.close()  
+
+
+
+
+
+def main(argv):
+    input_file=argv[0]
+    output_file=argv[1]
+    create_arff(input_file,output_file)
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])    
+
+
+
+
+
+
+
+
+