diff --git a/evaluate.py b/evaluate.py index 0d92f4b..40e680d 100644 --- a/evaluate.py +++ b/evaluate.py @@ -41,7 +41,7 @@ def evaluate(pred,gold): for line in gold_lines: parts=line.split('\t') if len(parts)==4: - data_dic[int(parts[0])]=[float(line.split('\t')[3])] + data_dic[parts[0]]=[float(line.split('\t')[3])] else: raise ValueError('Format problem.') @@ -49,12 +49,12 @@ def evaluate(pred,gold): for line in pred_lines: parts=line.split('\t') if len(parts)==4: - if int(parts[0]) in data_dic: + if parts[0] in data_dic: try: - data_dic[int(parts[0])].append(float(line.split('\t')[3])) + data_dic[parts[0]].append(float(line.split('\t')[3])) except ValueError: # Invalid predictions are replaced by a default value - data_dic[int(parts[0])].append(0.5) + data_dic[parts[0]].append(0.5) else: raise ValueError('Invalid tweet id.') else: diff --git a/tweets_to_arff_disc.py b/tweets_to_arff_disc.py new file mode 100644 index 0000000..c6c7e86 --- /dev/null +++ b/tweets_to_arff_disc.py @@ -0,0 +1,86 @@ +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
# tweets_to_arff_disc.py
# felipebravom
# Running example: python tweets_to_arff_disc.py data/anger-ratings-0to1.test.target.tsv data/anger-ratings-0to1.test.target.arff
import sys


def _discretize(score):
    """Map a stripped score field to an ARFF intensity value.

    "NONE" becomes the ARFF missing-value marker "?"; any other value is
    parsed as a float and bucketed into low / medium / high at 1/3 and 2/3
    (both boundaries inclusive on the lower bucket).
    """
    if score == "NONE":
        return "?"
    value = float(score)
    if value <= 1.0 / 3:
        return "low"
    if value <= 2.0 / 3:
        return "medium"
    return "high"


def _quote(text):
    """Return text as a double-quoted ARFF string with \\ and " escaped."""
    # Backslashes must be escaped first so the escapes added for quotes
    # are not themselves doubled.
    return '"' + text.replace('\\', '\\\\').replace('"', '\\"') + '"'


def create_arff(input_file, output_file):
    """
    Creates an arff dataset with a discretised intensity attribute.

    Reads a tab-separated file whose rows are ``id, tweet, emotion, score``
    and writes one ARFF data row per well-formed input row. The score is
    replaced by ``low`` / ``medium`` / ``high`` (or ``?`` when it is the
    literal ``NONE``). Rows that do not have exactly four fields are
    skipped and "Wrong format" is printed for each of them.

    Args:
        input_file: path of the TSV file to read; also used verbatim as
            the ARFF relation name.
        output_file: path of the ARFF file to write (overwritten).

    Raises:
        ValueError: if a score field is neither "NONE" nor a valid float.
        IOError/OSError: if either file cannot be opened.
    """
    header = ('@relation ' + input_file + '\n\n@attribute id numeric \n'
              '@attribute tweet string\n@attribute emotion string\n'
              '@attribute intensity {low,medium,high} \n\n@data\n')

    # The input is opened first so a missing input does not leave a stray
    # header-only output file behind. newline="\n" keeps the original
    # binary-mode behaviour of splitting lines on "\n" only.
    with open(input_file, "r", encoding="utf-8", newline="\n") as src, \
            open(output_file, "w", encoding="utf-8") as out:
        out.write(header)

        for line in src:
            parts = line.split("\t")
            if len(parts) != 4:
                print("Wrong format")
                continue

            tweet_id, tweet, emotion, raw_score = parts
            intensity = _discretize(raw_score.strip())
            out.write(tweet_id + ',' + _quote(tweet) + ',' +
                      _quote(emotion) + ',' + intensity + '\n')


def main(argv):
    """Run the conversion using ``argv = [input_file, output_file]``."""
    if len(argv) < 2:
        sys.exit("Usage: python tweets_to_arff_disc.py <input.tsv> <output.arff>")
    create_arff(argv[0], argv[1])


if __name__ == "__main__":
    main(sys.argv[1:])