def test_writable(file_path):
    '''
    Check whether `file_path` can be opened for writing.

    The probe really opens the path in 'w' mode, so on success an empty file
    is left behind (and an already existing file would be truncated).

    file_path: path of the file to probe
    return: True if the file could be opened for writing, False otherwise
    '''
    try:
        # The context manager closes the probe handle. When open() itself
        # fails there is no handle to close, so we must not touch one:
        # calling close() on the never-assigned name is what used to turn a
        # "not writable" answer into a crash.
        with open(file_path, 'w'):
            pass
    except (IOError, OSError):
        return False
    return True


# The helper's name starts with "test_", which test runners (pytest, nose)
# treat as a test case; opt out so it is never collected and run by accident.
test_writable.__test__ = False
args.debug: loglevel = logging.DEBUG @@ -97,6 +102,17 @@ def collect_results(all_results, emotion, results): loglevel = logging.ERROR logging.basicConfig(format='[%(levelname)s] %(message)s', level=loglevel) + #import pdb; pdb.set_trace(); + # some pre-checking + if args.temp_output_dir is not None and not os.path.isdir(args.temp_output_dir): + raise Exception("temp folder %s doesn't exist." % (args.temp_output_dir)) + + if os.path.exists(args.output_file_name): + logging.warning("file %s will be overwrote." % (args.output_file_name)) + elif not test_writable(args.output_file_name): + raise Exception("file %s is not writable." % (args.output_file_name)) + + # main loop collect_best_param = {} # TODO: remove all_results = {'emotion': ['Evals'], 'weighted_score': ['Accuracy Rate'], 'auc': ['AUC'], 'X_predict_prob': []} @@ -125,7 +141,7 @@ def collect_results(all_results, emotion, results): scores = {} for svmc in args.c: for rbf_gamma in args.gamma: - score = learner.kfold(kfolder, classifier='SVM', kernel='rbf', prob=False, C=svmc, scaling=(not args.no_scaling), gamma=rbf_gamma) + score = learner.kfold(kfolder, classifier='SVM', kernel='rbf', prob=False, C=svmc, scaling=(not args.no_scaling) ,gamma=rbf_gamma) scores.update({(svmc, rbf_gamma): score}) if args.temp_output_dir: @@ -158,11 +174,18 @@ def collect_results(all_results, emotion, results): ## collect results all_results = collect_results(all_results, emotion_name, results) + if args.temp_output_dir: + fpath = os.path.join(args.temp_output_dir, "model_%s_%f_%F.pkl" % (emotion_name, best_C, best_gamma)); + learner.dump_model(fpath); + if not args.no_scaling: + fpath = os.path.join(args.temp_output_dir, "scaler_%s.pkl" % (emotion_name)); + learner.dump_scaler(fpath); if args.temp_output_dir: fpath = os.path.join(args.temp_output_dir, 'best_param.csv') utils.dump_dict_to_csv(fpath, collect_best_param) fpath = os.path.join(args.temp_output_dir, 'X_predict_prob.csv') utils.dump_list_to_csv(fpath, 
def _emotion_index(text):
    '''
    argparse `type` callable for EMOTION_ID: a non-negative integer.

    A negative value would not fail later on; it would silently index the
    emotion list from its end, so it is rejected while parsing instead.
    '''
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError("invalid int value: %r" % (text,))
    if value < 0:
        raise argparse.ArgumentTypeError("emotion id must not be negative: %d" % (value,))
    return value


def get_arguments(argv):
    '''
    Parse the command line of the model testing script.

    argv: list of argument strings (without the program name)
    return: argparse.Namespace with model_file_name, emotion_id,
            feature_list_file, scaler_file, verbose and debug
    On invalid arguments argparse prints the usage and exits (SystemExit).
    '''
    parser = argparse.ArgumentParser(description='load a trained model and predict the results')
    parser.add_argument('model_file_name', metavar='MODEL_FILE',
                        help='input model file')
    parser.add_argument('emotion_id', metavar='EMOTION_ID', type=_emotion_index,
                        help='0-39, go check utils.LJ40K')
    parser.add_argument('feature_list_file', metavar='feature_list_file',
                        help='This program will fuse the features listed in this file. This program will load the testing file only.')
    parser.add_argument('-s', '--scaler_file', metavar='SCALER_FILE', default=None,
                        help='scaler file for scaling')
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='show messages')
    parser.add_argument('-d', '--debug', action='store_true', default=False,
                        help='show debug messages')
    args = parser.parse_args(argv)
    return args
% (args.feature_list_file)) + + + #load + learner = Learning(logger=logging) + if args.scaler_file: + learner.load_scaler(args.scaler_file) + learner.load_model(args.model_file_name) + + # prepare test data + paths = [f['test_file'] for f in features] + preprocessor = DataPreprocessor(logger=logging) + preprocessor.loads([f['feature'] for f in features], paths) + X_test, y_test, feature_name = preprocessor.fuse() + emotion_name = emotions[args.emotion_id] + yb_test = preprocessor.get_binary_y_by_emotion(y_test, emotion_name) + + # predict + results = learner.predict(X_test, yb_test, weighted_score=True, X_predict_prob=True, auc=True) + + # ToDo: write a result file diff --git a/example/TFIDF_TSVD300+w2v300+BOP300.json b/example/TFIDF_TSVD300+w2v300+BOP300.json new file mode 100644 index 0000000..33fd96a --- /dev/null +++ b/example/TFIDF_TSVD300+w2v300+BOP300.json @@ -0,0 +1,17 @@ +[ + { + "feature": "TFIDF_TSVD300", + "train_dir": "/corpus/LJ40K/data/train/TFIDF_TSVD300/800p800n_Xy", + "test_file": "/corpus/LJ40K/data/test/TFIDF_TSVD300/full.Xy/TFIDF_TSVD300.full.Xy.test.npz" + }, + { + "feature": "w2v300", + "train_dir": "/corpus/LJ40K/data/train/w2v300/800p800n_Xy", + "test_file": "/corpus/LJ40K/data/test/w2v300/full.Xy/w2v300.full.Xy.test.npz" + }, + { + "feature": "BOP_mincount3_TSVD300", + "train_dir": "/corpus/LJ40K/data/train/BOP_mincount3_TSVD300/800p800n_Xy", + "test_file": "/corpus/LJ40K/data/test/BOP_mincount3_TSVD300/full.Xy/BOP_mincount3_TSVD300.full.Xy.test.npz" + } +] diff --git a/example/TFIDF_TSVD300+w2v300.json b/example/TFIDF_TSVD300+w2v300.json new file mode 100644 index 0000000..49ddc27 --- /dev/null +++ b/example/TFIDF_TSVD300+w2v300.json @@ -0,0 +1,12 @@ +[ + { + "feature": "TFIDF_TSVD300", + "train_dir": "/home/doug919/projects/data/MKLv2/2000samples_4/train/TFIDF_TSVD300/800p800n_Xy", + "test_file": "/home/doug919/projects/data/MKLv2/2000samples_4/TFIDF_TSVD300.Xy.train.npz" + }, + { + "feature": "w2v300", + "train_dir": 
def dump_model(self, file_name):
    '''
    Pickle the trained classifier (self.clf) into `file_name`.

    A ValueError raised while dumping is logged as an error and swallowed;
    other exceptions (e.g. an unwritable path) propagate to the caller.
    '''
    try:
        # pickle data is bytes: the file must be opened in binary mode, and
        # the context manager guarantees the handle is flushed and closed.
        with open(file_name, "wb") as model_file:
            pickle.dump(self.clf, model_file)
    except ValueError:
        self.logging.error("failed to dump %s" % (file_name))

def dump_scaler(self, file_name):
    '''
    Pickle the fitted scaler (self.scaler) into `file_name`.

    Nothing is written when scaling is disabled (self.scaling is false);
    a warning is logged instead. A ValueError raised while dumping is logged
    as an error and swallowed.
    '''
    try:
        if self.scaling:
            with open(file_name, "wb") as scaler_file:
                pickle.dump(self.scaler, scaler_file)
        else:
            self.logging.warning("scaler doesn't exist")
    except ValueError:
        self.logging.error("failed to dump %s" % (file_name))

def load_model(self, file_name):
    '''
    Load a pickled classifier from `file_name` into self.clf.

    A ValueError raised while loading is logged as an error and swallowed;
    other exceptions (e.g. a missing file) propagate to the caller.
    '''
    # NOTE(security): unpickling executes code embedded in the file; only
    # load model files that come from a trusted source.
    try:
        with open(file_name, "rb") as model_file:
            self.clf = pickle.load(model_file)
    except ValueError:
        self.logging.error("failed to load %s" % (file_name))

def load_scaler(self, file_name):
    '''
    Load a pickled scaler from `file_name` into self.scaler and switch
    scaling on (self.scaling = True) when the loaded object is truthy.

    A ValueError raised while loading is logged as an error and swallowed.
    '''
    # NOTE(security): unpickling executes code embedded in the file; only
    # load scaler files that come from a trusted source.
    try:
        with open(file_name, "rb") as scaler_file:
            self.scaler = pickle.load(scaler_file)
        if self.scaler:
            self.scaling = True
    except ValueError:
        self.logging.error("failed to load %s" % (file_name))
def get_feature_list(feature_list_file):
    '''
    Load a feature list description from a JSON file.

    feature_list_file: path of the JSON file to read
    return: the decoded JSON content; the example files shipped with the
            project hold a list of dicts with the keys "feature",
            "train_dir" and "test_file"
    Raises IOError if the file cannot be opened and ValueError if it does
    not contain valid JSON.
    '''
    # The context manager closes the file even when json.load() raises on
    # malformed input.
    with open(feature_list_file, 'r') as fp:
        return json.load(fp)