split emotion patterns

doug919 · Feb 22, 2015 · 8e6cc20 · 8e6cc20
1 parent 85eeeba
commit 8e6cc20
Show file tree

Hide file tree

Showing 6 changed files with 335 additions and 108 deletions.
diff --git a/batch/batchFetchPatterns.py b/batch/batchFetchPatterns.py
@@ -1,12 +1,42 @@
+from __future__ import print_function
 import sys
 import logging
 sys.path.append('..')
-from feelit.patterns import PatternFetcher
+from feelit.features import PatternFetcher
+import numpy as np
+import argparse
+
+
+def get_arguments(argv):
+
+    parser = argparse.ArgumentParser(description='fetch patterns from MongoDB')
+    parser.add_argument('output_file', metavar='output_file', 
+                        help='File name of the ouput .npa file')
+    parser.add_argument('-v', '--verbose', action='store_true', default=False, 
+                        help='show messages')
+    parser.add_argument('-d', '--debug', action='store_true', default=False, 
+                        help='show debug messages')
+    args = parser.parse_args(argv)
+    return args
+
+def update_progress_bar(n_cur, n_total, bar_length=50):
+
+    percent = float(n_cur) / n_total
+    hashes = '#' * int(round(percent * bar_length))
+    spaces = ' ' * (bar_length - len(hashes))
+    print('\rPercent: [{0}] {1}%'.format(hashes + spaces, int(round(percent * 100))), end='')
+
 
 if __name__ == '__main__':
 
-
-    loglevel = logging.DEBUG
+    args = get_arguments(sys.argv[1:])
+
+    if args.debug:
+        loglevel = logging.DEBUG
+    elif args.verbose:
+        loglevel = logging.INFO
+    else:
+        loglevel = logging.ERROR
     logging.basicConfig(format='[%(levelname)s] %(message)s', level=loglevel) 
 
     pf = PatternFetcher(logger=logging)
@@ -18,14 +48,21 @@
 
 
     logging.info('forming patterns')
+    X = []
+    y = []
     min_count = 1
     weighted = True    
     for udocId, emotion in docs:
 
+        update_progress_bar(udocId, len(docs))
+
         pattern_freq_vec = pf.get_pattern_freq_by_udocId(udocId, min_count, weighted)
 
         # sum vectors horizontally
-        import pdb; pdb.set_trace()
-
-
+        sum_vec = pf.sum_pattern_freq_vector(pattern_freq_vec)
+
+        X.append(sum_vec)
+        y.append(emotion)
 
+    logging.info('save to "%s"' % (args.output_file))
+    np.savez_compressed(args.output_file, X=np.array(X), y=np.array(y))
diff --git a/batch/batchSplitEmotion.py b/batch/batchSplitEmotion.py
@@ -0,0 +1,63 @@
+import sys
+import logging
+sys.path.append('..')
+from feelit.features import FileSplitter
+import numpy as np
+import argparse
+import pickle
+
+
+def get_arguments(argv):
+
+    parser = argparse.ArgumentParser(description='split data in each emotion into the .npz file')
+    parser.add_argument('-b', '--begin', metavar='RANGE_BEGIN', type=int, default=0, 
+                        help='The begining index of the data in each emotion (DEFAULT: 0)')
+    parser.add_argument('-e', '--end', metavar='RANGE_END', type=int, default=1000, 
+                        help='The end index of the data in each emotion (DEFAULT: 1000)')
+    parser.add_argument('-p', '--pickle_file', metavar='FILE_NAME', default='', 
+                        help='pickle file generated by using feelit.utils.random_idx and pickle')
+    parser.add_argument('-x', '--output_ext', metavar='EXT', default='.npz', 
+                        help='output files\' extension; only useful when specified "-s" (DEFAULT: ".npz")')
+    parser.add_argument('input', metavar='input', 
+                        help='File name of the input data')
+    parser.add_argument('output', metavar='output', 
+                        help='File name or prefix for output data')
+    parser.add_argument('-s', '--split', action='store_true', default=False, 
+                        help='separate output file by emotions')
+    parser.add_argument('-v', '--verbose', action='store_true', default=False, 
+                        help='show messages')
+    parser.add_argument('-d', '--debug', action='store_true', default=False, 
+                        help='show debug messages')
+    args = parser.parse_args(argv)
+    return args
+
+
+if __name__ == '__main__':
+
+    args = get_arguments(sys.argv[1:])
+
+    if args.debug:
+        loglevel = logging.DEBUG
+    elif args.verbose:
+        loglevel = logging.INFO
+    else:
+        loglevel = logging.ERROR
+    logging.basicConfig(format='[%(levelname)s] %(message)s', level=loglevel)     
+
+    splitter = FileSplitter(logger=logging)
+
+    splitter.load(args.input)
+
+    splitter.split(args.begin, args.end)
+
+    if args.pickle_file != '':
+        idxs = pickle.load(open(args.pickle_file))
+        splitter.merge_negatives(idxs)
+
+    if args.split:
+        splitter.dump_by_emotions(args.output, args.output_ext)
+    else:
+        splitter.dump(args.output)
+
+
+
diff --git a/example/pattern40.json b/example/pattern40.json
@@ -0,0 +1,7 @@
+[
+    {
+        "feature": "pattern40",
+        "train_dir": "/home/doug919/projects/data/MKLv2/2000samples_4/train/pattern40/800p800n_Xy",
+        "test_file": "/home/doug919/projects/data/MKLv2/2000samples_4/test_8000/pattern40/full.Xy/pattern40.full.Xy.test.npz"
+    }
+]
diff --git a/feelit/features.py b/feelit/features.py
@@ -2,6 +2,9 @@
 
 ##########################################
 # classes:
+#   feelit > features > PatternFetcher
+#   feelit > features > FileSplitter
+#   feelit > features > DataPreprocessor
 #   feelit > features > LoadFile
 #   feelit > features > FetchMongo
 #   feelit > features > DimensionReduction
@@ -20,6 +23,9 @@
 from sklearn.preprocessing import StandardScaler
 from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, BaseNB
 from sklearn.metrics import roc_curve, auc
+from random import randint
+import pymongo
+from operator import add
 
 '''
 def load(path, fields="ALL"):
@@ -82,6 +88,221 @@ def dump(path, **kwargs):
     np.savez_compressed(path, **kwargs)
 '''
 
+class PatternFetcher(object):
+    """
+    See batchFetchPatterns.py for example usage
+    """
+
+    def __init__(self, **kwargs):
+        """
+        options:
+            logger          : logging instance
+            mongo_addr      : mongo db host address                     (DEFAULT: 'doraemon.iis.sinica.edu.tw')
+            db              : database name                             (DEFAULT: 'LJ40K')
+            lexicon         : pattern frequency collection              (DEFAULT: 'lexicon.nested')
+            pats            : patterns related to all the documents     (DEFAULT: 'pats')
+            docs            : map of udocId and emotions                (DEFAULT: 'docs')
+        """
+
+        ## process args
+        if 'logger' in kwargs and kwargs['logger']:
+            self.logging = kwargs['logger']
+        else:
+            logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.ERROR)  
+            self.logging = logging
+
+        ## mongodb settings
+        mongo_addr = 'doraemon.iis.sinica.edu.tw' if 'mongo_addr' not in kwargs else kwargs['mongo_addr']
+
+        ## default collection name
+        self.db = 'LJ40K' if 'db' not in kwargs else kwargs['db']
+
+        lexicon = 'lexicon.nested' if 'lexicon' not in kwargs else kwargs['lexicon']
+        pats = 'pats' if 'pats' not in kwargs else kwargs['pats']
+        docs = 'docs' if 'docs' not in kwargs else kwargs['docs']
+
+        ### connect to mongodb
+        self.mongo_client = pymongo.MongoClient(mongo_addr)
+
+        self.collection_pattern_freq = self.mongo_client[self.db][lexicon]
+        self.collection_patterns = self.mongo_client[self.db][pats]
+        self.collection_docs = self.mongo_client[self.db][docs]
+
+        color_order = self.mongo_client['feelit']['color.order']
+        self.emotion_list = color_order.find_one({ 'order': 'group-maxis'})['emotion']
+
+    def get_all_doc_labels(self, sort=True):
+        """
+        parameters:
+            sort: True/False; whether to sort the result by udocID
+        return:
+            [(udocId0, emotion0), ...], sorted by udocID only when sort is True
+        """
+        docs = [(doc['udocID'], doc['emotion']) for doc in self.collection_docs.find().batch_size(1024)]
+
+        if sort:
+            docs = sorted(docs, key=lambda x:x[0] )
+        return docs
+
+    def get_pattern_freq_by_udocId(self, udocId, min_count=1, weighted=True):
+
+        """
+        parameters:
+            udocId: the udocID of the document whose patterns are fetched
+            min_count: patterns whose total corpus frequency is <= min_count are skipped
+        """
+
+        pattern_freq_vec = {}
+        mdocs = self.collection_patterns.find({'udocID': udocId}, {'_id':0, 'pattern':1, 'usentID': 1, 'weight':1}).sort('usentID', 1).batch_size(512)
+
+        for mdoc in mdocs:
+
+            pat = mdoc['pattern'].lower()
+            freq_vec = self.collection_pattern_freq.find_one({'pattern': pat}) 
+
+            # filter patterns' corpus frequency <= min_count 
+            if not freq_vec:
+                self.logging.warning('pattern freq of "%s" is not found' % (pat))
+                continue
+            elif sum(freq_vec['count'].values()) <= min_count:
+                self.logging.warning('pattern freq of "%s" <= %d' % (pat, min_count))
+                continue
+
+            # build freq vector with all emotions
+            weighted_freq_vec = {}
+            for e in self.emotion_list:
+                if e not in freq_vec['count']: 
+                    freq_vec['count'][e] = 0.0
+
+                w = mdoc['weight'] if weighted else 1.0
+                weighted_freq_vec[e] = freq_vec['count'][e] * w
+
+            pattern_freq_vec[pat] = weighted_freq_vec
+
+        return pattern_freq_vec
+
+    def sum_pattern_freq_vector(self, pf):
+
+        sum_vec = [0] * len(self.emotion_list)
+
+        for freq_vec in pf.values():
+
+            temp_vec = []
+            for e in freq_vec:
+                temp_vec.append(freq_vec[e])
+
+            sum_vec = map(add, sum_vec, temp_vec)
+
+        return sum_vec
+
+
+class FileSplitter(object):
+    """
+    see batchSplitEmotion.py for usage
+    """
+
+    def __init__(self, **kwargs):    
+
+        if 'logger' in kwargs and kwargs['logger']:
+            self.logging = kwargs['logger']
+        else:
+            logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.ERROR)  
+            self.logging = logging
+
+    def load(self, file_path):
+        """
+        parameters:
+            file_path: input data path
+        """
+        data = np.load(file_path)
+
+        self.X = data['X']
+        self.y = data['y']
+
+    def split(self, begin, end, samples_in_each_emotion=1000):
+        """
+        parameters:
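+            begin: start offset (inclusive) of the slice taken inside each emotion's block of rows
+            end: end offset (exclusive) of the slice taken inside each emotion's block of rows
+            samples_in_each_emotion: number of consecutive rows per emotion in the loaded X/y (DEFAULT: 1000)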
+        """
+
+        if begin < 0 or end > samples_in_each_emotion:
+            return False
+
+        # we assume the input is ordered by emotion, with samples_in_each_emotion rows for every emotion
+        n_emotion = self.X.shape[0]/samples_in_each_emotion
+
+        self.X_sub = []
+        self.y_sub = []
+
+        for i in range(n_emotion):
+
+            temp_begin = begin + i * samples_in_each_emotion
+            temp_end = end + i * samples_in_each_emotion
+
+            self.X_sub += self.X[temp_begin: temp_end].tolist()
+            self.y_sub += self.y[temp_begin: temp_end].tolist()
+
+    def _subsample_by_idx(self, X, y, idxs):
+        """
+        subsample a 2-D array by row index
+        """
+        _X, _y = [], []
+        for i in idxs:
+            _X.append(X[i])
+            _y.append(y[i])
+
+        return _X, _y
+
+    def merge_negatives(self, idx_dict):
+        """
+        idx_dict: {'emotion': [(index, 'sample_emotion'), ...], ...}; index is a row of self.X_sub / self.y_sub
+            e.g., {'tired': [(31201, 'tired'), (100, 'happy')]}
+        """
+
+        self.X_dict = {}
+        self.y_dict = {}
+
+        for i_label, label in enumerate(idx_dict):
+
+            idxs = [i for i,l in idx_dict[label]]
+            self.X_dict[label], self.y_dict[label] = self._subsample_by_idx(self.X_sub, self.y_sub, idxs)
+
+    def _binary_label(self, y, emotion):
+        return [1 if e == emotion else -1 for e in y]
+
+    def dump_by_emotions(self, file_prefix, ext):
+        """
+        save each emotion in self.X_dict, with binary (1/-1) labels, to <file_prefix>.<emotion><ext>
+        """
+        for key, value in self.X_dict.iteritems():
+
+            yb = self._binary_label(self.y_dict[key], key)
+
+            # TODO: .train.npz is a hidden naming rule which should be eliminated
+            fname = file_prefix + '.' + key + ext
+
+            self.logging.debug("dumping X, y to %s" % (fname))
+            np.savez_compressed(fname, X=np.array(value), y=np.array(yb))
+
+
+    def dump(self, file_path, **kwargs):
+        """
+        parameters:
+            file_path: output data path
+
+        option:
+            X: output data X
+            y: output data y
+        """
+        out_X = self.X_sub if 'X' not in kwargs else kwargs['X']
+        out_y = self.y_sub if 'y' not in kwargs else kwargs['y']
+
+        self.logging.debug("dumping X, y to %s" % (file_path))
+        np.savez_compressed(file_path, X=np.array(out_X), y=np.array(out_y))
+
+
 class LoadFile(object):
     """
     Fetch features from files