
split emotion patterns
doug919 committed Feb 22, 2015
1 parent 85eeeba commit 8e6cc20
Showing 6 changed files with 335 additions and 108 deletions.
49 changes: 43 additions & 6 deletions batch/batchFetchPatterns.py
@@ -1,12 +1,42 @@
from __future__ import print_function
import sys
import logging
sys.path.append('..')
from feelit.features import PatternFetcher
import numpy as np
import argparse


def get_arguments(argv):

parser = argparse.ArgumentParser(description='fetch patterns from MongoDB')
parser.add_argument('output_file', metavar='output_file',
help='File name of the output .npz file')
parser.add_argument('-v', '--verbose', action='store_true', default=False,
help='show messages')
parser.add_argument('-d', '--debug', action='store_true', default=False,
help='show debug messages')
args = parser.parse_args(argv)
return args

def update_progress_bar(n_cur, n_total, bar_length=50):

percent = float(n_cur) / n_total
hashes = '#' * int(round(percent * bar_length))
spaces = ' ' * (bar_length - len(hashes))
print('\rPercent: [{0}] {1}%'.format(hashes + spaces, int(round(percent * 100))), end='')


if __name__ == '__main__':


args = get_arguments(sys.argv[1:])

if args.debug:
loglevel = logging.DEBUG
elif args.verbose:
loglevel = logging.INFO
else:
loglevel = logging.ERROR
logging.basicConfig(format='[%(levelname)s] %(message)s', level=loglevel)

pf = PatternFetcher(logger=logging)
@@ -18,14 +48,21 @@


logging.info('forming patterns')
X = []
y = []
min_count = 1
weighted = True
for udocId, emotion in docs:

update_progress_bar(udocId, len(docs))

pattern_freq_vec = pf.get_pattern_freq_by_udocId(udocId, min_count, weighted)

# sum vectors horizontally


sum_vec = pf.sum_pattern_freq_vector(pattern_freq_vec)

X.append(sum_vec)
y.append(emotion)

logging.info('save to "%s"' % (args.output_file))
np.savez_compressed(args.output_file, X=np.array(X), y=np.array(y))
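The script stores one summed pattern-emotion frequency vector per document in X and the matching emotion label in y, so the result is an ordinary .npz archive. A minimal sketch of reading it back, assuming a hypothetical output file name patterns.npz:

import numpy as np

data = np.load('patterns.npz')  # hypothetical output file name
X = data['X']  # one row per document, one column per emotion in the collection's emotion list
y = data['y']  # one emotion label per document
print(X.shape, y.shape)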
63 changes: 63 additions & 0 deletions batch/batchSplitEmotion.py
@@ -0,0 +1,63 @@
import sys
import logging
sys.path.append('..')
from feelit.features import FileSplitter
import numpy as np
import argparse
import pickle


def get_arguments(argv):

parser = argparse.ArgumentParser(description='split data in each emotion into the .npz file')
parser.add_argument('-b', '--begin', metavar='RANGE_BEGIN', type=int, default=0,
help='The beginning index of the data in each emotion (DEFAULT: 0)')
parser.add_argument('-e', '--end', metavar='RANGE_END', type=int, default=1000,
help='The end index of the data in each emotion (DEFAULT: 1000)')
parser.add_argument('-p', '--pickle_file', metavar='FILE_NAME', default='',
help='pickle file generated by using feelit.utils.random_idx and pickle')
parser.add_argument('-x', '--output_ext', metavar='EXT', default='.npz',
help='output files\' extension; only used when "-s" is specified (DEFAULT: ".npz")')
parser.add_argument('input', metavar='input',
help='File name of the input data')
parser.add_argument('output', metavar='output',
help='File name or prefix for output data')
parser.add_argument('-s', '--split', action='store_true', default=False,
help='separate output file by emotions')
parser.add_argument('-v', '--verbose', action='store_true', default=False,
help='show messages')
parser.add_argument('-d', '--debug', action='store_true', default=False,
help='show debug messages')
args = parser.parse_args(argv)
return args


if __name__ == '__main__':

args = get_arguments(sys.argv[1:])

if args.debug:
loglevel = logging.DEBUG
elif args.verbose:
loglevel = logging.INFO
else:
loglevel = logging.ERROR
logging.basicConfig(format='[%(levelname)s] %(message)s', level=loglevel)

splitter = FileSplitter(logger=logging)

splitter.load(args.input)

splitter.split(args.begin, args.end)

if args.pickle_file != '':
idxs = pickle.load(open(args.pickle_file, 'rb'))
splitter.merge_negatives(idxs)

if args.split:
splitter.dump_by_emotions(args.output, args.output_ext)
else:
splitter.dump(args.output)
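The -p option expects a pickle file of per-emotion index lists in the format documented by FileSplitter.merge_negatives ({'emotion': [(index, 'sample_emotion'), ...]}). A minimal sketch of writing such a file by hand, without going through feelit.utils.random_idx (whose interface is not part of this commit); indices, emotion names, and file names are invented for illustration:

import pickle

# hypothetical per-emotion index dict in the merge_negatives format
idxs = {
    'happy': [(12, 'happy'), (31201, 'tired')],
    'tired': [(31201, 'tired'), (100, 'happy')],
}
with open('idxs.pkl', 'wb') as f:
    pickle.dump(idxs, f)

# example invocation (hypothetical file names):
#   python batchSplitEmotion.py -p idxs.pkl -s input.npz output_prefix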



7 changes: 7 additions & 0 deletions example/pattern40.json
@@ -0,0 +1,7 @@
[
{
"feature": "pattern40",
"train_dir": "/home/doug919/projects/data/MKLv2/2000samples_4/train/pattern40/800p800n_Xy",
"test_file": "/home/doug919/projects/data/MKLv2/2000samples_4/test_8000/pattern40/full.Xy/pattern40.full.Xy.test.npz"
}
]
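This configuration lists, for the pattern40 feature, the directory of per-emotion training files and the full test matrix. A minimal sketch of consuming it (the relative path is an assumption):

import json

with open('example/pattern40.json') as f:
    configs = json.load(f)

for cfg in configs:
    print(cfg['feature'], cfg['train_dir'], cfg['test_file'])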
221 changes: 221 additions & 0 deletions feelit/features.py
@@ -2,6 +2,9 @@

##########################################
# classes:
# feelit > features > PatternFetcher
# feelit > features > FileSplitter
# feelit > features > DataPreprocessor
# feelit > features > LoadFile
# feelit > features > FetchMongo
# feelit > features > DimensionReduction
@@ -20,6 +23,9 @@
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, BaseNB
from sklearn.metrics import roc_curve, auc
from random import randint
import pymongo
from operator import add

'''
def load(path, fields="ALL"):
@@ -82,6 +88,221 @@ def dump(path, **kwargs):
np.savez_compressed(path, **kwargs)
'''

class PatternFetcher(object):
"""
See batchFetchPatterns.py for example usage
"""

def __init__(self, **kwargs):
"""
options:
logger : logging instance
mongo_addr : MongoDB address (DEFAULT: 'doraemon.iis.sinica.edu.tw')
db : database name (DEFAULT: 'LJ40K')
lexicon : pattern frequency collection (DEFAULT: 'lexicon.nested')
pats : patterns related to all the documents (DEFAULT: 'pats')
docs : map of udocId and emotions (DEFAULT: 'docs')
"""

## process args
if 'logger' in kwargs and kwargs['logger']:
self.logging = kwargs['logger']
else:
logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.ERROR)
self.logging = logging

## mongodb settings
mongo_addr = 'doraemon.iis.sinica.edu.tw' if 'mongo_addr' not in kwargs else kwargs['mongo_addr']

## default collection name
self.db = 'LJ40K' if 'db' not in kwargs else kwargs['db']

lexicon = 'lexicon.nested' if 'lexicon' not in kwargs else kwargs['lexicon']
pats = 'pats' if 'pats' not in kwargs else kwargs['pats']
docs = 'docs' if 'docs' not in kwargs else kwargs['docs']

### connect to mongodb
self.mongo_client = pymongo.MongoClient(mongo_addr)

self.collection_pattern_freq = self.mongo_client[self.db][lexicon]
self.collection_patterns = self.mongo_client[self.db][pats]
self.collection_docs = self.mongo_client[self.db][docs]

color_order = self.mongo_client['feelit']['color.order']
self.emotion_list = color_order.find_one({ 'order': 'group-maxis'})['emotion']

def get_all_doc_labels(self, sort=True):
"""
parameters:
sort: True/False; sorting by docId
return:
[(udocId0, emotion0), ...], which is sorted by udocId
"""
"""
docs = [(doc['udocID'], doc['emotion']) for doc in self.collection_docs.find().batch_size(1024)]

if sort:
docs = sorted(docs, key=lambda x:x[0] )
return docs

def get_pattern_freq_by_udocId(self, udocId, min_count=1, weighted=True):

"""
parameters:
udocId: the id you want
min_count: patterns whose total corpus frequency is <= min_count are skipped
weighted: if True, multiply each emotion count by the pattern's weight
"""

pattern_freq_vec = {}
mdocs = self.collection_patterns.find({'udocID': udocId}, {'_id':0, 'pattern':1, 'usentID': 1, 'weight':1}).sort('usentID', 1).batch_size(512)

for mdoc in mdocs:

pat = mdoc['pattern'].lower()
freq_vec = self.collection_pattern_freq.find_one({'pattern': pat})

# filter patterns' corpus frequency <= min_count
if not freq_vec:
self.logging.warning('pattern freq of "%s" is not found' % (pat))
continue
elif sum(freq_vec['count'].values()) <= min_count:
self.logging.warning('pattern freq of "%s" <= %d' % (pat, min_count))
continue

# build freq vector with all emotions
weighted_freq_vec = {}
for e in self.emotion_list:
if e not in freq_vec['count']:
freq_vec['count'][e] = 0.0

w = mdoc['weight'] if weighted else 1.0
weighted_freq_vec[e] = freq_vec['count'][e] * w

pattern_freq_vec[pat] = weighted_freq_vec

return pattern_freq_vec

def sum_pattern_freq_vector(self, pf):
"""
sum the per-pattern frequency vectors element-wise; the result is ordered by self.emotion_list
"""

sum_vec = [0.0] * len(self.emotion_list)

for freq_vec in pf.values():

# iterate emotions in a fixed order so every pattern's vector lines up with the same emotion
temp_vec = [freq_vec[e] for e in self.emotion_list]

sum_vec = map(add, sum_vec, temp_vec)

return sum_vec
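# For illustration (values invented): get_pattern_freq_by_udocId returns one
# emotion-count dict per matched pattern, and sum_pattern_freq_vector adds them
# element-wise into a single vector ordered by self.emotion_list, e.g.
#   pf = {'feel tired': {'tired': 3.0, 'happy': 0.0},
#         'so happy':   {'tired': 0.5, 'happy': 2.0}}
#   sum_pattern_freq_vector(pf) -> [3.5, 2.0]   # assuming emotion_list == ['tired', 'happy']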


class FileSplitter(object):
"""
see batchSplitEmotion.py for usage
"""

def __init__(self, **kwargs):

if 'logger' in kwargs and kwargs['logger']:
self.logging = kwargs['logger']
else:
logging.basicConfig(format='[%(levelname)s] %(message)s', level=logging.ERROR)
self.logging = logging

def load(self, file_path):
"""
parameters:
file_path: input data path
"""
data = np.load(file_path)

self.X = data['X']
self.y = data['y']

def split(self, begin, end, samples_in_each_emotion=1000):
"""
parameters:
begin: start index (inclusive) within each emotion's block of samples
end: end index (exclusive) within each emotion's block of samples
samples_in_each_emotion: number of consecutive samples per emotion in the input (DEFAULT: 1000)
"""

if begin < 0 or end > samples_in_each_emotion:
return False

# we assume the input is ordered by emotion, with samples_in_each_emotion consecutive samples per emotion
n_emotion = self.X.shape[0]/samples_in_each_emotion

self.X_sub = []
self.y_sub = []

for i in range(n_emotion):

temp_begin = begin + i * samples_in_each_emotion
temp_end = end + i * samples_in_each_emotion

self.X_sub += self.X[temp_begin: temp_end].tolist()
self.y_sub += self.y[temp_begin: temp_end].tolist()

def _subsample_by_idx(self, X, y, idxs):
"""
subsample a 2-D array by row index
"""
_X, _y = [], []
for i in idxs:
_X.append(X[i])
_y.append(y[i])

return _X, _y

def merge_negatives(self, idx_dict):
"""
idx_dict: {'emotion': [(index, 'sample_emotion')], ...}
e.g., {'tired': [(31201, 'tired'), (100, 'happy')]}
"""

self.X_dict = {}
self.y_dict = {}

for label in idx_dict:

idxs = [i for i,l in idx_dict[label]]
self.X_dict[label], self.y_dict[label] = self._subsample_by_idx(self.X_sub, self.y_sub, idxs)

def _binary_label(self, y, emotion):
return [1 if e == emotion else -1 for e in y]

def dump_by_emotions(self, file_prefix, ext):
"""
save each emotion's X and binary y to <file_prefix>.<emotion><ext>
"""
for key, value in self.X_dict.iteritems():

yb = self._binary_label(self.y_dict[key], key)

# TODO: .train.npz is a hidden naming rule which should be eliminated
fname = file_prefix + '.' + key + ext

self.logging.debug("dumping X, y to %s" % (fname))
np.savez_compressed(fname, X=np.array(value), y=np.array(yb))
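# For illustration (names invented): dump_by_emotions('out/pattern40', '.npz') would write
# out/pattern40.happy.npz, out/pattern40.tired.npz, ..., one file per emotion key in
# self.X_dict, with y holding 1 for that emotion's samples and -1 for the negatives.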


def dump(self, file_path, **kwargs):
"""
parameters:
file_path: output data path
option:
X: output data X
y: output data y
"""
out_X = self.X_sub if 'X' not in kwargs else kwargs['X']
out_y = self.y_sub if 'y' not in kwargs else kwargs['y']

self.logging.debug("dumping X, y to %s" % (file_path))
np.savez_compressed(file_path, X=np.array(out_X), y=np.array(out_y))


class LoadFile(object):
"""
Fetch features from files