Skip to content

Commit

Permalink
Merge pull request #44 from IndicoDataSolutions/Chris/python3-compat
Browse files Browse the repository at this point in the history
ADD: python3 compat
  • Loading branch information
sihrc authored Sep 10, 2018
2 parents 4b8be6d + 1d5bdf1 commit af6e100
Show file tree
Hide file tree
Showing 10 changed files with 74 additions and 77 deletions.
5 changes: 0 additions & 5 deletions examples/gender.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@
trX, teX, trY, teY = load_gender_data(ntrain=10000) # Can increase up to 250K or so

tokenizer = Tokenizer(min_df=10, max_features=50000)
print trX[1] # see a blog example
trX = tokenizer.fit_transform(trX)
teX = tokenizer.transform(teX)
print tokenizer.n_features

layers = [
Embedding(size=128, n_features=tokenizer.n_features),
Expand All @@ -33,7 +31,6 @@
tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
te_acc = metrics.accuracy_score(teY, te_preds > 0.5)

print i, tr_acc, te_acc

save(model, 'save_test.pkl') # How to save

Expand All @@ -44,5 +41,3 @@

tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
te_acc = metrics.accuracy_score(teY, te_preds > 0.5)

print tr_acc, te_acc
10 changes: 5 additions & 5 deletions examples/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ def load_mnist(data_dir=None):
import gzip
url = 'http://yann.lecun.com/exdb/mnist/'
fnames = [
'train-images-idx3-ubyte.gz',
'train-labels-idx1-ubyte.gz',
't10k-images-idx3-ubyte.gz',
'train-images-idx3-ubyte.gz',
'train-labels-idx1-ubyte.gz',
't10k-images-idx3-ubyte.gz',
't10k-labels-idx1-ubyte.gz'
]
for fname in fnames:
if not os.path.isfile(fname):
print 'data_dir not given and file not local - downloading mnist file:', fname
print("data_dir not given and file not local - downloading mnist file:", fname)
urllib.urlretrieve(url+fname, fname)
data_dir = ''
fd = gzip.open(os.path.join(data_dir,'train-images-idx3-ubyte.gz'))
Expand All @@ -53,5 +53,5 @@ def load_mnist(data_dir=None):

trX = trX.reshape(-1, 28, 28)
teX = teX.reshape(-1, 28, 28)

return trX, teX, trY, teY
5 changes: 1 addition & 4 deletions examples/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

trX, teX, trY, teY = load_mnist()

#Use generic layer - RNN processes a size 28 vector at a time scanning from left to right
#Use generic layer - RNN processes a size 28 vector at a time scanning from left to right
layers = [
Generic(size=28),
GatedRecurrent(size=512, p_drop=0.2),
Expand All @@ -28,6 +28,3 @@

tr_acc = np.mean(trY[:len(teY)] == np.argmax(tr_preds, axis=1))
te_acc = np.mean(teY == np.argmax(te_preds, axis=1))

# Test accuracy should be between 98.9% and 99.3%
print 'train accuracy', tr_acc, 'test accuracy', te_acc
2 changes: 1 addition & 1 deletion passage/inits.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import theano
import theano.tensor as T

from theano_utils import sharedX, floatX, intX
from passage.theano_utils import sharedX, floatX, intX

def uniform(shape, scale=0.05):
    """
    Sample an array of the given shape uniformly from [-scale, scale) and
    hand it to sharedX.

    shape is forwarded as numpy's `size` argument.
    scale is the half-width of the sampling interval.
    """
    # NOTE(review): sharedX presumably wraps the array in a theano shared
    # variable - confirm against passage.theano_utils
    return sharedX(np.random.uniform(low=-scale, high=scale, size=shape))
Expand Down
22 changes: 11 additions & 11 deletions passage/iterators.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
import numpy as np

from utils import shuffle, iter_data
from theano_utils import floatX, intX
from passage.utils import shuffle, iter_data
from passage.theano_utils import floatX, intX

def padded(seqs):
    """
    Left-pad variable length sequences with zeros and stack them time-major.

    seqs is an iterable of sequences (lists, tuples or 1-d arrays).
    Each sequence is prefixed with 0s up to the length of the longest one,
    so the original items always end on the final row.

    Returns an array of shape (max_len, n_seqs).
    Raises ValueError if seqs is empty, since there is no longest sequence.
    """
    # Copy each sequence into a plain list so that `[0] * n + seq` always
    # concatenates (a tuple would raise, a numpy array would broadcast-add).
    seqs = [list(seq) for seq in seqs]
    # Lengths must be a real list: under python 3 `map` returns a one-shot
    # iterator that max() would exhaust before the zip below.
    lens = [len(seq) for seq in seqs]
    max_len = max(lens)
    seqs_padded = [
        [0] * (max_len - seq_len) + seq
        for seq, seq_len in zip(seqs, lens)
    ]
    # atleast_2d guarantees two axes for the transpose
    return np.atleast_2d(seqs_padded).transpose(1, 0)

class Linear(object):
"""
Useful for training on real valued data where first dimension is examples,
Useful for training on real valued data where first dimension is examples,
second dimension is to be iterated over, and third dimension is data vectors.
size is the number of examples per minibatch
Expand All @@ -42,7 +42,7 @@ def iterX(self, X):
yield xmb

def iterXY(self, X, Y):

if self.shuffle:
X, Y = shuffle(X, Y)

Expand Down Expand Up @@ -70,7 +70,7 @@ def iterX(self, X):
yield self.x_dtype(xmb)

def iterXY(self, X, Y):

if self.shuffle:
X, Y = shuffle(X, Y)

Expand All @@ -93,10 +93,10 @@ def iterX(self, X):
chunk_idxs = [chunk_idxs[idx] for idx in sort]
for xmb, idxmb in iter_data(x_chunk, chunk_idxs, size=self.size):
xmb = padded(xmb)
yield self.x_dtype(xmb), idxmb
yield self.x_dtype(xmb), idxmb

def iterXY(self, X, Y):

if self.shuffle:
X, Y = shuffle(X, Y)

Expand All @@ -108,4 +108,4 @@ def iterXY(self, X, Y):
mb_chunks = shuffle(mb_chunks)
for xmb, ymb in mb_chunks:
xmb = padded(xmb)
yield self.x_dtype(xmb), self.y_dtype(ymb)
yield self.x_dtype(xmb), self.y_dtype(ymb)
48 changes: 24 additions & 24 deletions passage/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from copy import deepcopy

from theano_utils import shared0s, floatX
import activations
import inits
from passage.theano_utils import shared0s, floatX
import passage.activations as activations
import passage.inits as inits

import numpy as np

Expand All @@ -24,7 +24,7 @@ def theano_one_hot(idx, n):
srng = RandomStreams()

class Generic(object):
"""
"""
Useful when processing real valued vectors see examples/mnist.py for example usage.
size is input dimensionality
Expand All @@ -39,7 +39,7 @@ def __init__(self, size, weights=None):
self.params = []

def output(self, dropout_active=False):
return self.input
return self.input

class Embedding(object):

Expand Down Expand Up @@ -100,10 +100,10 @@ def connect(self, l_in):
self.b_in = shared0s((self.size))
self.w_rec = self.init((self.size, self.size))
self.params = [self.h0, self.w_in, self.b_in, self.w_rec]

if self.weights is not None:
for param, weight in zip(self.params, self.weights):
param.set_value(floatX(weight))
param.set_value(floatX(weight))

def step(self, x_t, h_tm1, w):
h_t = self.activation(x_t + T.dot(h_tm1, w))
Expand Down Expand Up @@ -159,13 +159,13 @@ def connect(self, l_in):
self.u_o = self.init((self.size, self.size))
self.u_c = self.init((self.size, self.size))

self.params = [self.w_i, self.w_f, self.w_o, self.w_c,
self.u_i, self.u_f, self.u_o, self.u_c,
self.params = [self.w_i, self.w_f, self.w_o, self.w_c,
self.u_i, self.u_f, self.u_o, self.u_c,
self.b_i, self.b_f, self.b_o, self.b_c]

if self.weights is not None:
for param, weight in zip(self.params, self.weights):
param.set_value(floatX(weight))
param.set_value(floatX(weight))

def step(self, xi_t, xf_t, xo_t, xc_t, h_tm1, c_tm1, u_i, u_f, u_o, u_c):
i_t = self.gate_activation(xi_t + T.dot(h_tm1, u_i))
Expand All @@ -183,9 +183,9 @@ def output(self, dropout_active=False):
x_f = T.dot(X, self.w_f) + self.b_f
x_o = T.dot(X, self.w_o) + self.b_o
x_c = T.dot(X, self.w_c) + self.b_c
[out, cells], _ = theano.scan(self.step,
sequences=[x_i, x_f, x_o, x_c],
outputs_info=[T.alloc(0., X.shape[1], self.size), T.alloc(0., X.shape[1], self.size)],
[out, cells], _ = theano.scan(self.step,
sequences=[x_i, x_f, x_o, x_c],
outputs_info=[T.alloc(0., X.shape[1], self.size), T.alloc(0., X.shape[1], self.size)],
non_sequences=[self.u_i, self.u_f, self.u_o, self.u_c],
truncate_gradient=self.truncate_gradient
)
Expand All @@ -198,7 +198,7 @@ class GatedRecurrent(object):

def __init__(self, size=256, activation='tanh', gate_activation='steeper_sigmoid', init='orthogonal', truncate_gradient=-1, seq_output=False, p_drop=0., direction='forward', weights=None):
self.settings = locals()
del self.settings['self']
del self.settings['self']
self.activation_str = activation
self.activation = getattr(activations, activation)
self.gate_activation = getattr(activations, gate_activation)
Expand All @@ -225,19 +225,19 @@ def connect(self, l_in):
self.b_r = shared0s((self.size))

if 'maxout' in self.activation_str:
self.w_h = self.init((self.n_in, self.size*2))
self.w_h = self.init((self.n_in, self.size*2))
self.u_h = self.init((self.size, self.size*2))
self.b_h = shared0s((self.size*2))
else:
self.w_h = self.init((self.n_in, self.size))
self.w_h = self.init((self.n_in, self.size))
self.u_h = self.init((self.size, self.size))
self.b_h = shared0s((self.size))
self.b_h = shared0s((self.size))

self.params = [self.h0, self.w_z, self.w_r, self.w_h, self.u_z, self.u_r, self.u_h, self.b_z, self.b_r, self.b_h]

if self.weights is not None:
for param, weight in zip(self.params, self.weights):
param.set_value(floatX(weight))
param.set_value(floatX(weight))


def step(self, xz_t, xr_t, xh_t, h_tm1, u_z, u_r, u_h):
Expand All @@ -256,16 +256,16 @@ def output(self, dropout_active=False):
x_z = T.dot(X, self.w_z) + self.b_z
x_r = T.dot(X, self.w_r) + self.b_r
x_h = T.dot(X, self.w_h) + self.b_h
out, _ = theano.scan(self.step,
sequences=[x_z, x_r, x_h],
outputs_info=[repeat(self.h0, x_h.shape[1], axis=0)],
out, _ = theano.scan(self.step,
sequences=[x_z, x_r, x_h],
outputs_info=[repeat(self.h0, x_h.shape[1], axis=0)],
non_sequences=[self.u_z, self.u_r, self.u_h],
truncate_gradient=self.truncate_gradient
)
if self.seq_output:
return out
else:
return out[-1]
return out[-1]

class Dense(object):
def __init__(self, size=256, activation='rectify', init='orthogonal', p_drop=0., weights=None):
Expand All @@ -288,10 +288,10 @@ def connect(self, l_in):
self.w = self.init((self.n_in, self.size))
self.b = shared0s((self.size))
self.params = [self.w, self.b]

if self.weights is not None:
for param, weight in zip(self.params, self.weights):
param.set_value(floatX(weight))
param.set_value(floatX(weight))

def output(self, pre_act=False, dropout_active=False):
X = self.l_in.output(dropout_active=dropout_active)
Expand Down
28 changes: 17 additions & 11 deletions passage/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,40 @@
import numpy as np
from time import time

import costs
import updates
import iterators
from utils import case_insensitive_import, save
from preprocessing import LenFilter, standardize_targets
import passage.costs as costs
import passage.updates as updates
import passage.iterators as iterators

from passage.utils import case_insensitive_import, save
from passage.preprocessing import LenFilter, standardize_targets

def flatten(l):
    """
    Flatten one level of nesting.

    l is an iterable of iterables; the items of each inner iterable are
    concatenated, in order, into a single new list. Deeper nesting is left
    untouched.
    """
    return [item for sublist in l for item in sublist]

# String type(s) accepted by isinstance checks on both python 2 and 3.
# `basestring` only exists on python 2; referencing it on python 3 raises
# NameError, in which case fall back to the python 3 text and bytes types.
try:
    BaseString = basestring
except NameError:
    BaseString = (str, bytes)

class RNN(object):

def __init__(self, layers, cost, updater='Adam', verbose=2, Y=T.matrix(), iterator='SortedPadded'):
self.settings = locals()
del self.settings['self']
self.layers = layers

if isinstance(cost, basestring):
if isinstance(cost, BaseString):
self.cost = case_insensitive_import(costs, cost)
else:
self.cost = cost

if isinstance(updater, basestring):
if isinstance(updater, BaseString):
self.updater = case_insensitive_import(updates, updater)()
else:
self.updater = updater

if isinstance(iterator, basestring):
if isinstance(iterator, BaseString):
self.iterator = case_insensitive_import(iterators, iterator)()
else:
self.iterator = iterator
Expand Down Expand Up @@ -73,7 +80,6 @@ def fit(self, trX, trY, batch_size=64, n_epochs=1, len_filter=LenFilter(), snaps
trY = standardize_targets(trY, cost=self.cost)

n = 0.
stats = []
t = time()
costs = []
for e in range(n_epochs):
Expand All @@ -92,11 +98,11 @@ def fit(self, trX, trY, batch_size=64, n_epochs=1, len_filter=LenFilter(), snaps

status = "Epoch %d Seen %d samples Avg cost %0.4f Time elapsed %d seconds" % (e, n, np.mean(epoch_costs[-250:]), time() - t)
if self.verbose >= 2:
sys.stdout.write("\r"+status)
sys.stdout.write("\r"+status)
sys.stdout.flush()
sys.stdout.write("\n")
elif self.verbose == 1:
print status
print(status)
if path and e % snapshot_freq == 0:
save(self, "{0}.{1}".format(path, e))
return costs
Expand Down
4 changes: 2 additions & 2 deletions passage/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import string
from collections import Counter

import numpy as np
import theano
import theano.tensor as T
import string
from collections import Counter

punctuation = set(string.punctuation)
punctuation.add('\n')
Expand Down
4 changes: 2 additions & 2 deletions passage/updates.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import theano.tensor as T
import numpy as np

from theano_utils import shared0s, floatX
from passage.theano_utils import shared0s, floatX

def clip_norm(g, c, n):
if c > 0:
Expand Down Expand Up @@ -176,7 +176,7 @@ def get_updates(self, params, cost):
p_t = p - (self.lr / T.sqrt(acc_t + self.epsilon)) * g
p_t = self.regularizer.weight_regularize(p_t)
updates.append((p, p_t))
return updates
return updates

class Adadelta(Update):

Expand Down
Loading

0 comments on commit af6e100

Please sign in to comment.