-
Notifications
You must be signed in to change notification settings - Fork 6
/
statistics.py
78 lines (65 loc) · 2.78 KB
/
statistics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
'''
Data-statistics script for the Localized Factorization Machines
(TensorFlow) project: counts per-feature occurrences and positive-label
occurrences across the combined train/validation/test splits.
'''
import os
import argparse
import LoadData as DATA
import numpy as np
import tensorflow as tf
from time import time
import params
# Active dataset configuration taken from params; current[0] is the dataset
# name and current[4] a loading flag — presumably the "frappe" settings
# tuple; verify against params.py.
current = params.frappe_l
def parse_args():
    """Parse command-line arguments for the statistics script.

    Returns:
        argparse.Namespace with attributes: buckets, checkpointDir,
        summaryDir, path, dataset, num_class.
    """
    parser = argparse.ArgumentParser(description="Data Statistics.")
    parser.add_argument('--buckets', nargs='?', default='buckets/',
                        help='Running data path - for online storage system.')
    parser.add_argument('--checkpointDir', default='checkpoint/',
                        help='checkpoint')
    parser.add_argument('--summaryDir', default='summary/',
                        help='summary')
    parser.add_argument('--path', nargs='?', default='running_data/',
                        help='Input data path.')
    # Default dataset name comes from the params-selected config tuple.
    parser.add_argument('--dataset', nargs='?', default=current[0],
                        help='Choose a dataset.')
    # Fixed copy-pasted help text: the old string described a batch-norm
    # flag ("Whether to perform batch normaization (0 or 1)"), which is
    # unrelated to this argument.
    parser.add_argument('--num_class', type=int, default=1,
                        help='Number of output classes.')
    return parser.parse_args()
def _print_bucket_stats(title, counts):
    """Print a power-of-two histogram of per-feature counts.

    For each bucket edge num = 2, 4, ..., 2**29 prints
    ``num: count, count_sum`` where count_sum is the number of features
    with a nonzero count <= num and count is the increment over the
    previous bucket. Zero counts are excluded by count_nonzero.
    """
    print('\n' + title)
    prev = 0
    for num in [2 ** n for n in range(1, 30)]:
        count_sum = np.count_nonzero(counts[counts <= num])
        count = count_sum - prev
        prev = count_sum
        print('%d: %d, %d' % (num, count, count_sum))


if __name__ == '__main__':
    args = parse_args()
    # Load all splits; current[4] selects the loading mode (see params.py).
    data = DATA.LoadData(args.buckets, args.dataset, 'log_loss', is_sparse=True,
                         loading=current[4])
    # Stack feature matrices and labels from train + validation + test.
    all_data = np.concatenate((data.train_data['X'], data.validation_data['X'], data.test_data['X']), axis=0)
    all_y = np.concatenate((data.train_data['Y'], data.validation_data['Y'], data.test_data['Y']), axis=0)
    # stat columns: [feature id, last-seen field position, total count,
    # positive-label count]. Rows are indexed by feature id, so the table
    # spans 0..max feature id.
    stat = np.zeros([np.max(all_data, axis=(0, 1)) + 1, 4], dtype=np.int32)
    stat[:, 0] = np.arange(stat.shape[0])
    print('handling...')
    t1 = time()
    for i in range(all_data.shape[0]):
        if (i + 1) % 1000000 == 0:
            # Progress heartbeat every 1M instances.
            t2 = time()
            print('%.1fs: %dM' % (t2 - t1, (i + 1) / 1000000))
            t1 = time()
        # Record each feature id's field position (last occurrence wins)
        # and bump its occurrence count.
        stat[all_data[i, :], 1] = np.arange(all_data.shape[1])
        stat[all_data[i, :], 2] += 1
        if all_y[i, 0] > 0.1:  # presumably labels are 0/1 floats; >0.1 means positive
            stat[all_data[i, :], 3] += 1
    # Persist the per-feature table (tf.gfile handles remote storage paths).
    with tf.gfile.Open(os.path.join(args.buckets, args.dataset, args.dataset + '.stat'), 'w') as fp:
        for i in range(stat.shape[0]):
            fp.write('%d %d %d %d\n' % (i, stat[i, 1], stat[i, 2], stat[i, 3]))
    # The two histogram printouts were duplicated inline; factored into one
    # helper. Output is byte-identical to the original loops.
    _print_bucket_stats('statistics:all', stat[:, 2])
    _print_bucket_stats('statistics:pos', stat[:, 3])