-
Notifications
You must be signed in to change notification settings - Fork 1
/
bigrams.py
56 lines (38 loc) · 1.63 KB
/
bigrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
import re
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# Any results you write to the current directory are saved as output.
df = pd.read_csv('/kaggle/input/cs446-fa19/train.csv')
result = df['train_error']
classify = df['arch_and_hp'][1]
layers = ['conv', 'batchnorm', 'flatten', 'leaky', 'tanh', 'softmax', 'relu', 'selu', 'dropout', 'maxpool', 'linear']
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
layerregex = '|'.join(layers)
document = []
print (classify)
m = {}
for i in range (0, len(df['arch_and_hp'].index)):
text = df['arch_and_hp'][i]
text = text.replace('leaky_relu', 'leaky')
# print (re.findall(layerregex, text))
text = ' '.join(re.findall(layerregex, text))
document.append(text)
X = vectorizer.fit_transform(document)
'''
print (document)
print (vectorizer.get_feature_names())
print (X.toarray())
'''
# [sum(X.toarray()) for x in zip(*input_val)]
bigrams = [sum(x) for x in zip(*X.toarray())]
print (bigrams)