-
Notifications
You must be signed in to change notification settings - Fork 1
/
playground.py
131 lines (105 loc) · 4.36 KB
/
playground.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import datetime as dt
from tqdm import tqdm
import time, json
import ast
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
import ufal.udpipe, conllu, re, sys
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps a single named column of a DataFrame.

    The selected column is returned as a one-column ``pd.DataFrame``
    (double-bracket selection), not as a ``Series``.
    """

    def __init__(self, column):
        # Label of the column that ``transform`` will select.
        self.column = column

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to ``self.column`` as a one-column frame.

        The input must be a ``pd.DataFrame``; this is checked with ``assert``.
        """
        assert isinstance(X, pd.DataFrame)
        wanted = [self.column]
        return X[wanted]
class Length(BaseEstimator, TransformerMixin):
    """Pipeline step that replaces every element of the input by its ``len()``."""

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a plain list holding ``len(x)`` for each ``x`` in ``X``."""
        return list(map(len, X))
class get_parent_childs():
    """Find ``target`` in a nested mapping and record its parent and child count.

    The search runs in the constructor. Afterwards:

    * ``parent`` is the key whose value directly contains ``target``. It stays
      at the initial value ``'source'`` when ``target`` is a top-level key or
      is not found at all.
    * ``child`` is ``len()`` of the value stored under ``target``, or ``0``
      when ``target`` is not found.
    """

    def __init__(self, tree=None, target=""):
        """Search ``tree`` for ``target``.

        Args:
            tree: nested mapping ``key -> children``; ``None`` (the default)
                is treated as an empty mapping.
            target: key to look for.
        """
        # ``None`` sentinel instead of a ``dict()`` default: a default object is
        # created once and shared by every call, which is a latent hazard.
        if tree is None:
            tree = {}
        self.parent = 'source'
        self.child = 0
        self.get(tree, target)

    def get(self, struct_dict, target):
        """Recursively scan ``struct_dict`` for ``target``.

        Returns:
            ``1`` when ``target`` is a direct key of ``struct_dict`` (after
            storing its child count in ``self.child``), otherwise ``None``.
        """
        for item in struct_dict:
            childs = struct_dict[item]
            if item == target:
                self.child = len(childs)
                return 1
            if len(childs):
                res = self.get(childs, target)
                if res == 1:
                    # Only the direct parent sees ``1``. No value is returned
                    # from here, so ancestors further up do not overwrite
                    # ``self.parent``. The loop keeps scanning siblings, so if
                    # ``target`` occurs more than once the last match wins.
                    self.parent = item
        return None
class Model_udpipe:
    """Convenience wrapper around a loaded ``ufal.udpipe.Model``.

    Offers tokenizing/reading text into ``ufal.udpipe.Sentence`` objects,
    tagging and parsing them in place, and serializing them back to text.
    """

    def __init__(self, path):
        """Load given model.

        Raises:
            Exception: if ``ufal.udpipe.Model.load`` returns a falsy value.
        """
        self.model = ufal.udpipe.Model.load(path)
        if not self.model:
            raise Exception("Cannot load UDPipe model from file '%s'" % path)

    def tokenize(self, text):
        """Tokenize the text and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if the model provides no tokenizer, or if reading the
                text reports a processing error.
        """
        tokenizer = self.model.newTokenizer(self.model.DEFAULT)
        if not tokenizer:
            raise Exception("The model does not have a tokenizer")
        return self._read(text, tokenizer)

    def read(self, text, in_format):
        """Load text in the given format (conllu|horizontal|vertical) and return list of ufal.udpipe.Sentence-s.

        Raises:
            Exception: if the input format cannot be created, or if reading
                the text reports a processing error.
        """
        input_format = ufal.udpipe.InputFormat.newInputFormat(in_format)
        if not input_format:
            raise Exception("Cannot create input format '%s'" % in_format)
        return self._read(text, input_format)

    def _read(self, text, input_format):
        """Feed ``text`` to ``input_format`` and collect every sentence it yields."""
        input_format.setText(text)
        error = ufal.udpipe.ProcessingError()
        sentences = []

        # Each sentence gets its own object: the reader fills the instance it
        # is handed, and that same instance is what ends up in the list.
        sentence = ufal.udpipe.Sentence()
        while input_format.nextSentence(sentence, error):
            sentences.append(sentence)
            sentence = ufal.udpipe.Sentence()
        if error.occurred():
            raise Exception(error.message)

        return sentences

    def tag(self, sentence):
        """Tag the given ufal.udpipe.Sentence (inplace)."""
        self.model.tag(sentence, self.model.DEFAULT)

    def parse(self, sentence):
        """Parse the given ufal.udpipe.Sentence (inplace)."""
        self.model.parse(sentence, self.model.DEFAULT)

    def write(self, sentences, out_format):
        """Write given ufal.udpipe.Sentence-s in the required format (conllu|horizontal|vertical).

        Returns:
            The serialized sentences followed by the format's document
            terminator, as one string.

        Raises:
            Exception: if the output format cannot be created.
        """
        output_format = ufal.udpipe.OutputFormat.newOutputFormat(out_format)
        # Same guard as read(): presumably an unknown format name yields a
        # falsy result here too -- fail with a clear message instead of an
        # AttributeError on None.
        if not output_format:
            raise Exception("Cannot create output format '%s'" % out_format)

        # Collect the pieces and join once rather than growing a string.
        pieces = [output_format.writeSentence(sentence) for sentence in sentences]
        pieces.append(output_format.finishDocument())
        return ''.join(pieces)
def prepare_submission(data_dir='learning'):
    """Merge the twitter and reddit answers into the main answer file.

    Reads ``answer.json``, ``twitter.json`` and ``reddit.json`` from
    ``data_dir``, updates ``answer['subtaskaenglish']`` with the
    ``'subtaskaenglish'`` entries of twitter and then reddit (later entries
    overwrite earlier ones with the same key), and writes the result to
    ``edited_answer.json`` in the same directory. The entry counts before and
    after merging are printed along the way.

    Args:
        data_dir: directory holding the three input files and receiving the
            output file. Defaults to ``'learning'``, the previously
            hard-coded location.
    """
    # JSON text is UTF-8; the encoding is named explicitly so decoding does
    # not depend on the platform's locale default.
    with open(data_dir + '/answer.json', encoding='utf-8') as data:
        answer = json.load(data)
    print(len(answer['subtaskaenglish']))

    with open(data_dir + '/twitter.json', encoding='utf-8') as data:
        twitter = json.load(data)
    with open(data_dir + '/reddit.json', encoding='utf-8') as data:
        reddit = json.load(data)
    print(len(twitter['subtaskaenglish']))

    answer['subtaskaenglish'].update(twitter['subtaskaenglish'])
    answer['subtaskaenglish'].update(reddit['subtaskaenglish'])
    print(len(answer['subtaskaenglish']))

    with open(data_dir + '/edited_answer.json', 'w', encoding='utf-8') as outfile:
        json.dump(answer, outfile)
    print()
if __name__ == "__main__":
    # Scratch experiment: fit a MinMaxScaler on the values 1..10, then look at
    # what it produces for 18, a value outside the fitted range, and at the
    # Python type of that result once converted with ``tolist()``.
    x = np.arange(1, 11).reshape(-1, 1)
    x2 = np.array([4, 5, 6, 6]).reshape(-1, 1)  # defined but never used below
    x3 = np.array([18]).reshape(-1, 1)

    scaler_twitter = MinMaxScaler(feature_range=(0, 1), copy=False)
    scaler_twitter.fit(x)

    scaled_value = scaler_twitter.transform(x3).tolist()[0][0]
    print(scaled_value)
    print(type(scaled_value))