forked from zhangzhang10/pydaal-tutorials
-
Notifications
You must be signed in to change notification settings - Fork 27
/
LinearRegression.py
268 lines (249 loc) · 11.9 KB
/
LinearRegression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
#Uses helper function from reg_quality_metrics.py
import sys, os
sys.path.append(os.path.join(os.path.dirname(sys.executable),'share','pydaal_examples','examples','python','source'))
import warnings
import numpy as np
from numpy import float32, float64
from reg_quality_metrics import RegressionQualityMetrics
from daal.algorithms.linear_regression import training, prediction
from daal.data_management import InputDataArchive, OutputDataArchive
from daal.data_management import Compressor_Zlib, Decompressor_Zlib, level9, DecompressionStream, CompressionStream, HomogenNumericTable,BlockDescriptor_Float64
from daal.data_management import BlockDescriptor, readWrite
import daal.algorithms.linear_regression.quality_metric_set as quality_metric_set
from daal.algorithms.linear_regression.quality_metric import single_beta, group_of_betas
from utils import printNumericTable
class LinearRegression:
    """Convenience wrapper around the pyDAAL batch linear regression API.

    Bundles training, prediction, serialization (optionally zlib-compressed)
    and quality-metric reporting behind one object.

    Parameters:
        method: 'defaultDense' (normal equation) or 'qrDense' (QR
            decomposition); default 'defaultDense'. Any other value triggers
            a warning at training time and falls back to the normal-equation
            method.
        interceptFlag: True/False, default True. Decides whether or not the
            intercept component is evaluated.
        dtype: intc/float32/float64, default float64. Passed as ``fptype`` to
            the training and prediction algorithms.
    """

    def __init__(self, method='defaultDense', interceptFlag=True, dtype=float64):
        # The method name is validated lazily in training(), not here.
        self.method = method
        self.interceptFlag = interceptFlag
        self.dtype = dtype

    def training(self, trainData, trainDependentVariables):
        """Train a linear regression model.

        Arguments: train data feature values (type nT), train data target
        values (type nT).
        Returns the training results object.
        """
        if self.method == 'qrDense':
            method = training.qrDense
        else:
            if self.method != 'defaultDense':
                warnings.warn('Invalid method, using default dense Normal Equation method')
            method = training.normEqDense
        algorithm = training.Batch(method=method, fptype=self.dtype)
        # Pass a training data set and dependent values to the algorithm
        algorithm.input.set(training.data, trainData)
        algorithm.input.set(training.dependentVariables, trainDependentVariables)
        algorithm.parameter.interceptFlag = self.interceptFlag
        # Build linear regression model and retrieve the algorithm results
        return algorithm.compute()

    def predict(self, trainingResult, testData):
        """Predict target values with a trained model.

        Arguments: training result object, test data feature values (type nT).
        Returns predicted values of type nT.
        """
        algorithm = prediction.Batch(fptype=self.dtype)
        # Pass a testing data set and the trained model to the algorithm
        algorithm.input.setTable(prediction.data, testData)
        algorithm.input.setModel(prediction.model, trainingResult.get(training.model))
        # Predict values of multiple linear regression and retrieve the results
        predictionResult = algorithm.compute()
        return predictionResult.get(prediction.prediction)

    def compress(self, arrayData):
        """Compress a serialized byte array with zlib (gzip header, level 9).

        Arguments: serialized numpy array.
        Returns compressed numpy array (dtype uint8).
        """
        compressor = Compressor_Zlib()
        compressor.parameter.gzHeader = True
        compressor.parameter.level = level9
        comprStream = CompressionStream(compressor)
        comprStream.push_back(arrayData)
        compressedData = np.empty(comprStream.getCompressedDataSize(), dtype=np.uint8)
        comprStream.copyCompressedArray(compressedData)
        return compressedData

    def decompress(self, arrayData):
        """Decompress a byte array produced by compress().

        Arguments: compressed numpy array.
        Returns decompressed numpy array (dtype uint8).
        """
        decompressor = Decompressor_Zlib()
        decompressor.parameter.gzHeader = True
        # Create a stream for decompression
        deComprStream = DecompressionStream(decompressor)
        # Write the compressed data to the decompression stream and decompress it
        deComprStream.push_back(arrayData)
        # Allocate memory to store the decompressed data
        bufferArray = np.empty(deComprStream.getDecompressedDataSize(), dtype=np.uint8)
        # Store the decompressed data
        deComprStream.copyDecompressedArray(bufferArray)
        return bufferArray

    def serialize(self, data, fileName=None, useCompression=False):
        """Serialize a numeric table or model, optionally compressing it.

        Method 1:
            Arguments: data (type nT/model)
            Returns a dict holding the serialized numpy array
        Method 2:
            Arguments: data (type nT/model), fileName (.npy file)
            Saves the serialized numpy array to disk and returns None
        Method 3:
            Arguments: data (type nT/model), useCompression = True
            Returns a dict holding the compressed numpy array
        Method 4:
            Arguments: data (type nT/model), fileName (.npy file),
            useCompression = True
            Saves the compressed numpy array to disk and returns None

        The returned dict has the keys "Array Object" (the byte array) and
        "Object Information" (a constructor expression for the object's
        type). When saving to disk, the constructor expression is written to
        a side-car file with the same base name and a ".txt" extension.
        """
        # Turn "<class 'pkg.mod.Name'>" into the constructor call "pkg.mod.Name()"
        buffArrObjName = (str(type(data)).split()[1].split('>')[0] + "()").replace("'", '')
        dataArch = InputDataArchive()
        data.serialize(dataArch)
        length = dataArch.getSizeOfArchive()
        bufferArray = np.zeros(length, dtype=np.ubyte)
        dataArch.copyArchiveToArray(bufferArray)
        if useCompression:
            bufferArray = self.compress(bufferArray)
        if fileName is None:
            return {"Array Object": bufferArray,
                    "Object Information": buffArrObjName}
        # splitext only strips a real extension, so dots in directory parts
        # ("./out/model", "../model") are left alone; np.save appends ".npy".
        baseName = os.path.splitext(fileName)[0]
        np.save(baseName, bufferArray)
        with open(baseName + ".txt", "w") as infoFile:
            infoFile.write(buffArrObjName)

    def deserialize(self, serialObjectDict=None, fileName=None, useCompression=False):
        """Rebuild a numeric table or model from its serialized form.

        Input is either the dict returned by serialize() or the name of a
        .npy file saved by serialize() (its ".txt" side-car must sit next to
        it). Pass useCompression=True when the data was compressed.

        Returns the deserialized/decompressed numeric table or model.
        Warns and raises SystemExit unless exactly one of serialObjectDict /
        fileName is supplied.
        """
        if fileName is not None and serialObjectDict is None:
            bufferArray = np.load(fileName)
            with open(os.path.splitext(fileName)[0] + ".txt", "r") as infoFile:
                buffArrObjName = infoFile.read()
        elif fileName is None and serialObjectDict is not None and any(serialObjectDict):
            bufferArray = serialObjectDict["Array Object"]
            buffArrObjName = serialObjectDict["Object Information"]
        else:
            # Covers "both given" as well as "neither given".
            warnings.warn('Expecting "bufferArray" or "fileName" argument, NOT both')
            raise SystemExit
        if useCompression:
            bufferArray = self.decompress(bufferArray)
        dataArch = OutputDataArchive(bufferArray)
        # The constructor expression is of the form "daal....ClassName()", so
        # the top-level package must be resolvable by eval().
        import daal  # noqa: F401
        # SECURITY NOTE(review): buffArrObjName comes from a file or a
        # caller-supplied dict and is passed to eval(). Only deserialize data
        # from trusted sources.
        try:
            deSerialObj = eval(buffArrObjName)
        except AttributeError:
            deSerialObj = HomogenNumericTable()
        deSerialObj.deserialize(dataArch)
        return deSerialObj

    def predictReducedModelResults(self, trainingResult, trainData, reducedBeta, nDependentVariables=1):
        """Predict with selected beta coefficients temporarily set to zero.

        Arguments: training result object, data feature values of type nT,
        insignificant beta indexes (an int or a sequence of ints), number of
        dependent variables.
        Returns reduced model predicted values (type nT).

        The model's betas are modified in place for the duration of the
        prediction and restored afterwards, even if prediction raises.
        """
        model = trainingResult.get(training.model)
        betas = model.getBeta()
        nBetas = model.getNumberOfBetas()
        if isinstance(reducedBeta, (int, np.integer)):
            reducedBeta = [reducedBeta]
        else:
            # Materialize so the indexes can be walked twice (zero, restore).
            reducedBeta = list(reducedBeta)
        savedBeta = np.zeros((nDependentVariables, nBetas))

        # Zero the selected coefficients, remembering their values.
        block = BlockDescriptor()
        betas.getBlockOfRows(0, nDependentVariables, readWrite, block)
        pBeta = block.getArray()
        for beta in reducedBeta:
            for i in range(nDependentVariables):
                savedBeta[i][beta] = pBeta[i][beta]
                pBeta[i][beta] = 0
        betas.releaseBlockOfRows(block)

        try:
            predictedResults = self.predict(trainingResult, trainData)
        finally:
            # Restore the coefficients. Only the rows that were modified
            # above are requested (nDependentVariables of them).
            block = BlockDescriptor()
            betas.getBlockOfRows(0, nDependentVariables, readWrite, block)
            pBeta = block.getArray()
            for beta in reducedBeta:
                for i in range(nDependentVariables):
                    pBeta[i][beta] = savedBeta[i][beta]
            betas.releaseBlockOfRows(block)
        return predictedResults

    def qualityMetrics(self, trainingResult, predictResults, testGroundTruth, predictedReducedModelResults=None, noReducedBetas=1):
        """Compute regression quality metrics.

        Arguments: training result object, prediction values (type nT), test
        data actual target values (type nT), reduced model predicted values
        (type nT), number of betas that were zeroed in the reduced model.
        Returns the RegressionQualityMetrics object, which is also stored on
        the instance as ``_qualityMetricSetResult``.
        """
        model = trainingResult.get(training.model)
        self._qualityMetricSetResult = RegressionQualityMetrics(
            testGroundTruth, predictResults, model,
            predictedReducedModelResults=predictedReducedModelResults,
            noReducedBetas=noReducedBetas)
        return self._qualityMetricSetResult

    def predictWithQualityMetrics(self, trainingResult, testData, testGroundTruth, reducedBetaIndex=None):
        """Predict and compute quality metrics in one call.

        Arguments: training result object, test data feature values of type
        nT, test data actual target values (type nT), insignificant beta
        indexes (an int or a sequence of ints; optional).
        Returns a 3-tuple: predicted values (type nT), reduced model
        predicted values (type nT; the plain predictions when no
        reducedBetaIndex is given), and the quality metrics object.
        """
        predictResults = self.predict(trainingResult, testData)
        if reducedBetaIndex is None:
            self._qualityMetricSetResult = self.qualityMetrics(
                trainingResult, predictResults, testGroundTruth)
            predictedReducedModelResults = predictResults
        else:
            # A bare int is accepted by predictReducedModelResults, so
            # normalize it here to make len() below valid as well.
            if isinstance(reducedBetaIndex, (int, np.integer)):
                reducedBetaIndex = [reducedBetaIndex]
            predictedReducedModelResults = self.predictReducedModelResults(
                trainingResult, testData, reducedBetaIndex,
                nDependentVariables=testGroundTruth.getNumberOfColumns())
            self._qualityMetricSetResult = self.qualityMetrics(
                trainingResult, predictResults, testGroundTruth,
                predictedReducedModelResults=predictedReducedModelResults,
                noReducedBetas=len(reducedBetaIndex))
        return predictResults, predictedReducedModelResults, self._qualityMetricSetResult

    def printAllQualityMetrics(self, qualityMet):
        """Print every available quality metric.

        Arguments: qualityMetrics object.
        Prints RMSE, variance, z-score statistic, confidenceIntervals,
        inverseOfXtX matrix, variance-covariance matrix, expectedMean,
        expectedVariance, SSR, SST, R-square, f-statistic.
        """
        # Print quality metrics for a single beta
        print("Quality metrics for a single beta")
        printNumericTable(qualityMet.get('rms'),
                          "Root means square errors for each response (dependent variable):")
        printNumericTable(qualityMet.get('variance'),
                          "Variance for each response (dependent variable):")
        printNumericTable(qualityMet.get('zScore'),
                          "Z-score statistics:")
        printNumericTable(qualityMet.get('confidenceIntervals'),
                          "Confidence intervals for each beta coefficient:")
        printNumericTable(qualityMet.get('inverseOfXtX'),
                          "Inverse(Xt * X) matrix:")
        betaCov = qualityMet.get('betaCovariances')
        for i in range(len(betaCov)):
            message = "Variance-covariance matrix for betas of " + str(i + 1) + "-th response"
            printNumericTable(betaCov[i], message)
        # Print quality metrics for a group of betas
        print("Quality metrics for a group of betas")
        printNumericTable(qualityMet.get('expectedMeans'),
                          "Means of expected responses for each dependent variable:")
        printNumericTable(qualityMet.get('expectedVariance'),
                          "Variance of expected responses for each dependent variable:")
        printNumericTable(qualityMet.get('regSS'),
                          "Regression sum of squares of expected responses:")
        printNumericTable(qualityMet.get('resSS'),
                          "Sum of squares of residuals for each dependent variable:")
        printNumericTable(qualityMet.get('tSS'),
                          "Total sum of squares for each dependent variable:")
        printNumericTable(qualityMet.get('determinationCoeff'),
                          "Determination coefficient for each dependent variable:")
        printNumericTable(qualityMet.get('fStatistics'),
                          "F-statistics for each dependent variable:")