-
Notifications
You must be signed in to change notification settings - Fork 2
/
preprocessing.py
60 lines (51 loc) · 1.98 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller, acf, pacf
import numpy as np
def isnan(df):
# check if dat consists nan values
if df.isna().sum().sum() > 0:
print("Deleting null values")
df.dropna(inplace=True)
def differencing(df, order = 1, col_name='Close', new_col_name='Difference'):
isnan(df)
df[new_col_name] = df[col_name] - df[col_name].shift(order)
isnan(df)
def transformation(df, fun=np.log, col_name='Close', new_col_name='LClose'):
# power transformation
isnan(df)
df[new_col_name] = df[col_name].apply(lambda x: fun(x))
isnan(df)
def test_stationary(timeseries, window=500):
"""
simple statistics to check data
rolling stat
autocorrelation
test Dickey-Fuller to check stationarity
"""
plt.figure(figsize=(15, 14))
# Rolling statistics
plt.subplot(211)
movingAverage = timeseries.rolling(window=window).mean()
movingSTD = timeseries.rolling(window=window).std()
plt.plot(timeseries, color='cornflowerblue', label='Original')
plt.plot(movingAverage, color='red', label='Rolling Mean')
plt.plot(movingSTD, color='black', label='Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation\n')
# Auto-correlation
plt.subplot(212)
correlation = acf(timeseries.iloc[1:])
partial_correlation = pacf(timeseries.iloc[1:])
plt.plot(correlation, color='green', marker='o', linestyle='--', label='AutoCorr')
plt.plot(partial_correlation, color='cornflowerblue', marker='.', linestyle='--', label='PAutoCorr')
plt.legend(loc='best')
plt.title('Autocorrelationn & Partial Autocorrelation\n')
plt.show()
# Dickey Fuller test
print('Results of Dickey Fuller Test:\n')
dftest = adfuller(timeseries.values)
print('ADF Statistic: %f' % dftest[0])
print('p-value: %f' % dftest[1])
print('Critical Values:')
for key, value in dftest[4].items():
print('\t%s: %.3f' % (key, value))