-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_transform.py
102 lines (89 loc) · 3.03 KB
/
data_transform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# Standard-library, numerical/plotting (numpy, scipy, pandas, seaborn, matplotlib) and scikit-learn scaler imports
import numpy as np
import scipy as sp
import random
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import pprint
import time
import sys
import os
import math
import warnings
import itertools
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer, KBinsDiscretizer
#our modules
import helper
import prodplay
import spotify
import plot
import algos
import testing
from analyze_data import discretize, analyze_dataset
from songdataset import SongDataset, SegmentDataset
# Load the project configuration and make sure the output folder for the
# transformed CSVs exists before any dataset is read.
info = helper.loadConfig("config.json")
indir = "./data/deezer"
outdir = f"{indir}/powert-2"
helper.makeDir(outdir)

# Alternative loader, kept for reference:
# datasets = testing.load_segment_datasets(
#     info["cols"], "./data/deezer", knn=False)

# Column-name groups from the config, keyed by feature source.
col_groups = info["cols"]

# Datasets to transform, built in this order: the combined standardized set,
# the three PCA variants (no explicit column list), then the two segment sets.
# Each column list is concatenated afresh per dataset so no two datasets are
# handed the same list object.
datasets = [
    SongDataset(
        name="Deezer+Spotify+MSD",
        cols=col_groups["deezer"] + col_groups["spotify"] + col_groups["msd"],
        path=f"{indir}/deezer-std-all.csv",
        verbose=True,
    ),
    *(
        SongDataset(
            name=f"PCA-Deezer+{label}",
            path=f"{indir}/deezer-pca-{stem}.csv",
            verbose=True,
        )
        for label, stem in (
            ("Spotify", "spotify"),
            ("MSD", "msd"),
            ("Spotify+MSD", "all"),
        )
    ),
    *(
        SegmentDataset(
            name=f"Deezer+Segments-{label}",
            cols=col_groups["deezer"] + col_groups["segments"],
            path=f"{indir}/segments/{stem}.csv",
            verbose=True,
        )
        for label, stem in (
            ("100cnt", "cnt100"),
            ("030sec", "dur030"),
        )
    ),
]
def scaledata(df, cols, sc):
    """Scale each listed column of ``df`` independently, in place.

    Args:
        df: DataFrame whose listed columns are overwritten with their
            scaled values.
        cols: Iterable of column labels to scale.
        sc: Object exposing ``fit_transform``. It is re-fitted on every
            column separately, so nothing is shared between columns.

    Returns:
        The same ``df`` object that was passed in (mutated, not copied).
    """
    # Report the scaler's class name. ``sc.fit_transform.__name__`` is always
    # the literal "fit_transform", which does not identify the scaler in use.
    print(f"\nIndividually scaling using {type(sc).__name__}")
    for col in cols:
        print("... {}".format(col))
        # Double brackets select a one-column DataFrame (2-D) rather than a
        # Series, and the result is written back to that same column.
        df[[col]] = sc.fit_transform(df[[col]])
    return df
# discretes = {}

# For every dataset: analyze the raw data, apply a Yeo-Johnson power transform
# followed by min-max scaling to (-1, 1) on each column individually, write the
# result to ``outdir``, then reload that CSV and analyze the scaled version.
for dataset in datasets:
    print(f"Transforming {dataset.name}")
    analysis_dir = f"data/_analysis/{dataset.name}"
    analyze_dataset(dataset, analysis_dir)

    # Work on a copy so the dataset's own frame is left untouched.
    df_scale = dataset.full_df.copy()

    # Columns to scale: those of va_df and feat_df joined on their indexes.
    merged = pd.merge(
        dataset.va_df,
        dataset.feat_df,
        left_index=True,
        right_index=True,
    )
    cols = merged.columns

    power = PowerTransformer(method='yeo-johnson', standardize=True)
    scaledata(df_scale, cols, power)
    # discretes[dataset.name] = discretize(df_scale, cols)
    minmax = MinMaxScaler(feature_range=(-1, 1))
    scaledata(df_scale, cols, minmax)

    scaled_csv = f"{outdir}/{dataset.name}.csv"
    df_scale.to_csv(scaled_csv)

    # NOTE(review): the reloaded dataset is always a SongDataset, even when
    # the source was a SegmentDataset - confirm this is intended.
    scaled_dataset = SongDataset(
        name=f'scaled-{dataset.name}',
        cols=dataset.cols,
        path=scaled_csv,
        feat_index=dataset.feat_index,
    )
    analyze_dataset(scaled_dataset, f"{analysis_dir}/scaled/powert-2")

# helper.jsonout(discretes, "out/discretes.json")