added numerical conditioning preprocessing
Michael Fuest committed Sep 24, 2024
1 parent cf4d15a commit c70fa37
Showing 7 changed files with 87 additions and 51 deletions.
5 changes: 4 additions & 1 deletion config/data_config.yaml
@@ -11,11 +11,14 @@ datasets:
]
metadata_columns: ["dataid",
"building_type",
#"pv",
"pv",
"solar",
"car1",
"city",
"state",
"total_square_footage",
"house_construction_year",
"total_amount_of_pv"
]
goinerdata:
path: "home/fuest/EnData/data/goinerdata/"
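The three new metadata columns only take effect once this config is parsed; a minimal sketch of how such a YAML block might be consumed (the dataset key name is an assumption for illustration, not confirmed by the diff):

import yaml

# Parse the data config and pull out the conditioning columns
# (key names assumed for illustration).
with open("config/data_config.yaml") as f:
    config = yaml.safe_load(f)

metadata_columns = config["datasets"]["pecanstreet"]["metadata_columns"]
print(metadata_columns)  # now includes "total_square_footage", "house_construction_year", "total_amount_of_pv"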
10 changes: 7 additions & 3 deletions datasets/pecanstreet.py
@@ -12,6 +12,7 @@
from torch.utils.data import Dataset

from datasets.utils import encode_categorical_variables
from datasets.utils import encode_numerical_variables

warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -96,8 +97,11 @@ def load_and_preprocess_data(
user_flags = self._set_user_flags(metadata, data)
data = self._preprocess_data(data)
data = pd.merge(data, metadata, on="dataid", how="left")
data, mappings = encode_categorical_variables(data.fillna("no"))
self.mappings = mappings
data = encode_categorical_variables(data)
data = encode_numerical_variables(
data,
["total_square_footage", "house_construction_year", "total_amount_of_pv"],
)
return data, metadata, user_flags

def _load_full_data(self, path: str, columns: List[str]) -> pd.DataFrame:
@@ -406,7 +410,7 @@ def create_all_pv_user_dataset(self) -> "PecanStreetDataset":
return PecanStreetDataset(
data=pv_data,
stats=self.stats,
is_pv_user=True,
is_pv_user=self.include_generation,
include_generation=True,
metadata=self.metadata,
normalization_method=self.normalization_method,
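Taken together, the new load path first integer-encodes the categorical metadata and then bins the numerical conditioning columns. A minimal sketch of that flow, with toy data standing in for the merged Pecan Street table (and assuming the helper signatures from datasets/utils.py below):

import pandas as pd

from datasets.utils import encode_categorical_variables, encode_numerical_variables

# Toy frame standing in for the merged data/metadata table.
df = pd.DataFrame(
    {
        "dataid": [1, 2, 3],
        "city": ["austin", "austin", "ithaca"],
        "total_square_footage": [900, 1500, 2400],
    }
)

df = encode_categorical_variables(df, columns=["city"])
df = encode_numerical_variables(df, ["total_square_footage"])
# "city" is now 0/1 codes; "total_square_footage" is a bin index in 0-4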
48 changes: 33 additions & 15 deletions datasets/utils.py
@@ -2,6 +2,7 @@
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader
@@ -80,40 +81,57 @@ def split_dataset(dataset: Dataset, val_split: float = 0.1) -> Tuple[Dataset, Dataset]:
return train_dataset, val_dataset


def encode_categorical_variables(data: pd.DataFrame, columns: Optional[List[str]] = None):
    """
    Encodes categorical variables in a DataFrame to integer codes.

    Args:
        data (pd.DataFrame): Input DataFrame containing categorical variables.
        columns (Optional[List[str]]): Columns to encode; defaults to all
            object/category columns except the raw "timeseries" column.

    Returns:
        df_encoded (pd.DataFrame): DataFrame with categorical variables encoded as integer codes.
    """
    df_encoded = data.copy()
    mappings = {}

    if columns is None:
        # Fall back to all object/category columns, skipping the time series itself
        categorical_cols = df_encoded.select_dtypes(include=["object", "category"]).columns
        columns = [col for col in categorical_cols if col != "timeseries"]

    for col in columns:
        # Convert column to 'category' dtype if not already
        df_encoded[col] = df_encoded[col].astype("category")

        # Create mappings between integer codes and categories
        code_to_category = dict(enumerate(df_encoded[col].cat.categories))
        category_to_code = {cat: code for code, cat in code_to_category.items()}

        # Replace categories with codes in the DataFrame
        df_encoded[col] = df_encoded[col].cat.codes

        # Save the mapping for the current column
        mappings[col] = {
            "category_to_code": category_to_code,
            "code_to_category": code_to_category,
        }

    return df_encoded
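The per-column mapping dictionaries make the codes reversible; a quick round-trip illustration on a toy column (values invented):

import pandas as pd

s = pd.Series(["austin", "ithaca", "austin"]).astype("category")
code_to_category = dict(enumerate(s.cat.categories))  # {0: 'austin', 1: 'ithaca'}
category_to_code = {cat: code for code, cat in code_to_category.items()}
codes = s.cat.codes                    # 0, 1, 0
decoded = codes.map(code_to_category)  # back to the original labels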


def encode_numerical_variables(data: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Takes numerical conditioning columns (e.g. total square footage) and converts
    them into integer bin codes.

    Args:
        data (pd.DataFrame): The data whose numerical cols are being encoded.
        columns (List[str]): The column names of numerical columns that need to be encoded.

    Returns:
        data (pd.DataFrame): The DataFrame with numerical values replaced by integer bin codes.
    """
    for col in columns:
        data[col] = pd.to_numeric(data[col], errors="coerce")

        if data[col].isnull().all():
            raise ValueError(f"Column '{col}' contains no valid numeric values.")

        # Bucket into five equal-width bins labeled 0-4
        data[col] = pd.cut(
            data[col], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True
        ).astype(int)

    return data
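pd.cut here produces equal-width bins over each column's observed range, not quantiles; a toy example of the binning, with invented values:

import pandas as pd

sqft = pd.Series([800, 1200, 1600, 2400, 4000])
binned = pd.cut(sqft, bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True).astype(int)
# Bin width is (4000 - 800) / 5 = 640, so: 800→0, 1200→0, 1600→1, 2400→2, 4000→4

If roughly equal bin populations were preferred, pd.qcut would give quantile-based bins instead.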
45 changes: 23 additions & 22 deletions eval/evaluator.py
@@ -182,9 +182,7 @@ def generate_samples_for_eval(
)

for keys, tensor in random_conditioning_vars.items():
random_conditioning_vars[keys] = tensor.repeat(
num_samples
) # repeat tensors according to specified num_samples
random_conditioning_vars[keys] = tensor.repeat(num_samples)

generated_ts = model.generate(random_conditioning_vars).cpu().numpy()

@@ -441,6 +439,7 @@ def _create_visualizations(
dataset: Any,
model: Any,
writer: SummaryWriter,
num_runs=3,
):
"""
Create various visualizations for the evaluation results.
@@ -452,28 +451,30 @@
dataset (Any): The dataset object.
model (Any): The trained model.
writer (SummaryWriter): TensorBoard writer for logging visualizations.
num_runs (int): The number of visualization runs to make.
"""
samples = self.generate_samples_for_eval(
real_user_data["dataid"].iloc[0],
model,
dataset,
num_samples=100,
)
samples = dataset.inverse_transform(samples)
month = samples.iloc[0]["month"]
weekday = samples.iloc[0]["weekday"]
for i in range(num_runs):
samples = self.generate_samples_for_eval(
real_user_data["dataid"].iloc[0],
model,
dataset,
num_samples=100,
)
samples = dataset.inverse_transform(samples)
month = samples.iloc[0]["month"]
weekday = samples.iloc[0]["weekday"]

# Visualization 1: Plot range with synthetic values
range_plot = plot_range_with_syn_values(
real_user_data_inv, samples, month, weekday
)
writer.add_figure("Visualizations/Range_Plot", range_plot)
# Visualization 1: Plot range with synthetic values
range_plot = plot_range_with_syn_values(
real_user_data_inv, samples, month, weekday
)
writer.add_figure(f"Visualizations/Range_Plot_{i}", range_plot)

# Visualization 2: Plot closest real signals with synthetic values
closest_plot = plot_syn_with_closest_real_ts(
real_user_data_inv, samples, month, weekday
)
writer.add_figure("Visualizations/Closest_Real_TS", closest_plot)
# Visualization 2: Plot closest real signals with synthetic values
closest_plot = plot_syn_with_closest_real_ts(
real_user_data_inv, samples, month, weekday
)
writer.add_figure(f"Visualizations/Closest_Real_TS_{i}", closest_plot)

# Visualization 4: t-SNE visualization of real and synthetic data
real_data_array = np.stack(real_user_data_inv["timeseries"])
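generate_samples_for_eval tiles each conditioning tensor so that a single conditioning draw yields num_samples generations; a minimal sketch of that tiling (shapes and variable names invented):

import torch

num_samples = 100
conditioning_vars = {"month": torch.tensor([3]), "weekday": torch.tensor([1])}

# Repeat each scalar conditioning tensor once per requested sample
batch_vars = {k: v.repeat(num_samples) for k, v in conditioning_vars.items()}
assert batch_vars["month"].shape == (num_samples,)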
2 changes: 1 addition & 1 deletion generator/gan/acgan.py
@@ -255,7 +255,7 @@ def train_model(self, dataset):
)
for var_name in self.categorical_dims.keys():
labels = gen_categorical_vars[var_name]
g_loss += self.auxiliary_loss(aux_outputs[var_name], labels)
g_loss += 0.1 * self.auxiliary_loss(aux_outputs[var_name], labels)

g_loss.backward()
self.optimizer_G.step()
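The change scales each auxiliary classification term by 0.1 so the adversarial objective dominates the generator update. A schematic sketch of the weighted sum (the 0.1 weight and loss names come from the diff; all shapes and modules are illustrative):

import torch
import torch.nn as nn

adversarial_loss = nn.BCELoss()
auxiliary_loss = nn.CrossEntropyLoss()

validity = torch.rand(8, 1)          # discriminator scores for a fake batch (illustrative)
aux_outputs = torch.randn(8, 5)      # predicted logits for one categorical variable
labels = torch.randint(0, 5, (8,))   # codes the generator was conditioned on

g_loss = adversarial_loss(validity, torch.ones_like(validity))
g_loss = g_loss + 0.1 * auxiliary_loss(aux_outputs, labels)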
25 changes: 16 additions & 9 deletions main.py
@@ -8,35 +8,42 @@ def evaluate_individual_user_models(
full_dataset = PecanStreetDataManager(
normalize=normalize,
include_generation=include_generation,
threshold=(-8, 8),
normalization_method="date",
threshold=(-10, 10),
normalization_method="group",
)
evaluator = Evaluator(full_dataset, model_name)
evaluator.evaluate_all_user_models()


def evaluate_single_dataset_model(
model_name, geography=None, normalize=True, include_generation=True
model_name,
geography=None,
normalize=True,
include_generation=True,
normalization_method="group",
):
full_dataset = PecanStreetDataManager(
geography=geography,
normalize=normalize,
include_generation=include_generation,
normalization_method="date",
threshold=(-10, 10),
normalization_method=normalization_method,
threshold=(-5, 5),
)
evaluator = Evaluator(full_dataset, model_name)
# evaluator.evaluate_all_users()
evaluator.evaluate_all_non_pv_users()
# evaluator.evaluate_all_pv_users()
# evaluator.evaluate_all_non_pv_users()
evaluator.evaluate_all_pv_users()


def main():
# evaluate_individual_user_models("gpt", include_generation=False)
# evaluate_individual_user_models("acgan", include_generation=True)
# evaluate_individual_user_models("acgan", include_generation=False)
# evaluate_individual_user_models("acgan", include_generation=False, normalization_method="date")
evaluate_single_dataset_model(
"diffusion_ts", geography="austin", include_generation=True, normalize="group"
"acgan",
geography="newyork",
include_generation=False,
normalization_method="date",
)


3 changes: 3 additions & 0 deletions tests/test_data_config.yaml
@@ -16,6 +16,9 @@ datasets:
"car1",
"city",
"state",
#"total_square_footage",
#"house_construction_year",
#"total_amount_of_pv"
]
# goinerdata:
# path: "home/fuest/EnData/data/goinerdata/"
