From 7eadd9a0eddd047650903a39fe5d73a6eedbd089 Mon Sep 17 00:00:00 2001 From: Daniel Young Date: Thu, 21 Mar 2024 11:28:35 -0700 Subject: [PATCH 1/2] Started linting project --- use_cases/eluc/README.md | 2 +- use_cases/eluc/data/constants.py | 7 +-- use_cases/eluc/data/conversion.py | 4 +- use_cases/eluc/data/eluc_data.py | 47 ++++++++++--------- use_cases/eluc/data/torch_data.py | 4 +- .../predictors/neural_network/__init__.py | 0 use_cases/eluc/predictors/predictor.py | 2 +- use_cases/eluc/predictors/sklearn/__init__.py | 0 use_cases/eluc/prescriptors/__init__.py | 0 use_cases/eluc/prescriptors/esp/__init__.py | 0 .../eluc/prescriptors/heuristics/__init__.py | 0 use_cases/eluc/prescriptors/nsga2/__init__.py | 0 12 files changed, 36 insertions(+), 30 deletions(-) create mode 100644 use_cases/eluc/predictors/neural_network/__init__.py create mode 100644 use_cases/eluc/predictors/sklearn/__init__.py create mode 100644 use_cases/eluc/prescriptors/__init__.py create mode 100644 use_cases/eluc/prescriptors/esp/__init__.py create mode 100644 use_cases/eluc/prescriptors/heuristics/__init__.py create mode 100644 use_cases/eluc/prescriptors/nsga2/__init__.py diff --git a/use_cases/eluc/README.md b/use_cases/eluc/README.md index 1af7ce3..9242502 100644 --- a/use_cases/eluc/README.md +++ b/use_cases/eluc/README.md @@ -23,7 +23,7 @@ BLUE simulations with committed emissions could be used to estimate the long-ter "Committed emissions" means all the emissions that are caused by a land-use change event are attributed to the year of the event. BLUE (bookkeeping of land use emissions) is a bookkeeping model that attributes carbon fluxes to land use activities. -See [BLUE: Bookkeeping of land use emissions](https://doi.org/10.1002/2014GB004997) for more details. +See [BLUE: Bookkeeping of land use emissions](https://doi.org/10.1002/2014GB004997) for more details. ### LUC diff --git a/use_cases/eluc/data/constants.py b/use_cases/eluc/data/constants.py index 73f6f27..0a1aca5 100644 --- a/use_cases/eluc/data/constants.py +++ b/use_cases/eluc/data/constants.py @@ -9,8 +9,8 @@ CODES_PATH = "data/codes.csv" # Different variations of land-use change columns -LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per', - 'pastr', 'primf', 'primn', +LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per', + 'pastr', 'primf', 'primn', 'range', 'secdf', 'secdn', 'urban'] CROP_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per'] LAND_USE_COLS = ["crop"] + [col for col in LAND_USE_COLS if col not in CROP_COLS] @@ -29,7 +29,8 @@ # ["United Kingdom", "France", "Germany", "Netherlands", "Belgium", "Switzerland", "Ireland"] EU_COUNTRIES = ["GB", "FR", "DE", "NL", "BE", "CH", "IE"] -# ["Brazil", "Bolivia", "Paraguay", "Peru", "Ecuador", "Colombia", "Venezuela", "Guyana", "Suriname", "Uruguay", "Argentina", "Chile"] +# ["Brazil", "Bolivia", "Paraguay", "Peru", "Ecuador", "Colombia", +# "Venezuela", "Guyana", "Suriname", "Uruguay", "Argentina", "Chile"] SA_COUNTRIES = ["BR", "BO", "PY", "PE", "EC", "CO", "VE", "GY", "SR", "UY", "AR", "CL"] # ["United States"] US_COUNTRIES = ["US"] diff --git a/use_cases/eluc/data/conversion.py b/use_cases/eluc/data/conversion.py index c37efc4..f71e099 100644 --- a/use_cases/eluc/data/conversion.py +++ b/use_cases/eluc/data/conversion.py @@ -9,7 +9,7 @@ from data import constants # TODO: Note: This table is not perfect and has some errors, -# we should consider manually fixing them. I tried my best but +# we should consider manually fixing them. 
I tried my best but # I'm not 100% sure it's correct. MANUAL_MAP = { "INDO": 360, @@ -57,7 +57,7 @@ def construct_countries_df(): # Replace all the bad codes with their real ones for i in range(len(countries_df)): old_abbrev = countries_df.iloc[i]["abbrevs"] - if old_abbrev in MANUAL_MAP.keys() and MANUAL_MAP[old_abbrev] in codes_df["Numeric code"].unique(): + if old_abbrev in MANUAL_MAP and MANUAL_MAP[old_abbrev] in codes_df["Numeric code"].unique(): countries_df.iloc[i]["abbrevs"] = codes_df[codes_df["Numeric code"] == MANUAL_MAP[old_abbrev]]["Alpha-2 code"].iloc[0] return countries_df \ No newline at end of file diff --git a/use_cases/eluc/data/eluc_data.py b/use_cases/eluc/data/eluc_data.py index b9e0e10..45a9abf 100644 --- a/use_cases/eluc/data/eluc_data.py +++ b/use_cases/eluc/data/eluc_data.py @@ -42,9 +42,9 @@ def encode_as_df(self, df: pd.DataFrame) -> pd.DataFrame: if min_val == max_val: new_df[col] = 0 else: - new_df[col] = (new_df[col] - self.fields[col]["range"][0]) / (self.fields[col]["range"][1] - self.fields[col]["range"][0]) + new_df[col] = (new_df[col] - min_val) / (max_val - min_val) return new_df - + def decode_as_df(self, df: pd.DataFrame) -> pd.DataFrame: """ Decodes a dataframe using the fields given in the constructor. @@ -53,7 +53,9 @@ def decode_as_df(self, df: pd.DataFrame) -> pd.DataFrame: new_df = df.copy() for col in new_df.columns: if col in self.fields: - new_df[col] = new_df[col] * (self.fields[col]["range"][1] - self.fields[col]["range"][0]) + self.fields[col]["range"][0] + min_val = self.fields[col]["range"][0] + max_val = self.fields[col]["range"][1] + new_df[col] = new_df[col] * (max_val - min_val) + min_val return new_df @@ -87,7 +89,7 @@ def get_encoded_train(self): if self.encoded_train_df is None: self.encoded_train_df = self.encoder.encode_as_df(self.train_df) return self.encoded_train_df - + def get_encoded_test(self): """ Same as above but for test data. @@ -95,14 +97,15 @@ def get_encoded_test(self): if self.encoded_test_df is None: self.encoded_test_df = self.encoder.encode_as_df(self.test_df) return self.encoded_test_df - + def get_fields(self) -> dict: """ Creates fields json object for the data encoder/prescriptor. """ - fields_df = self.train_df[constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]].astype("float64") - fields = dict() - for col in constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]: + cao_cols = constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"] + fields_df = self.train_df[cao_cols].astype("float64") + fields = {} + for col in cao_cols: # Set range of land and diff land uses manually to their true ranges because they # do not need to be scaled if col in constants.LAND_USE_COLS: @@ -132,14 +135,13 @@ def get_fields(self) -> dict: "valued": "CONTINUOUS" } - return fields - + return fields + def push_to_hf(self, repo_path, commit_message, token=None): """ Pushes data to huggingface repo. Don't use this unless you're sure you want to update it! :param repo_path: Path to huggingface repo. """ - whole_df = pd.concat([self.train_df, self.test_df]) # We get the indices as columns anyways so we can drop them whole_df = whole_df.drop(["lat", "lon", "time"], axis=1) @@ -147,13 +149,13 @@ def push_to_hf(self, repo_path, commit_message, token=None): if not token: token = os.getenv("HF_TOKEN") ds.push_to_hub(repo_path, commit_message=commit_message, token=token) - + class ELUCData(AbstractData): """ Loads ELUC data from HuggingFace repo and processes it. 
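 
     A minimal usage sketch (hypothetical; assumes the HuggingFace dataset is
     reachable and uses the constructor defaults below):
 
         >>> dataset = ELUCData(start_year=1851, test_year=2012, end_year=2022)
         >>> encoded = dataset.get_encoded_train()             # min-max scaled by ELUCEncoder
         >>> restored = dataset.encoder.decode_as_df(encoded)  # inverts the scaling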
""" - + def __init__(self, start_year=1851, test_year=2012, end_year=2022, countries=None): """ If update_path is given, load raw data the old way using 2 files that are merged. @@ -169,12 +171,13 @@ def __init__(self, start_year=1851, test_year=2012, end_year=2022, countries=Non self.train_df = df.loc[start_year:test_year-1] self.test_df = df.loc[test_year:end_year-1] - + self.encoder = ELUCEncoder(self.get_fields()) def hf_to_df(self, hf_repo): """ - Loads dataset from huggingface, converts to pandas, then sets indices appropriately to time/lat/lon. + Loads dataset from huggingface, converts to pandas, then sets indices + appropriately to time/lat/lon. Keep old time/lat/lon columns so we can use them as features later. """ ds = load_dataset(hf_repo)["train"] @@ -194,7 +197,7 @@ def __init__(self, path, update_path, start_year=1851, test_year=2012, end_year= self.train_df = df.loc[start_year:test_year-1] self.test_df = df.loc[test_year:end_year-1] - + self.encoder = ELUCEncoder(self.get_fields()) def import_data(self, path, update_path): @@ -217,7 +220,9 @@ def import_data(self, path, update_path): raw = raw.merge(eluc) # Shift actions back a year - raw_diffs = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per', 'pastr', 'primf', 'primn', 'range', 'secdf', 'secdn', 'urban'] + raw_diffs = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per', + 'pastr', 'primf', 'primn', 'range', + 'secdf', 'secdn', 'urban'] raw_diffs = [f"{col}_diff" for col in raw_diffs] raw[raw_diffs] = raw[raw_diffs].shift(time=-1) @@ -225,7 +230,7 @@ def import_data(self, path, update_path): country_mask = regionmask.defined_regions.natural_earth_v5_0_0.countries_110.mask(raw) raw["country"] = country_mask return raw - + def da_to_df(self, da: xr.DataArray, start_year=None, end_year=None, countries=None) -> pd.DataFrame: """ Converts an xarray DataArray to a pandas DataFrame. @@ -259,10 +264,10 @@ def da_to_df(self, da: xr.DataArray, start_year=None, end_year=None, countries=N # Merge crops into one column because BLUE model doesn't differentiate df["crop"] = df[constants.CROP_COLS].sum(axis=1) df["crop_diff"] = df[[f"{c}_diff" for c in constants.CROP_COLS]].sum(axis=1) - + df['country_name'] = self.countries_df.loc[df['country'], 'names'].values - + # Drop this column we used for preprocessing (?) df = df.drop("mask", axis=1) - + return df diff --git a/use_cases/eluc/data/torch_data.py b/use_cases/eluc/data/torch_data.py index 2bddd07..c0904ce 100644 --- a/use_cases/eluc/data/torch_data.py +++ b/use_cases/eluc/data/torch_data.py @@ -15,7 +15,7 @@ class TorchDataset(Dataset): :param y: labels """ def __init__(self, X: np.ndarray, y: np.ndarray, device="cpu"): - super().__init__() + super().__init__() self.X = torch.tensor(X, dtype=torch.float32, device=device) self.y = torch.tensor(y, device=device) assert len(self.X) == len(self.y), "X and y must have the same length" @@ -24,4 +24,4 @@ def __len__(self): return len(self.X) def __getitem__(self, idx: int) -> tuple: - return self.X[idx], self.y[idx] \ No newline at end of file + return self.X[idx], self.y[idx] diff --git a/use_cases/eluc/predictors/neural_network/__init__.py b/use_cases/eluc/predictors/neural_network/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/use_cases/eluc/predictors/predictor.py b/use_cases/eluc/predictors/predictor.py index f905b87..054899e 100644 --- a/use_cases/eluc/predictors/predictor.py +++ b/use_cases/eluc/predictors/predictor.py @@ -51,4 +51,4 @@ def load(self, path: str): """ Loads a model from a path. 
:param path: path to the model - """ \ No newline at end of file + """ diff --git a/use_cases/eluc/predictors/sklearn/__init__.py b/use_cases/eluc/predictors/sklearn/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/use_cases/eluc/prescriptors/__init__.py b/use_cases/eluc/prescriptors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/use_cases/eluc/prescriptors/esp/__init__.py b/use_cases/eluc/prescriptors/esp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/use_cases/eluc/prescriptors/heuristics/__init__.py b/use_cases/eluc/prescriptors/heuristics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/use_cases/eluc/prescriptors/nsga2/__init__.py b/use_cases/eluc/prescriptors/nsga2/__init__.py new file mode 100644 index 0000000..e69de29 From 4b85488dc79967d5735448f85b8a81d99ea4f273 Mon Sep 17 00:00:00 2001 From: Daniel Young Date: Thu, 21 Mar 2024 17:07:35 -0700 Subject: [PATCH 2/2] Linted project to over 9 --- use_cases/eluc/.pylintrc | 10 +++ .../neural_network/neural_net_predictor.py | 66 ++++++++++++------- .../predictors/sklearn/sklearn_predictor.py | 18 ++--- .../prescriptors/esp/train_prescriptors.py | 5 +- .../prescriptors/esp/unileaf_prescriptor.py | 40 +++++------ .../prescriptors/heuristics/heuristics.py | 13 ++-- .../eluc/prescriptors/nsga2/candidate.py | 33 +++++----- .../eluc/prescriptors/nsga2/create_seeds.py | 10 +-- .../eluc/prescriptors/nsga2/nsga2_utils.py | 4 +- .../prescriptors/nsga2/torch_prescriptor.py | 15 ++--- .../prescriptors/nsga2/train_prescriptors.py | 5 +- use_cases/eluc/prescriptors/prescriptor.py | 10 ++- 12 files changed, 129 insertions(+), 100 deletions(-) create mode 100644 use_cases/eluc/.pylintrc diff --git a/use_cases/eluc/.pylintrc b/use_cases/eluc/.pylintrc new file mode 100644 index 0000000..036fe36 --- /dev/null +++ b/use_cases/eluc/.pylintrc @@ -0,0 +1,10 @@ +[MASTER] +ignore=demo + +jobs=0 + +max-line-length=120 + +suggestion-mode=yes + +good-names=X_train, X_val, X_test, y_train, y_val, y_test, X, Y, y, X_test_scaled \ No newline at end of file diff --git a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py b/use_cases/eluc/predictors/neural_network/neural_net_predictor.py index 996a291..9fc2eb7 100644 --- a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py +++ b/use_cases/eluc/predictors/neural_network/neural_net_predictor.py @@ -79,25 +79,37 @@ class NeuralNetPredictor(Predictor): in order to take advantage of the linear relationship in the data. Data is automatically standardized and the scaler is saved with the model. """ - def __init__(self, features=None, label=None, hidden_sizes=[4096], linear_skip=True, dropout=0, device="mps", - epochs=3, batch_size=2048, optim_params={}, train_pct=1, step_lr_params={"step_size": 1, "gamma": 0.1}): - # Model setup params + def __init__(self, features=None, label=None, hidden_sizes=[4096], linear_skip=True, + dropout=0, device="mps", epochs=3, batch_size=2048, optim_params={}, + train_pct=1, step_lr_params={"step_size": 1, "gamma": 0.1}): + + self.features=None + self.label=None + + self.set_params(features, label, hidden_sizes, linear_skip, + dropout, device, epochs, batch_size, optim_params, + train_pct, step_lr_params) + self.model = None + self.scaler = StandardScaler() + + def set_params(self, features, label, hidden_sizes, linear_skip, + dropout, device, epochs, batch_size, optim_params, + train_pct, step_lr_params): + """ + Set all the parameters for the neural network. 
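+
+        A hypothetical construction sketch (__init__ forwards its arguments
+        here; the values shown are illustrative, not recommendations):
+
+            >>> nnp = NeuralNetPredictor(hidden_sizes=[4096], epochs=3,
+            ...                          batch_size=2048, device="cpu")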
+ """ self.features = features self.label = label self.hidden_sizes = hidden_sizes self.linear_skip = linear_skip self.dropout = dropout self.device = device - - # Training params - self.scaler = StandardScaler() self.epochs = epochs self.batch_size = batch_size self.optim_params = optim_params self.train_pct = train_pct self.step_lr_params = step_lr_params - def load(self, path: str): """ @@ -107,11 +119,11 @@ def load(self, path: str): load_path = Path(path) if not load_path.exists(): raise FileNotFoundError(f"Path {path} does not exist.") - + # Initialize model with config - with open(load_path / "config.json", "r", encoding="utf-8") as f: - config = json.load(f) - self.__init__(**config) + with open(load_path / "config.json", "r", encoding="utf-8") as file: + config = json.load(file) + self.set_params(**config) self.model = ELUCNeuralNet(len(self.features), self.hidden_sizes, self.linear_skip, self.dropout) self.model.load_state_dict(torch.load(load_path / "model.pt")) @@ -144,13 +156,16 @@ def save(self, path: str): "train_pct": self.train_pct, "step_lr_params": self.step_lr_params } - with open(save_path / "config.json", "w", encoding="utf-8") as f: - json.dump(config, f) + with open(save_path / "config.json", "w", encoding="utf-8") as file: + json.dump(config, file) torch.save(self.model.state_dict(), save_path / "model.pt") joblib.dump(self.scaler, save_path / "scaler.joblib") - def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None, X_test=None, y_test=None, log_path=None, verbose=False) -> dict: + def fit(self, X_train: pd.DataFrame, y_train: pd.Series, + X_val=None, y_val=None, + X_test=None, y_test=None, + log_path=None, verbose=False) -> dict: """ Fits neural network to given data using predefined parameters and hyperparameters. If no features were specified we use all the columns in X_train. @@ -164,7 +179,8 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None, :param y_test: test labels. :param log_path: path to log training data to tensorboard. :param verbose: whether to print progress bars. - :return: dictionary of results from training containing time taken, best epoch, best loss, and test loss if applicable. + :return: dictionary of results from training containing time taken, best epoch, best loss, + and test loss if applicable. 
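+
+        A minimal call sketch (hypothetical frames; X_* must contain the
+        feature columns and y_* the label):
+
+            >>> results = nnp.fit(X_train, y_train, X_val=X_val, y_val=y_val)
+            >>> results["best_epoch"], results["best_loss"], results["time"]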
""" if not self.features: self.features = X_train.columns.tolist() @@ -174,7 +190,7 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None, self.model.to(self.device) self.model.train() - s = time.time() + start = time.time() # Set up train set X_train = self.scaler.fit_transform(X_train[self.features]) @@ -203,7 +219,7 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None, result_dict = {} best_model = None best_loss = np.inf - e = 0 + end = 0 step = 0 for epoch in range(self.epochs): @@ -220,7 +236,7 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None, step += 1 loss.backward() optimizer.step() - + # LR Decay if self.step_lr_params: scheduler.step() @@ -235,25 +251,25 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None, out = self.model(X) loss = loss_fn(out.squeeze(), y.squeeze()) total += loss.item() * y.shape[0] - + if log_path: writer.add_scalar("val_loss", total / len(val_ds), step) - + if total < best_loss: best_model = copy.deepcopy(self.model.state_dict()) best_loss = total - e = time.time() + end = time.time() result_dict["best_epoch"] = epoch result_dict["best_loss"] = total / len(val_ds) - result_dict["time"] = e - s + result_dict["time"] = end - start print(f"epoch {epoch} mae {total / len(val_ds)}") - + if best_model: self.model.load_state_dict(best_model) else: - e = time.time() - result_dict["time"] = e - s + end = time.time() + result_dict["time"] = end - start # If we provide a test dataset if X_test is not None and y_test is not None: diff --git a/use_cases/eluc/predictors/sklearn/sklearn_predictor.py b/use_cases/eluc/predictors/sklearn/sklearn_predictor.py index 3723a2c..3aef26b 100644 --- a/use_cases/eluc/predictors/sklearn/sklearn_predictor.py +++ b/use_cases/eluc/predictors/sklearn/sklearn_predictor.py @@ -19,7 +19,7 @@ class SKLearnPredictor(Predictor, ABC): Simple abstract class for sklearn predictors. Keeps track of features fit on and label to predict. """ - def __init__(self, features=None, label=None, **kwargs): + def __init__(self, features=None, label=None): self.features = features self.label = label self.model = None @@ -36,8 +36,8 @@ def save(self, path: str): "features": self.features, "label": self.label } - with open(save_path / "config.json", "w", encoding="utf-8") as f: - json.dump(config, f) + with open(save_path / "config.json", "w", encoding="utf-8") as file: + json.dump(config, file) joblib.dump(self.model, save_path / "model.joblib") def load(self, path): @@ -46,8 +46,8 @@ def load(self, path): :param path: path to folder to load model files from. """ load_path = Path(path) - with open(load_path / "config.json", "r", encoding="utf-8") as f: - config = json.load(f) + with open(load_path / "config.json", "r", encoding="utf-8") as file: + config = json.load(file) self.features = config["features"] self.label = config["label"] self.model = joblib.load(load_path / "model.joblib") @@ -77,7 +77,7 @@ def predict(self, X_test: pd.DataFrame) -> pd.DataFrame: X_test = X_test[self.features] y_pred = self.model.predict(X_test) return pd.DataFrame(y_pred, index=X_test.index, columns=[self.label]) - + class LinearRegressionPredictor(SKLearnPredictor): """ Simple linear regression predictor. 
@@ -86,7 +86,7 @@ class LinearRegressionPredictor(SKLearnPredictor): def __init__(self, features=None, **kwargs): super().__init__(features) self.model = LinearRegression(**kwargs) - + class RandomForestPredictor(SKLearnPredictor): """ Simple random forest predictor. @@ -108,7 +108,7 @@ def save(self, path: str, compression=0): "features": self.features, "label": self.label } - with open(save_path / "config.json", "w", encoding="utf-8") as f: - json.dump(config, f) + with open(save_path / "config.json", "w", encoding="utf-8") as file: + json.dump(config, file) joblib.dump(self.model, save_path / "model.joblib", compress=compression) \ No newline at end of file diff --git a/use_cases/eluc/prescriptors/esp/train_prescriptors.py b/use_cases/eluc/prescriptors/esp/train_prescriptors.py index 04779f0..44ece23 100644 --- a/use_cases/eluc/prescriptors/esp/train_prescriptors.py +++ b/use_cases/eluc/prescriptors/esp/train_prescriptors.py @@ -42,8 +42,7 @@ esp_password = os.getenv('ESP_SERVICE_PASSWORD') if not esp_username or not esp_password: raise ValueError('ESP Service username and password not found.') - else: - print('ESP Service username and password found.') + print('ESP Service username and password found.') print("Running prescriptor training...") config_path = Path(args.config_path) @@ -63,4 +62,4 @@ eval_df_encoded, dataset.encoder, [nnp]) - experiment_results_dir = esp_service.train(esp_evaluator) \ No newline at end of file + experiment_results_dir = esp_service.train(esp_evaluator) diff --git a/use_cases/eluc/prescriptors/esp/unileaf_prescriptor.py b/use_cases/eluc/prescriptors/esp/unileaf_prescriptor.py index f5dcb2c..525ad14 100644 --- a/use_cases/eluc/prescriptors/esp/unileaf_prescriptor.py +++ b/use_cases/eluc/prescriptors/esp/unileaf_prescriptor.py @@ -3,13 +3,10 @@ is just a guideline for other evolution methods. A similar open-source implementation is available in the "nsga2" directory. """ - from typing import Any from typing import Dict from typing import List -from pathlib import Path - import pandas as pd import numpy as np from keras.models import load_model @@ -101,17 +98,17 @@ def _reco_to_context_actions(self, reco_df: pd.DataFrame, encoded_context_df: pd # Compute the diff # Note: the index need to match in order to subtract. Otherwise we get NaN - prescribed_actions_df = reco_df[constants.RECO_COLS].reset_index(drop=True) - context_df[constants.RECO_COLS].reset_index(drop=True) + prescribed_actions_df = reco_df[constants.RECO_COLS] - context_df[constants.RECO_COLS].reset_index(drop=True) # Rename the columns to match what the predictor expects prescribed_actions_df = prescribed_actions_df.rename(constants.RECO_MAP, axis=1) prescribed_actions_df[constants.NO_CHANGE_COLS] = 0 - + # Aggregate the context and actions dataframes. 
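+        # Note: the concat relies on both frames sharing the same row index;
+        # the prescribed diff columns act as the "actions" the predictor
+        # consumes alongside the unchanged context columns.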
context_actions_df = pd.concat([context_df, prescribed_actions_df[constants.DIFF_LAND_USE_COLS]], axis=1) - + return context_actions_df def evaluate_candidate(self, candidate): @@ -121,24 +118,21 @@ def evaluate_candidate(self, candidate): :param candidate: a Keras neural network or rule based Prescriptor candidate :return metrics: A dictionary of {'metric_name': metric_value} """ - # Save candidate to local file for easy debug - # candidate.save('prescriptor.h5') - # Prescribe actions # Single action, recommended percentage for each land use type # Note: prescribed action is a softmax, NOT encoded in the same scale as the context prescribed_actions_df = self.prescribe(candidate) - + # Convert the softmax into a DataFrame reco_land_use_df = pd.DataFrame(prescribed_actions_df["reco_land_use"].tolist(), columns=constants.RECO_COLS) - + context_actions_df = self._reco_to_context_actions(reco_land_use_df, self.context_df) # Compute the metrics metrics = self._compute_metrics(context_actions_df) return metrics - + def _compute_metrics(self, context_actions_df): """ Computes metrics from the passed context/actions DataFrame using the instance's trained predictors. @@ -146,17 +140,17 @@ def _compute_metrics(self, context_actions_df): :return: A dictionary of {'metric_name': metric_value} """ metrics = {} - + # Get the predicted ELUC from the predictors preds = self.predict_eluc(context_actions_df) metrics['ELUC'] = preds['ELUC'].mean() - + # Compute the % of change change_df = self.compute_percent_changed(context_actions_df) metrics['change'] = change_df['change'].mean() - + return metrics - + def predict_eluc(self, context_actions_df: pd.DataFrame) -> pd.DataFrame: """ Predicts ELUC using the given predictor @@ -213,12 +207,12 @@ def _prescribe_from_nn(self, candidate, context_as_nn_input: List[np.ndarray]) - # Put the single action in an array to process it like multiple actions prescribed_actions = [prescribed_actions] - for i, action_col in enumerate(self.cao_mapping["actions"]): - if self._is_scalar(prescribed_actions[i]): + for idx, action_col in enumerate(self.cao_mapping["actions"]): + if self._is_scalar(prescribed_actions[idx]): # We have a single row and this action is numerical. Convert it to a scalar. - actions[action_col] = prescribed_actions[i].item() + actions[action_col] = prescribed_actions[idx].item() else: - actions[action_col] = prescribed_actions[i].tolist() + actions[action_col] = prescribed_actions[idx].tolist() return actions def _is_single_action_prescriptor(self): @@ -270,7 +264,7 @@ def get_fitness_metrics(config: Dict[str, Any]) -> List[str]: metrics = config["evolution"]["fitness"] fitness_metrics = [metric["metric_name"] for metric in metrics] return fitness_metrics - + def prescribe_land_use(self, context_df: pd.DataFrame, **kwargs) -> pd.DataFrame: """ Implementation of prescribe_land_use. @@ -292,7 +286,7 @@ def prescribe_land_use(self, context_df: pd.DataFrame, **kwargs) -> pd.DataFrame context_actions_df = context_actions_df.set_index(context_df.index) return context_actions_df - + def predict_metrics(self, context_actions_df: pd.DataFrame) -> tuple: """ Predicts ELUC and computes change from the given context_actions_df. 
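 
         A hypothetical call sketch (context_actions_df as returned by
         prescribe_land_use above):
 
             >>> eluc_df, change_df = prescriptor.predict_metrics(context_actions_df)
             >>> eluc_df["ELUC"].mean(), change_df["change"].mean()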
@@ -301,5 +295,3 @@ def predict_metrics(self, context_actions_df: pd.DataFrame) -> tuple: change_df = self.compute_percent_changed(context_actions_df) return eluc_df, change_df - - diff --git a/use_cases/eluc/prescriptors/heuristics/heuristics.py b/use_cases/eluc/prescriptors/heuristics/heuristics.py index 0d0770f..5caa6fe 100644 --- a/use_cases/eluc/prescriptors/heuristics/heuristics.py +++ b/use_cases/eluc/prescriptors/heuristics/heuristics.py @@ -27,7 +27,7 @@ def _reco_heuristic(self, pct: float, context_df: pd.DataFrame) -> pd.DataFrame: context dataframe and returns a dataframe of recommendations based on the heuristic. """ raise NotImplementedError - + def prescribe_land_use(self, context_df: pd.DataFrame, **kwargs) -> pd.DataFrame: """ Implementation of prescribe_land_use using a heuristic. Calls the implementation of _reco_heuristic. @@ -39,7 +39,7 @@ def prescribe_land_use(self, context_df: pd.DataFrame, **kwargs) -> pd.DataFrame # Rename the columns to match what the predictor expects prescribed_actions_df = prescribed_actions_df.rename(constants.RECO_MAP, axis=1) prescribed_actions_df[constants.NO_CHANGE_COLS] = 0 - + # Aggregate the context and actions dataframes. context_actions_df = pd.concat([context_df, prescribed_actions_df[constants.DIFF_LAND_USE_COLS]], axis=1) return context_actions_df @@ -59,7 +59,7 @@ def __init__(self, best_col: str, predictor: Predictor): super().__init__(predictor) self.best_col = best_col self.presc_cols = [col for col in constants.RECO_COLS if col != best_col] - + def _reco_heuristic(self, pct: float, context_df: pd.DataFrame): """ Takes evenly from all columns and adds to best col. @@ -71,11 +71,14 @@ def _reco_heuristic(self, pct: float, context_df: pd.DataFrame): adjusted["row_sum"] = adjusted[self.presc_cols].sum(axis=1) to_change = adjusted["row_sum"] > 0 adjusted["max_change"] = adjusted[["scaled_change", "row_sum"]].min(axis=1) + + max_change = adjusted.loc[to_change, "max_change"] + row_sum = adjusted.loc[to_change, "row_sum"] # Reduce all columns by even amount for col in self.presc_cols: - adjusted.loc[to_change, col] -= adjusted.loc[to_change, col] * adjusted.loc[to_change, "max_change"] / adjusted.loc[to_change, "row_sum"] + adjusted.loc[to_change, col] -= adjusted.loc[to_change, col] * max_change / row_sum # Increase best column by max change - adjusted.loc[to_change, self.best_col] = adjusted.loc[to_change, self.best_col] + adjusted.loc[to_change, "max_change"] + adjusted.loc[to_change, self.best_col] = adjusted.loc[to_change, self.best_col] + max_change adjusted = adjusted.drop(["scaled_change", "row_sum", "max_change"], axis=1) return adjusted diff --git a/use_cases/eluc/prescriptors/nsga2/candidate.py b/use_cases/eluc/prescriptors/nsga2/candidate.py index 59f7bb4..aaa6035 100644 --- a/use_cases/eluc/prescriptors/nsga2/candidate.py +++ b/use_cases/eluc/prescriptors/nsga2/candidate.py @@ -9,7 +9,8 @@ class Candidate(torch.nn.Module): Simple fixed topology 1 hidden layer feed-forward nn candidate. Keeps track of its own metrics and evolution logging information. 
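 
     A hypothetical construction sketch (the sizes are illustrative only):
 
         >>> cand = Candidate(in_size=12, hidden_size=16, out_size=6, device="cpu")
         >>> out = cand(torch.rand(1, 12))  # forward pass through the 1-hidden-layer net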
""" - def __init__(self, in_size: int, hidden_size: int, out_size: int, device="cpu", gen=-1, cand_id=-1, parents=(None, None)): + def __init__(self, in_size: int, hidden_size: int, out_size: int, + device="cpu", gen=-1, cand_id=-1, parents=(None, None)): super().__init__() self.in_size = in_size @@ -20,11 +21,11 @@ def __init__(self, in_size: int, hidden_size: int, out_size: int, device="cpu", torch.nn.Linear(in_size, hidden_size), torch.nn.Tanh(), torch.nn.Linear(hidden_size, out_size)) - + self.device = device self.model.to(device) self.model.eval() - + # Orthogonal initialization for layer in self.model: if isinstance(layer, torch.nn.Linear): @@ -54,20 +55,21 @@ def from_crossover(cls, parent1, parent2, p_mutation: float, gen: int, cand_id: gen=gen, cand_id=cand_id, parents=((parent1.gen, parent1.cand_id), (parent2.gen, parent2.cand_id))) - - for child_param, parent1_param, parent2_param in zip(child.parameters(), parent1.parameters(), parent2.parameters()): + + params = zip(child.parameters(), parent1.parameters(), parent2.parameters()) + for child_param, parent1_param, parent2_param in params: mask = torch.rand(size=child_param.data.shape) < 0.5 child_param.data = torch.where(mask, parent1_param.data, parent2_param.data) child.mutate(p_mutation) return child - + def forward(self, X: torch.Tensor) -> torch.Tensor: """ Forward pass of the simple nn """ out = self.model(X) return out - + def mutate(self, p_mutation: float): """ Randomly mutates each weight with probability p_mutation with gaussian noise mu=0, sigma=0.1 @@ -83,13 +85,12 @@ def record_state(self) -> dict: """ Record the state of the candidate for logging purposes """ - if self.metrics is None: + if not isinstance(self.metrics, tuple): raise ValueError("Candidate has not been evaluated yet") - else: - return {"gen": self.gen, - "id": self.cand_id, - "parents": self.parents, - "NSGA-II_rank": self.rank, # Named this to match ESP - "distance": self.distance, - "ELUC": self.metrics[0], - "change": self.metrics[1]} + return {"gen": self.gen, + "id": self.cand_id, + "parents": self.parents, + "NSGA-II_rank": self.rank, # Named this to match ESP + "distance": self.distance, + "ELUC": self.metrics[0], + "change": self.metrics[1]} diff --git a/use_cases/eluc/prescriptors/nsga2/create_seeds.py b/use_cases/eluc/prescriptors/nsga2/create_seeds.py index 866601c..9b064f1 100644 --- a/use_cases/eluc/prescriptors/nsga2/create_seeds.py +++ b/use_cases/eluc/prescriptors/nsga2/create_seeds.py @@ -48,7 +48,7 @@ def supervised_backprop(save_path: Path, ds: TorchDataset): n += len(X) pbar.set_postfix({"val loss": total_loss / n}) - + torch.save(seed.state_dict(), save_path) def seed_no_change(seed_dir: Path, df: pd.DataFrame, encoded_df: pd.DataFrame): @@ -58,7 +58,7 @@ def seed_no_change(seed_dir: Path, df: pd.DataFrame, encoded_df: pd.DataFrame): ds = TorchDataset(encoded_df[constants.CAO_MAPPING["context"]].to_numpy(), df[constants.RECO_COLS].to_numpy()) seed_dir.mkdir(parents=True, exist_ok=True) supervised_backprop(seed_dir / "no_change.pt", ds) - + def seed_max_change(seed_dir: Path, df: pd.DataFrame, encoded_df: pd.DataFrame): """ Creates a seed that attempts to prescribe the max change to secdf. 
@@ -70,7 +70,9 @@ def seed_max_change(seed_dir: Path, df: pd.DataFrame, encoded_df: pd.DataFrame): max_change_recos[constants.RECO_COLS] = 0 max_change_recos["secdf"] = reco_use - ds = TorchDataset(encoded_df[constants.CAO_MAPPING["context"]].to_numpy(), max_change_recos[constants.RECO_COLS].to_numpy()) + encoded_context_np = encoded_df[constants.CAO_MAPPING["context"]].to_numpy() + max_change_recos_np = max_change_recos[constants.RECO_COLS].to_numpy() + ds = TorchDataset(encoded_context_np, max_change_recos_np) seed_dir.mkdir(parents=True, exist_ok=True) supervised_backprop(seed_dir / "max_change.pt", ds) @@ -80,4 +82,4 @@ def seed_max_change(seed_dir: Path, df: pd.DataFrame, encoded_df: pd.DataFrame): train_df = dataset.train_df.sample(10000) encoded_train_df = dataset.get_encoded_train().loc[train_df.index] seed_no_change(Path("prescriptors/nsga2/seeds/test"), train_df, encoded_train_df) - seed_max_change(Path("prescriptors/nsga2/seeds/test"), train_df, encoded_train_df) \ No newline at end of file + seed_max_change(Path("prescriptors/nsga2/seeds/test"), train_df, encoded_train_df) diff --git a/use_cases/eluc/prescriptors/nsga2/nsga2_utils.py b/use_cases/eluc/prescriptors/nsga2/nsga2_utils.py index 186195f..7f39cc8 100644 --- a/use_cases/eluc/prescriptors/nsga2/nsga2_utils.py +++ b/use_cases/eluc/prescriptors/nsga2/nsga2_utils.py @@ -29,7 +29,7 @@ def fast_non_dominated_sort(candidates: list): front[0].append(p) i = 0 - while front[i] != []: + while front[i]: Q = [] for p in front[i]: for q in S[p]: @@ -81,4 +81,4 @@ def dominates(candidate1: Candidate, candidate2: Candidate): for obj1, obj2 in zip(candidate1.metrics, candidate2.metrics): if obj1 > obj2: return False - return True \ No newline at end of file + return True diff --git a/use_cases/eluc/prescriptors/nsga2/torch_prescriptor.py b/use_cases/eluc/prescriptors/nsga2/torch_prescriptor.py index c698bd3..5b06b62 100644 --- a/use_cases/eluc/prescriptors/nsga2/torch_prescriptor.py +++ b/use_cases/eluc/prescriptors/nsga2/torch_prescriptor.py @@ -1,7 +1,6 @@ """ PyTorch implementation of NSGA-II. """ - import random import shutil from pathlib import Path @@ -79,7 +78,7 @@ def _reco_to_context_actions(self, reco_df: pd.DataFrame, context_df: pd.DataFra presc_actions_df[constants.CAO_MAPPING["actions"]]], axis=1) return context_actions_df - + def _prescribe(self, candidate: Candidate, context_df=None) -> pd.DataFrame: """ Prescribes actions given a candidate and a context. @@ -122,7 +121,7 @@ def predict_metrics(self, context_actions_df: pd.DataFrame) -> pd.DataFrame: """ eluc_df = self.predictor.predict(context_actions_df) change_df = self.compute_percent_changed(context_actions_df) - + return eluc_df, change_df def _evaluate_candidates(self, candidates: list[Candidate]): @@ -155,7 +154,7 @@ def _select_parents(self, candidates: list[Candidate], n_parents: int) -> list[C break parents += front return parents - + def _tournament_selection(self, sorted_parents: list[Candidate]) -> tuple[Candidate, Candidate]: """ Takes two random parents and compares their indices since this is a measure of their performance. 
@@ -195,9 +194,9 @@ def neuroevolution(self, save_path: Path): # Seeding the first generation with trained models if self.seed_dir: seed_paths = list(self.seed_dir.glob("*.pt")) - for i, seed_path in enumerate(seed_paths): + for idx, seed_path in enumerate(seed_paths): print(f"Seeding with {seed_path}...") - parents[i].load_state_dict(torch.load(seed_path)) + parents[idx].load_state_dict(torch.load(seed_path)) offspring = [] for gen in tqdm(range(1, self.n_generations+1)): @@ -220,7 +219,7 @@ def neuroevolution(self, save_path: Path): results_df.to_csv(save_path / "results.csv", index=False) return parents - + def _record_gen_results(self, gen: int, candidates: list[Candidate], save_path: Path) -> None: """ Records the state of all the candidates. @@ -257,7 +256,7 @@ def prescribe_land_use(self, context_df: pd.DataFrame, **kwargs) -> pd.DataFrame gen = int(kwargs["cand_id"].split("_")[0]) state_dict = torch.load(kwargs["results_dir"] / f"{gen + 1}" / f"{kwargs['cand_id']}.pt") candidate.load_state_dict(state_dict) - + context_actions_df = self._prescribe(candidate, context_df) return context_actions_df \ No newline at end of file diff --git a/use_cases/eluc/prescriptors/nsga2/train_prescriptors.py b/use_cases/eluc/prescriptors/nsga2/train_prescriptors.py index 7d647b3..a0c8306 100644 --- a/use_cases/eluc/prescriptors/nsga2/train_prescriptors.py +++ b/use_cases/eluc/prescriptors/nsga2/train_prescriptors.py @@ -1,3 +1,6 @@ +""" +Script to train the NSGA-II prescriptors. +""" from pathlib import Path from data import constants @@ -29,4 +32,4 @@ print("Training prescriptors...") save_path = Path("prescriptors/nsga2/trained_prescriptors/test") final_pop = tp.neuroevolution(save_path) - print("Done!") \ No newline at end of file + print("Done!") diff --git a/use_cases/eluc/prescriptors/prescriptor.py b/use_cases/eluc/prescriptors/prescriptor.py index 0792ecb..329e70a 100644 --- a/use_cases/eluc/prescriptors/prescriptor.py +++ b/use_cases/eluc/prescriptors/prescriptor.py @@ -1,3 +1,6 @@ +""" +Abstract prescriptor class to be implemented. +""" from abc import ABC import pandas as pd @@ -16,7 +19,7 @@ def prescribe_land_use(self, context_df: pd.DataFrame, **kwargs) -> pd.DataFrame Outputs a concatenation of the context and actions. """ raise NotImplementedError - + def predict_metrics(self, context_actions_df: pd.DataFrame) -> tuple: """ Takes in a context actions dataframe and uses the predictor the prescriptor @@ -24,13 +27,14 @@ def predict_metrics(self, context_actions_df: pd.DataFrame) -> tuple: Returns a dataframe of ELUC and change. """ raise NotImplementedError - + def compute_percent_changed(self, context_actions_df: pd.DataFrame) -> pd.DataFrame: """ Calculates percent of land changed by prescriptor. """ # Sum the positive diffs - percent_changed = context_actions_df[context_actions_df[constants.DIFF_LAND_USE_COLS] > 0][constants.DIFF_LAND_USE_COLS].sum(axis=1) + pos_diffs = context_actions_df[context_actions_df[constants.DIFF_LAND_USE_COLS] > 0] + percent_changed = pos_diffs[constants.DIFF_LAND_USE_COLS].sum(axis=1) # Divide by sum of used land total_land = context_actions_df[constants.LAND_USE_COLS].sum(axis=1) total_land = total_land.replace(0, 1) # Avoid division by 0
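
To make compute_percent_changed above concrete, here is a standalone sketch of
the same arithmetic (the column subsets and values are illustrative stand-ins
for constants.DIFF_LAND_USE_COLS and constants.LAND_USE_COLS):

    import pandas as pd

    DIFF_COLS = ["crop_diff", "pastr_diff"]
    USE_COLS = ["crop", "pastr"]

    df = pd.DataFrame({"crop": [0.6], "pastr": [0.4],
                       "crop_diff": [-0.1], "pastr_diff": [0.1]})
    pos_diffs = df[df[DIFF_COLS] > 0][DIFF_COLS].sum(axis=1)  # only growth counts
    total_land = df[USE_COLS].sum(axis=1).replace(0, 1)       # avoid division by 0
    percent_changed = pos_diffs / total_land                  # 0.1 -> 10% of land changed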