Merge pull request #81 from Project-Resilience/refactor_nnp

Refactor predictor loading and saving
Project-Resilience · May 14, 2024 · 29234c8 · 29234c8
2 parents f345385 + c4bde78
commit 29234c8
Show file tree

Hide file tree

Showing 7 changed files with 85 additions and 69 deletions.
diff --git a/use_cases/eluc/experiments/predictor_experiments.ipynb b/use_cases/eluc/experiments/predictor_experiments.ipynb
@@ -82,7 +82,7 @@
     "    \"train_pct\": 1,\n",
     "    \"step_lr_params\": {\"step_size\": 1, \"gamma\": 0.1},\n",
     "}\n",
-    "nnp = NeuralNetPredictor(**nn_config)"
+    "nnp = NeuralNetPredictor(nn_config)"
    ]
   },
   {
@@ -109,7 +109,7 @@
     }
    ],
    "source": [
-    "nnp.load(\"predictors/neural_network/trained_models/experiment_nn\")\n",
+    "nnp = NeuralNetPredictor.load(\"predictors/neural_network/trained_models/experiment_nn\")\n",
     "print(f\"MAE Neural Net: {mean_absolute_error(dataset.test_df[nn_config['label']], nnp.predict(dataset.test_df[nn_config['features']]))}\")"
    ]
   },
@@ -130,7 +130,7 @@
     "    \"features\": constants.DIFF_LAND_USE_COLS,\n",
     "    \"n_jobs\": -1,\n",
     "}\n",
-    "linreg = LinearRegressionPredictor(**linreg_config)"
+    "linreg = LinearRegressionPredictor(linreg_config)"
    ]
   },
   {
@@ -157,7 +157,7 @@
     }
    ],
    "source": [
-    "linreg.load(\"predictors/sklearn/trained_models/experiment_linreg\")\n",
+    "linreg = LinearRegressionPredictor.load(\"predictors/sklearn/trained_models/experiment_linreg\")\n",
     "print(f\"MAE Linear Regression: {mean_absolute_error(dataset.test_df['ELUC'], linreg.predict(dataset.test_df[constants.DIFF_LAND_USE_COLS]))}\")"
    ]
   },
@@ -175,11 +175,12 @@
    "outputs": [],
    "source": [
     "forest_config = {\n",
+    "    \"features\": constants.NN_FEATS,\n",
     "    \"n_jobs\": -1,\n",
     "    \"max_features\": \"sqrt\",\n",
     "    \"random_state\": 42\n",
     "}\n",
-    "forest = RandomForestPredictor(features=constants.NN_FEATS, **forest_config)"
+    "forest = RandomForestPredictor(forest_config)"
    ]
   },
   {
@@ -208,7 +209,7 @@
     }
    ],
    "source": [
-    "forest.load(\"predictors/sklearn/trained_models/experiment_rf\")\n",
+    "forest = RandomForestPredictor.load(\"predictors/sklearn/trained_models/experiment_rf\")\n",
     "print(f\"MAE Random Forest: {mean_absolute_error(dataset.test_df['ELUC'], forest.predict(dataset.test_df[constants.NN_FEATS]))}\")"
    ]
   },

diff --git a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py b/use_cases/eluc/predictors/neural_network/neural_net_predictor.py
@@ -2,7 +2,6 @@
 Implementation of predictor.py using a simple feed-forward NeuralNetwork
 implemented in PyTorch.
 """
-
 import copy
 import json
 import time
@@ -79,57 +78,62 @@ class NeuralNetPredictor(Predictor):
     in order to take advantage of the linear relationship in the data.
     Data is automatically standardized and the scaler is saved with the model.
     """
-    def __init__(self, features=None, label=None, hidden_sizes=[4096], linear_skip=True,
-                 dropout=0, device="mps", epochs=3, batch_size=2048, optim_params={},
-                 train_pct=1, step_lr_params={"step_size": 1, "gamma": 0.1}):
-
-        self.features=None
-        self.label=None
-
-        self.set_params(features, label, hidden_sizes, linear_skip,
-                        dropout, device, epochs, batch_size, optim_params,
-                        train_pct, step_lr_params)
+    def __init__(self, model_config: dict):
+        """
+        Model config may contain the following keys (all optional; missing keys fall back to the defaults noted):
+        features: list of features to use in the model (optional, defaults to all features)
+        label: name of the label column (optional, defaults to passed label in fit)
+        hidden_sizes: list of hidden layer sizes (default [4096])
+        linear_skip: whether to concatenate input to hidden layer output (default True)
+        dropout: dropout probability (default 0)
+        device: device to run the model on (default "cpu")
+        epochs: number of epochs to train for (default 3)
+        batch_size: batch size for training (default 2048)
+        optim_params: dictionary of parameters to pass to the optimizer (default {})
+        train_pct: percentage of training data to use (default 1)
+        step_lr_params: dictionary of parameters to pass to the step learning rate scheduler (default {"step_size": 1, "gamma": 0.1})
+        """
+
+        self.features = model_config.get("features", None)
+        self.label = model_config.get("label", None)
+        self.hidden_sizes = model_config.get("hidden_sizes", [4096])
+        self.linear_skip = model_config.get("linear_skip", True)
+        self.dropout = model_config.get("dropout", 0)
+        self.device = model_config.get("device", "cpu")
+        self.epochs = model_config.get("epochs", 3)
+        self.batch_size = model_config.get("batch_size", 2048)
+        self.optim_params = model_config.get("optim_params", {})
+        self.train_pct = model_config.get("train_pct", 1)
+        self.step_lr_params = model_config.get("step_lr_params", {"step_size": 1, "gamma": 0.1})
 
         self.model = None
         self.scaler = StandardScaler()
 
-    def set_params(self, features, label, hidden_sizes, linear_skip,
-                   dropout, device, epochs, batch_size, optim_params,
-                   train_pct, step_lr_params):
-        """
-        Set all the parameters for the neural network.
-        """
-        self.features = features
-        self.label = label
-        self.hidden_sizes = hidden_sizes
-        self.linear_skip = linear_skip
-        self.dropout = dropout
-        self.device = device
-        self.epochs = epochs
-        self.batch_size = batch_size
-        self.optim_params = optim_params
-        self.train_pct = train_pct
-        self.step_lr_params = step_lr_params
-
-    def load(self, path: str):
+    @classmethod
+    def load(cls, path: str) -> "NeuralNetPredictor":
         """
         Loads a model from a given folder containing a config.json, model.pt, and scaler.joblib.
         :param path: path to folder containing model files.
         """
-        load_path = Path(path)
+        if isinstance(path, str):
+            load_path = Path(path)
+        else:
+            load_path = path
         if not load_path.exists():
             raise FileNotFoundError(f"Path {path} does not exist.")
 
         # Initialize model with config
         with open(load_path / "config.json", "r", encoding="utf-8") as file:
             config = json.load(file)
-        self.set_params(**config)
 
-        self.model = ELUCNeuralNet(len(self.features), self.hidden_sizes, self.linear_skip, self.dropout)
-        self.model.load_state_dict(torch.load(load_path / "model.pt"))
-        self.model.to(self.device)
-        self.model.eval()
-        self.scaler = joblib.load(load_path / "scaler.joblib")
+        nnp = cls(config)
+
+        nnp.model = ELUCNeuralNet(len(config["features"]), config["hidden_sizes"], config["linear_skip"], config["dropout"])
+        nnp.model.load_state_dict(torch.load(load_path / "model.pt"))
+        nnp.model.to(config["device"])
+        nnp.model.eval()
+        nnp.scaler = joblib.load(load_path / "scaler.joblib")
+        return nnp
 
 
     def save(self, path: str):

diff --git a/use_cases/eluc/predictors/predictor.py b/use_cases/eluc/predictors/predictor.py
@@ -45,9 +45,9 @@ def save(self, path: str):
         :param path: path to save the model
         """
 
-
+    @classmethod
     @abstractmethod
-    def load(self, path: str):
+    def load(cls, path: str) -> "Predictor":
         """
         Loads a model from a path.
         :param path: path to the model

diff --git a/use_cases/eluc/predictors/sklearn/sklearn_predictor.py b/use_cases/eluc/predictors/sklearn/sklearn_predictor.py
@@ -19,9 +19,15 @@ class SKLearnPredictor(Predictor, ABC):
     Simple abstract class for sklearn predictors.
     Keeps track of features fit on and label to predict.
     """
-    def __init__(self, features=None, label=None):
-        self.features = features
-        self.label = label
+    def __init__(self, model_config: dict):
+        """
+        Model config may contain the following keys (both optional):
+        features: list of features to use for prediction (optional, defaults to all features)
+        label: name of the label to predict (optional, defaults to passed label during fit)
+        """
+        self.features = model_config.get("features", None)
+        self.label = model_config.get("label", None)
+
         self.model = None
 
     def save(self, path: str):
@@ -30,7 +36,10 @@ def save(self, path: str):
         Generates path to folder if it does not exist.
         :param path: path to folder to save model files.
         """
-        save_path = Path(path)
+        if isinstance(path, str):
+            save_path = Path(path)
+        else:
+            save_path = path
         save_path.mkdir(parents=True, exist_ok=True)
         config = {
             "features": self.features,
@@ -40,17 +49,18 @@ def save(self, path: str):
             json.dump(config, file)
         joblib.dump(self.model, save_path / "model.joblib")
 
-    def load(self, path):
+    @classmethod
+    def load(cls, path) -> "SKLearnPredictor":
         """
         Loads saved model and features from a folder.
         :param path: path to folder to load model files from.
         """
         load_path = Path(path)
         with open(load_path / "config.json", "r", encoding="utf-8") as file:
             config = json.load(file)
-            self.features = config["features"]
-            self.label = config["label"]
-        self.model = joblib.load(load_path / "model.joblib")
+        sklearn_predictor = cls(config)
+        sklearn_predictor.model = joblib.load(load_path / "model.joblib")
+        return sklearn_predictor
 
     def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
         """
@@ -83,19 +93,23 @@ class LinearRegressionPredictor(SKLearnPredictor):
     Simple linear regression predictor.
     See SKLearnPredictor for more details.
     """
-    def __init__(self, features=None, **kwargs):
-        super().__init__(features)
-        self.model = LinearRegression(**kwargs)
+    def __init__(self, model_config: dict):
+        super().__init__(model_config)
+        model_config.pop("features", None)
+        model_config.pop("label", None)
+        self.model = LinearRegression(**model_config)
 
 class RandomForestPredictor(SKLearnPredictor):
     """
     Simple random forest predictor.
     See SKLearnPredictor for more details.
     Overrides save method in order to compress it.
     """
-    def __init__(self, features=None, **kwargs):
-        super().__init__(features)
-        self.model = RandomForestRegressor(**kwargs)
+    def __init__(self, model_config: dict):
+        super().__init__(model_config)
+        model_config.pop("features", None)
+        model_config.pop("label", None)
+        self.model = RandomForestRegressor(**model_config)
 
     def save(self, path: str, compression=0):
         """

diff --git a/use_cases/eluc/prescriptors/nsga2/train_prescriptors.py b/use_cases/eluc/prescriptors/nsga2/train_prescriptors.py
@@ -25,9 +25,7 @@
 
     print("Loading predictor...")
     # TODO: We need to make it so you can load any predictor here
-    nnp = NeuralNetPredictor()
-    nnp_path = Path(config["predictor_path"])
-    nnp.load(nnp_path)
+    nnp = NeuralNetPredictor.load(Path(config["predictor_path"]))
 
     print("Initializing prescription...")
     if "seed_dir" in config["evolution_params"].keys():

diff --git a/use_cases/eluc/tests/test_nsga2.py b/use_cases/eluc/tests/test_nsga2.py
@@ -69,7 +69,7 @@ def setUpClass(cls):
         fields = get_fields(cls.dummy_data)
         encoder = ELUCEncoder(fields)
 
-        predictor = LinearRegressionPredictor(features=constants.DIFF_LAND_USE_COLS, n_jobs=-1)
+        predictor = LinearRegressionPredictor(dict(features=constants.DIFF_LAND_USE_COLS, n_jobs=-1))
         predictor.fit(cls.dummy_data[constants.DIFF_LAND_USE_COLS], cls.dummy_data["ELUC"])
         cls.prescriptor = TorchPrescriptor(
             100,

diff --git a/use_cases/eluc/tests/test_predictors.py b/use_cases/eluc/tests/test_predictors.py
@@ -45,7 +45,7 @@ def test_save_file_names(self):
         ]
         for model, config, test_names in zip(self.models, self.configs, save_file_names):
             with self.subTest(model=model):
-                predictor = model(**config)
+                predictor = model(config)
                 predictor.fit(self.dummy_data, self.dummy_target)
                 predictor.save(self.temp_path)
                 files = [f.name for f in self.temp_path.glob("**/*") if f.is_file()]
@@ -61,13 +61,12 @@ def test_loaded_same(self):
 
         for model, config in zip(self.models, self.configs):
             with self.subTest(model=model):
-                predictor = model(**config)
+                predictor = model(config)
                 predictor.fit(self.dummy_data.iloc[:2], self.dummy_target.iloc[:2])
                 output = predictor.predict(self.dummy_data.iloc[2:])
                 predictor.save(self.temp_path)
 
-                loaded = model(**config)
-                loaded.load(self.temp_path)
+                loaded = model.load(self.temp_path)
                 loaded_output = loaded.predict(self.dummy_data.iloc[2:])
 
                 self.assertTrue((output == loaded_output).all().all()) # Pandas is so annoying why is this necessary?
@@ -91,7 +90,7 @@ def test_single_input(self):
         """
         Tests the neural net with a single input.
         """
-        predictor = NeuralNetPredictor(hidden_sizes=[4], epochs=1, batch_size=1, device="cpu")
+        predictor = NeuralNetPredictor(dict(hidden_sizes=[4], epochs=1, batch_size=1, device="cpu"))
 
         train_data = pd.DataFrame({"a": [1], "b": [2], "c": [3], "label": [4]})
         test_data = pd.DataFrame({"a": [4], "b": [5], "c": [6]})
@@ -104,7 +103,7 @@ def test_multi_input(self):
         """
         Tests the neural net with multiple inputs.
         """
-        predictor = NeuralNetPredictor(hidden_sizes=[4], epochs=1, batch_size=1, device="cpu")
+        predictor = NeuralNetPredictor(dict(hidden_sizes=[4], epochs=1, batch_size=1, device="cpu"))
 
         train_data = pd.DataFrame({"a": [1, 2], "b": [2, 3], "c": [3, 4], "label": [4, 5]})
         test_data = pd.DataFrame({"a": [4, 5], "b": [5, 6], "c": [6, 7]})