From 4ea59283b9402a05de703b7bb8e840a46471319a Mon Sep 17 00:00:00 2001 From: Daniel Young Date: Thu, 16 May 2024 16:06:19 -0700 Subject: [PATCH 1/5] Added from_pretrained to load predictor from hf. Refactored some things to work with this --- .../neural_network/neural_net_predictor.py | 11 +++-- use_cases/eluc/predictors/predictor.py | 34 ++++++++++++--- .../predictors/sklearn/sklearn_predictor.py | 42 +++++++------------ 3 files changed, 49 insertions(+), 38 deletions(-) diff --git a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py b/use_cases/eluc/predictors/neural_network/neural_net_predictor.py index 25a605f..b345bb1 100644 --- a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py +++ b/use_cases/eluc/predictors/neural_network/neural_net_predictor.py @@ -4,15 +4,14 @@ """ import copy import json -import time from pathlib import Path +import time - +import joblib import numpy as np import pandas as pd -import joblib -from tqdm import tqdm from sklearn.preprocessing import StandardScaler +from tqdm import tqdm import torch from torch.utils.data import DataLoader @@ -112,9 +111,10 @@ def __init__(self, model_config: dict): @classmethod def load(cls, path: str) -> "NeuralNetPredictor": """ - Loads a model from a given folder containing a config.json, model.pt, and scaler.joblib. + Loads a model from a given folder or huggingface repo containing a config.json, model.pt, and scaler.joblib. :param path: path to folder containing model files. """ + if isinstance(path, str): load_path = Path(path) else: @@ -135,7 +135,6 @@ def load(cls, path: str) -> "NeuralNetPredictor": nnp.scaler = joblib.load(load_path / "scaler.joblib") return nnp - def save(self, path: str): """ Saves model, config, and scaler into format for loading. diff --git a/use_cases/eluc/predictors/predictor.py b/use_cases/eluc/predictors/predictor.py index 9f677d9..5663b71 100644 --- a/use_cases/eluc/predictors/predictor.py +++ b/use_cases/eluc/predictors/predictor.py @@ -2,7 +2,9 @@ Abstract class for predictors to inherit from. """ from abc import ABC, abstractmethod +from pathlib import Path +from huggingface_hub import snapshot_download import pandas as pd @@ -25,7 +27,7 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series): It is up to the model to decide which columns to use. :param y_train: series with target data """ - + raise NotImplementedError @abstractmethod def predict(self, X_test: pd.DataFrame) -> pd.DataFrame: @@ -36,7 +38,7 @@ def predict(self, X_test: pd.DataFrame) -> pd.DataFrame: :param X_test: DataFrame with input data :return: DataFrame with predictions """ - + raise NotImplementedError @abstractmethod def save(self, path: str): @@ -44,11 +46,31 @@ def save(self, path: str): Saves the model to a path. :param path: path to save the model """ + raise NotImplementedError @classmethod - @abstractmethod - def load(cls, path: str) -> "Predictor": + def from_pretrained(cls, path_or_url: str, **hf_args) -> "Predictor": + """ + Loads a model from a path or if this path is not found, searches for it on huggingface and loads from there. + :param path: path to the model or url to the huggingface repo. + """ + path = Path(path_or_url) + if path.exists() and path.is_dir(): + return cls.load(path) + else: + # TODO: Need a try except block to catch download errors + url_path = path_or_url.replace("/", "--") + local_dir = hf_args.get("local_dir", f"predictors/trained_models/{url_path}") + + if not Path(local_dir).exists() or not Path(local_dir).is_dir(): + hf_args["local_dir"] = local_dir + snapshot_download(repo_id=path_or_url, **hf_args) + + return cls.load(Path(local_dir)) + + @classmethod + def load(cls, path: Path) -> "Predictor": """ - Loads a model from a path. - :param path: path to the model + Loads a model from the path """ + raise NotImplementedError \ No newline at end of file diff --git a/use_cases/eluc/predictors/sklearn/sklearn_predictor.py b/use_cases/eluc/predictors/sklearn/sklearn_predictor.py index 8079d75..f25a03a 100644 --- a/use_cases/eluc/predictors/sklearn/sklearn_predictor.py +++ b/use_cases/eluc/predictors/sklearn/sklearn_predictor.py @@ -24,10 +24,9 @@ def __init__(self, model_config: dict): Model config contains the following: features: list of features to use for prediction (optional, defaults to all features) label: name of the label to predict (optional, defaults to passed label during fit) + Any other parameters are passed to the model. """ - self.features = model_config.get("features", None) - self.label = model_config.get("label", None) - + self.config = model_config self.model = None def save(self, path: str): @@ -41,12 +40,8 @@ def save(self, path: str): else: save_path = path save_path.mkdir(parents=True, exist_ok=True) - config = { - "features": self.features, - "label": self.label - } with open(save_path / "config.json", "w", encoding="utf-8") as file: - json.dump(config, file) + json.dump(self.config, file) joblib.dump(self.model, save_path / "model.joblib") @classmethod @@ -69,11 +64,11 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series): :param X_train: DataFrame with input data :param y_train: series with target data """ - if self.features: - X_train = X_train[self.features] + if "features" in self.config: + X_train = X_train[self.config["features"]] else: - self.features = list(X_train.columns) - self.label = y_train.name + self.config["features"] = list(X_train.columns) + self.config["label"] = y_train.name self.model.fit(X_train, y_train) def predict(self, X_test: pd.DataFrame) -> pd.DataFrame: @@ -83,10 +78,9 @@ def predict(self, X_test: pd.DataFrame) -> pd.DataFrame: :param X_test: DataFrame with input data :return: properly labeled DataFrame with predictions and matching index. """ - if self.features: - X_test = X_test[self.features] + X_test = X_test[self.config["features"]] y_pred = self.model.predict(X_test) - return pd.DataFrame(y_pred, index=X_test.index, columns=[self.label]) + return pd.DataFrame(y_pred, index=X_test.index, columns=[self.config["label"]]) class LinearRegressionPredictor(SKLearnPredictor): """ @@ -94,10 +88,11 @@ class LinearRegressionPredictor(SKLearnPredictor): See SKLearnPredictor for more details. """ def __init__(self, model_config: dict): + if not model_config: + model_config = {} super().__init__(model_config) - model_config.pop("features", None) - model_config.pop("label", None) - self.model = LinearRegression(**model_config) + lr_config = {key: value for key, value in model_config.items() if key not in ["features", "label"]} + self.model = LinearRegression(**lr_config) class RandomForestPredictor(SKLearnPredictor): """ @@ -107,9 +102,8 @@ class RandomForestPredictor(SKLearnPredictor): """ def __init__(self, model_config: dict): super().__init__(model_config) - model_config.pop("features", None) - model_config.pop("label", None) - self.model = RandomForestRegressor(**model_config) + rf_config = {key: value for key, value in model_config.items() if key not in ["features", "label"]} + self.model = RandomForestRegressor(**rf_config) def save(self, path: str, compression=0): """ @@ -118,11 +112,7 @@ def save(self, path: str, compression=0): """ save_path = Path(path) save_path.mkdir(parents=True, exist_ok=True) - config = { - "features": self.features, - "label": self.label - } with open(save_path / "config.json", "w", encoding="utf-8") as file: - json.dump(config, file) + json.dump(self.config, file) joblib.dump(self.model, save_path / "model.joblib", compress=compression) \ No newline at end of file From 1429fc62f5be2da6c27004667390e898848c5d28 Mon Sep 17 00:00:00 2001 From: Daniel Young Date: Thu, 16 May 2024 16:12:52 -0700 Subject: [PATCH 2/5] Added script to upload model to huggingface --- use_cases/eluc/predictors/upload_model.py | 49 +++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 use_cases/eluc/predictors/upload_model.py diff --git a/use_cases/eluc/predictors/upload_model.py b/use_cases/eluc/predictors/upload_model.py new file mode 100644 index 0000000..046fd37 --- /dev/null +++ b/use_cases/eluc/predictors/upload_model.py @@ -0,0 +1,49 @@ +""" +Script to upload a model to huggingface hub. +""" +from argparse import ArgumentParser +from pathlib import Path + +from huggingface_hub import HfApi + +def write_readme(model_path: str): + """ + Writes readme to model save path to upload. + TODO: Need to add more info to the readme and make it a proper template. + """ + model_path = Path(model_path) + with open(model_path / "README.md", "w", encoding="utf-8") as f: + f.write("This is a demo model created for project resilience") + +def upload_to_repo(model_path: str, repo_id: str, token: str=None): + """ + Uses huggingface hub to upload the model to a repo. + """ + model_path = Path(model_path) + api = HfApi() + api.create_repo( + repo_id=repo_id, + repo_type="model", + exist_ok=True, + token=token + ) + + api.upload_folder( + folder_path=model_path, + repo_id=repo_id, + repo_type="model", + token=token + ) + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--model_path", type=str, required=True) + parser.add_argument("--repo_id", type=str, required=True) + parser.add_argument("--token", type=str, required=False) + args = parser.parse_args() + + write_readme(args.model_path) + upload_args = {"model_path": args.model_path, "repo_id": args.repo_id} + if args.token: + upload_args["token"] = args.token + upload_to_repo(**upload_args) From 09e200f8afc0b68f9d9b1a9196fe34f9fba9f8f0 Mon Sep 17 00:00:00 2001 From: Daniel Young Date: Thu, 16 May 2024 16:34:18 -0700 Subject: [PATCH 3/5] Updated documentation to match new save/load --- .../predictors/neural_network/neural_net_predictor.py | 4 ++-- use_cases/eluc/predictors/predictor.py | 10 ++++++---- use_cases/eluc/predictors/sklearn/sklearn_predictor.py | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py b/use_cases/eluc/predictors/neural_network/neural_net_predictor.py index b345bb1..598d74c 100644 --- a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py +++ b/use_cases/eluc/predictors/neural_network/neural_net_predictor.py @@ -111,7 +111,7 @@ def __init__(self, model_config: dict): @classmethod def load(cls, path: str) -> "NeuralNetPredictor": """ - Loads a model from a given folder or huggingface repo containing a config.json, model.pt, and scaler.joblib. + Loads a model from a given folder. :param path: path to folder containing model files. """ @@ -119,7 +119,7 @@ def load(cls, path: str) -> "NeuralNetPredictor": load_path = Path(path) else: load_path = path - if not load_path.exists(): + if not load_path.exists() or not load_path.is_dir(): raise FileNotFoundError(f"Path {path} does not exist.") # Initialize model with config diff --git a/use_cases/eluc/predictors/predictor.py b/use_cases/eluc/predictors/predictor.py index 5663b71..eaef851 100644 --- a/use_cases/eluc/predictors/predictor.py +++ b/use_cases/eluc/predictors/predictor.py @@ -43,7 +43,7 @@ def predict(self, X_test: pd.DataFrame) -> pd.DataFrame: @abstractmethod def save(self, path: str): """ - Saves the model to a path. + Saves the model to a local path. :param path: path to save the model """ raise NotImplementedError @@ -51,8 +51,9 @@ def save(self, path: str): @classmethod def from_pretrained(cls, path_or_url: str, **hf_args) -> "Predictor": """ - Loads a model from a path or if this path is not found, searches for it on huggingface and loads from there. - :param path: path to the model or url to the huggingface repo. + Loads a model from a path or if it is not found, from a huggingface repo. + :param path_or_url: path to the model or url to the huggingface repo. + :param hf_args: arguments to pass to the snapshot_download function from huggingface. """ path = Path(path_or_url) if path.exists() and path.is_dir(): @@ -71,6 +72,7 @@ def from_pretrained(cls, path_or_url: str, **hf_args) -> "Predictor": @classmethod def load(cls, path: Path) -> "Predictor": """ - Loads a model from the path + Loads a model from the path on disk. + :param path: path to the model """ raise NotImplementedError \ No newline at end of file diff --git a/use_cases/eluc/predictors/sklearn/sklearn_predictor.py b/use_cases/eluc/predictors/sklearn/sklearn_predictor.py index f25a03a..542059f 100644 --- a/use_cases/eluc/predictors/sklearn/sklearn_predictor.py +++ b/use_cases/eluc/predictors/sklearn/sklearn_predictor.py @@ -47,7 +47,7 @@ def save(self, path: str): @classmethod def load(cls, path) -> "SKLearnPredictor": """ - Loads saved model and features from a folder. + Loads saved model and config from a local folder. :param path: path to folder to load model files from. """ load_path = Path(path) From f02d134f9f27760528338a8ba02ec85c86c9fbb8 Mon Sep 17 00:00:00 2001 From: Daniel Young Date: Thu, 16 May 2024 16:34:32 -0700 Subject: [PATCH 4/5] Updated gitignore to not upload models downloaded from hf --- use_cases/eluc/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/use_cases/eluc/.gitignore b/use_cases/eluc/.gitignore index a6b769c..e3a1a6f 100644 --- a/use_cases/eluc/.gitignore +++ b/use_cases/eluc/.gitignore @@ -1,5 +1,6 @@ # Ignores saved predictors predictors/*/trained_models/ +predictors/trained_models # Ignores predictor significance results experiments/predictor_significance From c075a551724491f2eceb7f5789b2d2525c29cfdb Mon Sep 17 00:00:00 2001 From: Daniel Young Date: Thu, 16 May 2024 16:36:56 -0700 Subject: [PATCH 5/5] Added checks to make sure all our model files are there when we download them --- .../eluc/predictors/neural_network/neural_net_predictor.py | 4 ++++ use_cases/eluc/predictors/sklearn/sklearn_predictor.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py b/use_cases/eluc/predictors/neural_network/neural_net_predictor.py index 598d74c..72df361 100644 --- a/use_cases/eluc/predictors/neural_network/neural_net_predictor.py +++ b/use_cases/eluc/predictors/neural_network/neural_net_predictor.py @@ -121,6 +121,10 @@ def load(cls, path: str) -> "NeuralNetPredictor": load_path = path if not load_path.exists() or not load_path.is_dir(): raise FileNotFoundError(f"Path {path} does not exist.") + if not (load_path / "config.json").exists() or \ + not (load_path / "model.pt").exists() or \ + not (load_path / "scaler.joblib").exists(): + raise FileNotFoundError("Model files not found in path.") # Initialize model with config with open(load_path / "config.json", "r", encoding="utf-8") as file: diff --git a/use_cases/eluc/predictors/sklearn/sklearn_predictor.py b/use_cases/eluc/predictors/sklearn/sklearn_predictor.py index 542059f..2ff700c 100644 --- a/use_cases/eluc/predictors/sklearn/sklearn_predictor.py +++ b/use_cases/eluc/predictors/sklearn/sklearn_predictor.py @@ -51,6 +51,11 @@ def load(cls, path) -> "SKLearnPredictor": :param path: path to folder to load model files from. """ load_path = Path(path) + if not load_path.exists() or not load_path.is_dir(): + raise FileNotFoundError(f"Path {path} does not exist.") + if not (load_path / "config.json").exists() or not (load_path / "model.joblib").exists(): + raise FileNotFoundError("Model files not found in path.") + with open(load_path / "config.json", "r", encoding="utf-8") as file: config = json.load(file) sklearn_predictor = cls(config)