# Predictor Contribution #96
**Review comment:** Instructions on how to create a custom predictor. We link to relevant files to read over to get an idea of how to do it.
@@ -0,0 +1,17 @@
# Custom Predictors

This directory contains custom predictors that can be used with the ELUC use case. Since percent change is measurable, we look for predictors that can predict ELUC.

## Create a Custom Predictor

An example custom predictor can be found in the [template](template) folder. To create a custom predictor, two steps must be completed:

1. Implement the `Predictor` interface, defined in [predictor.py](../predictor.py). It is a simple abstract class requiring a `predict` method that takes in a dataframe of context and actions and returns a dataframe of outcomes.

2. Implement a `load` method, either on the same class or on a dedicated serializer class, that takes in a path to a model on disk and returns an instance of the `Predictor`. (See [serializer.py](../../persistence/serializers/serializer.py) for the serialization interface and [neural_network_serializer.py](../../persistence/serializers/neural_network_serializer.py) for an example implementation.) A minimal sketch of both steps follows this list.
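A minimal sketch of a predictor implementing both steps (the `TemplatePredictor` file later in this diff is the complete version; the constant ELUC value here is purely illustrative):

```python
import pandas as pd

from data import constants
from predictors.predictor import Predictor


class MyPredictor(Predictor):
    """Sketch only: predicts a constant ELUC value for every input row."""

    def __init__(self):
        # The Predictor base class records which columns are context, actions, and outcomes.
        super().__init__(context=constants.CAO_MAPPING["context"],
                         actions=constants.CAO_MAPPING["actions"],
                         outcomes=constants.CAO_MAPPING["outcomes"])

    def predict(self, context_actions_df: pd.DataFrame) -> pd.DataFrame:
        # Step 1: return one outcome row per input row, aligned on the same index.
        return pd.DataFrame({"ELUC": 0.0}, index=context_actions_df.index)

    @classmethod
    def load(cls, path: str) -> "MyPredictor":
        # Step 2: load may live on the predictor itself or on a serializer class.
        return cls()
```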

Finally, you must add your custom predictor to the [config](../scoring/config.json) file in order to score it.
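For example, the template predictor's entry in that config (reproduced from the [config](../scoring/config.json) file shown later in this diff):

```json
{
    "type": "local",
    "name": "TemplatePredictor",
    "classpath": "predictors/custom/template/template_predictor.py",
    "filepath": "predictors/custom/template/model.pt"
}
```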

### Load from HuggingFace

To load a custom model saved on HuggingFace, see the [HuggingFacePersistor](../../persistence/persistors/hf_persistor.py) class. It takes in a `FileSerializer` to download a HuggingFace model to disk and then load it. An example of how to score a model from HuggingFace can be found in the [config](../scoring/config.json).
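A usage sketch, mirroring how the scorer below calls the persistor (the serializer class and repo name are taken from the config entries; the serializer's import path is assumed from its classpath):

```python
from persistence.persistors.hf_persistor import HuggingFacePersistor
from persistence.serializers.neural_network_serializer import NeuralNetSerializer

# Download the model from HuggingFace to a local dir, then load it from disk.
persistor = HuggingFacePersistor(NeuralNetSerializer())
predictor = persistor.from_pretrained(
    "danyoung/eluc-global-nn",
    local_dir="predictors/trained_models/danyoung--eluc-global-nn"
)
```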
@@ -0,0 +1,33 @@
""" | ||
See here for how to impelement a predictor: | ||
""" | ||
import pandas as pd | ||

from data import constants
from predictors.predictor import Predictor

class TemplatePredictor(Predictor):
    """
    A template predictor returning dummy values for ELUC.
    The class that gets passed into the Evaluator should call the load method, which should return a Predictor.
    The Predictor just needs to implement predict.
    """
    def __init__(self):
        super().__init__(context=constants.CAO_MAPPING["context"],
                         actions=constants.CAO_MAPPING["actions"],
                         outcomes=constants.CAO_MAPPING["outcomes"])

    def fit(self, X_train, y_train):
        # Nothing to fit: the template just returns dummy values.
        pass

    def predict(self, context_actions_df: pd.DataFrame) -> pd.DataFrame:
        # One dummy ELUC value per row, aligned on the input index.
        dummy_eluc = list(range(len(context_actions_df)))
        return pd.DataFrame({"ELUC": dummy_eluc}, index=context_actions_df.index)

**Review comment:** Dummy predictor just to show people how to create one. It just returns sequential dummy values.

    @classmethod
    def load(cls, path: str) -> "TemplatePredictor":
        """
        Dummy load function that just returns a new instance of the class.
        """
        print("Loading model from", path)
        return cls()

**Review comment:** The dummy predictor implements the load function itself rather than having its own serializer, to show that this is possible. All you need is a load and a predict function, but the serializer makes things nicer in our official predictors.
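A quick usage sketch of the template (the path argument is ignored by the dummy load; the input dataframe here is hypothetical):

```python
import pandas as pd

# Hypothetical two-row input; real inputs carry the full context and actions columns.
context_actions_df = pd.DataFrame({"some_context": [0.1, 0.2]})

predictor = TemplatePredictor.load("predictors/custom/template/model.pt")
outcomes_df = predictor.predict(context_actions_df)
print(outcomes_df)  # one "ELUC" column with dummy values 0 and 1
```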
@@ -0,0 +1,31 @@
{
    "models": [
        {
            "type": "local",
            "name": "TemplatePredictor",
            "classpath": "predictors/custom/template/template_predictor.py",
            "filepath": "predictors/custom/template/model.pt"
        },
        {
            "type": "hf",
            "name": "NeuralNetSerializer",
            "classpath": "persistence/serializers/neural_network_serializer.py",
            "url": "danyoung/eluc-global-nn",
            "filepath": "predictors/trained_models/danyoung--eluc-global-nn"
        },
        {
            "type": "hf",
            "name": "SKLearnSerializer",
            "url": "danyoung/eluc-global-linreg",
            "classpath": "persistence/serializers/sklearn_serializer.py",
            "filepath": "predictors/trained_models/danyoung--eluc-global-linreg"
        },
        {
            "type": "hf",
            "name": "SKLearnSerializer",
            "url": "danyoung/eluc-global-rf",
            "classpath": "persistence/serializers/sklearn_serializer.py",
            "filepath": "predictors/trained_models/danyoung--eluc-global-rf"
        }
    ]
}
@@ -0,0 +1,94 @@
""" | ||
Class to score predictors given a config on a dataset. | ||
Also a script to demo how it works. | ||
""" | ||
import importlib | ||
import json | ||
from pathlib import Path | ||

import pandas as pd

import data.constants as constants
from data.eluc_data import ELUCData
from persistence.persistors.hf_persistor import HuggingFacePersistor
from predictors.predictor import Predictor
from predictors.scoring.validator import Validator

**Review comment:** Changed name to Scorer to avoid confusion with Evaluation during evolution.

class PredictorScorer:
""" | ||
Scoring class to evaluate predictors on a dataset. | ||
Uses a config to dynamically load predictors. | ||
The config must point to the classpath of a serializer that can call .load() to return a Predictor object. | ||
Alternatively, it may use a HuggingFace url to download a model to a given path, THEN load with the serializer. | ||
""" | ||
def __init__(self, config: dict): | ||
""" | ||
Initializes the Scorer with the custom classes it has to load. | ||
""" | ||
self.predictors = self.dynamically_load_models(config) | ||
# We don't pass change into the outcomes column. | ||
self.validator = Validator(constants.CAO_MAPPING["context"], constants.CAO_MAPPING["actions"], ["ELUC"]) | ||

    def dynamically_load_models(self, config: dict) -> dict[str, Predictor]:
        """
        Uses importlib to dynamically load models from a config.
        Config must have a list of models with the following:
            - type: "hf" or "local" to determine if it is a HuggingFace model or local model.
            - name: name of the serializer class to load.
            - classpath: path to the class that calls .load()
            - filepath: path to the model on disk or where to save the HuggingFace model.
            - (optional) url: url to download the model from HuggingFace.
        Returns a dict with keys being the filepath and values being the Predictor object.
        """
        predictors = {}
        for model in config["models"]:
            # We dynamically instantiate model_instance as some sort of class that can handle
            # .load() and returns a Predictor object.
            spec = importlib.util.spec_from_file_location(model["name"], model["classpath"])
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            model_instance = getattr(module, model["name"])

            # Once we have our model_instance we can load the model from disk or from HuggingFace.
            if model["type"] == "hf":
                persistor = HuggingFacePersistor(model_instance())
                predictor = persistor.from_pretrained(model["url"], local_dir=model["filepath"])
            elif model["type"] == "local":
                predictor = model_instance().load(Path(model["filepath"]))
            else:
                raise ValueError("Model type must be either 'hf' or 'local'")
            predictors[model["filepath"]] = predictor
        return predictors

    def score_models(self, test_df: pd.DataFrame) -> dict[str, float]:
        """
        Scores our list of predictors on a given test dataframe.
        The dataframe is expected to be raw data.
        We sort our results by MAE, ascending.
        """
        y_true = test_df["ELUC"]
        test_df = self.validator.validate_input(test_df)
        results = {}
        for predictor_path, predictor in self.predictors.items():
            outcome_df = predictor.predict(test_df)
            assert self.validator.validate_output(test_df, outcome_df)
            y_pred = outcome_df["ELUC"]
            # Mean absolute error between true and predicted ELUC.
            mae = (y_true - y_pred).abs().mean()
            results[predictor_path] = mae
        results = dict(sorted(results.items(), key=lambda item: item[1]))
        return results

def run_scoring():
    """
    A demo script to show how the PredictorScorer class works.
    """
    print("Evaluating models in config.json...")
    with open(Path("predictors/scoring/config.json"), "r", encoding="utf-8") as config_file:
        config = json.load(config_file)
    scorer = PredictorScorer(config)
    dataset = ELUCData.from_hf()
    results = scorer.score_models(dataset.test_df)
    print("Results:")
    print(results)


if __name__ == "__main__":
    run_scoring()
@@ -0,0 +1,42 @@
""" | ||
Validation of input and output dataframes for predictor scoring. | ||
""" | ||
import pandas as pd | ||
|
||
class Validator(): | ||
""" | ||
Validates input and output dataframes for predictor scoring. | ||
Context, actions, outcomes do not necessarily have to match the project's CAO_MAPPING. For example, if we are | ||
just scoring on ELUC we can just pass the single column as outcomes. | ||
""" | ||
def __init__(self, context: list[str], actions: list[str], outcomes: list[str]): | ||
self.context = context | ||
self.actions = actions | ||
self.outcomes = outcomes | ||

    def validate_input(self, context_actions_df: pd.DataFrame) -> pd.DataFrame:
        """
        Verifies all the context and actions columns are in context_actions_df.
        Then removes outcomes from context_actions_df and returns a deep copy of it.
        """
        if not set(self.context + self.actions) <= set(context_actions_df.columns):
            not_seen = set(self.context + self.actions) - set(context_actions_df.columns)
            raise ValueError(f"Columns {not_seen} not found in input dataframe.")

        # Drop any outcome columns that leaked into the input so predictors never see them.
        seen_outcomes = [col for col in self.outcomes if col in context_actions_df.columns]
        return context_actions_df.drop(columns=seen_outcomes).copy()

    def validate_output(self, context_actions_df: pd.DataFrame, outcomes_df: pd.DataFrame) -> bool:
        """
        Makes sure the index of context_actions_df and outcomes_df match so we can compute metrics like MAE.
        Also checks that all outcomes are present in outcomes_df.
        """
        if not context_actions_df.index.equals(outcomes_df.index):
            raise ValueError("Index of context_actions_df and outcomes_df do not match.")

        if not set(self.outcomes) == set(outcomes_df.columns):
            not_seen = set(self.outcomes) - set(outcomes_df.columns)
            raise ValueError(f"Outcomes {not_seen} not found in output dataframe.")

        return True
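A small usage sketch (the column names here are hypothetical; real runs use the CAO_MAPPING columns):

```python
import pandas as pd

validator = Validator(context=["crop"], actions=["crop_diff"], outcomes=["ELUC"])

df = pd.DataFrame({"crop": [0.5], "crop_diff": [-0.1], "ELUC": [1.0]})
clean_df = validator.validate_input(df)  # the "ELUC" column is dropped
outcomes_df = pd.DataFrame({"ELUC": [0.9]}, index=df.index)
assert validator.validate_output(clean_df, outcomes_df)  # True when aligned
```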

**Review comment:** Had to rename this folder because it was getting confused with the real sklearn library. It took me an hour to debug this!!