
Commit

Merged lint branch in so we pass lint test?
danyoungday committed Mar 22, 2024
2 parents 0fe5d78 + 4b85488 commit 2a86f5b
Showing 24 changed files with 165 additions and 130 deletions.
10 changes: 10 additions & 0 deletions use_cases/eluc/.pylintrc
@@ -0,0 +1,10 @@
[MASTER]
ignore=demo

jobs=0

max-line-length=120

suggestion-mode=yes

good-names=X_train, X_val, X_test, y_train, y_val, y_test, X, Y, y, X_test_scaled
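To reproduce this lint run locally, here is a minimal sketch using pylint's programmatic entry point (paths assume the repository root; attribute names as in recent pylint releases):

from pylint.lint import Run

# Lint the eluc package against the new rcfile; exit=False keeps the
# interpreter alive so the results object can be inspected.
results = Run(["--rcfile=use_cases/eluc/.pylintrc", "use_cases/eluc"], exit=False)
print(results.linter.stats.global_note)  # overall pylint score out of 10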
2 changes: 1 addition & 1 deletion use_cases/eluc/README.md
@@ -23,7 +23,7 @@ BLUE simulations with committed emissions could be used to estimate the long-ter
"Committed emissions" means all the emissions that are caused by a land-use change event are attributed to the year
of the event.
BLUE (bookkeeping of land use emissions) is a bookkeeping model that attributes carbon fluxes to land use activities.
-See [BLUE: Bookkeeping of land use emissions](https://doi.org/10.1002/2014GB004997) for more details.
+See [BLUE: Bookkeeping of land use emissions](https://doi.org/10.1002/2014GB004997) for more details.

### LUC

7 changes: 4 additions & 3 deletions use_cases/eluc/data/constants.py
@@ -9,8 +9,8 @@
CODES_PATH = "data/codes.csv"

# Different variations of land-use change columns
-LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per',
-'pastr', 'primf', 'primn',
+LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per',
+'pastr', 'primf', 'primn',
'range', 'secdf', 'secdn', 'urban']
CROP_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per']
LAND_USE_COLS = ["crop"] + [col for col in LAND_USE_COLS if col not in CROP_COLS]
@@ -29,7 +29,8 @@

# ["United Kingdom", "France", "Germany", "Netherlands", "Belgium", "Switzerland", "Ireland"]
EU_COUNTRIES = ["GB", "FR", "DE", "NL", "BE", "CH", "IE"]
# ["Brazil", "Bolivia", "Paraguay", "Peru", "Ecuador", "Colombia", "Venezuela", "Guyana", "Suriname", "Uruguay", "Argentina", "Chile"]
# ["Brazil", "Bolivia", "Paraguay", "Peru", "Ecuador", "Colombia",
# "Venezuela", "Guyana", "Suriname", "Uruguay", "Argentina", "Chile"]
SA_COUNTRIES = ["BR", "BO", "PY", "PE", "EC", "CO", "VE", "GY", "SR", "UY", "AR", "CL"]
# ["United States"]
US_COUNTRIES = ["US"]
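As a quick check on the crop-merge in the first hunk above, evaluating the two assignments gives:

LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per', 'c4ann', 'c4per',
                 'pastr', 'primf', 'primn',
                 'range', 'secdf', 'secdn', 'urban']
CROP_COLS = ['c3ann', 'c3nfx', 'c3per', 'c4ann', 'c4per']
LAND_USE_COLS = ["crop"] + [col for col in LAND_USE_COLS if col not in CROP_COLS]
# -> ['crop', 'pastr', 'primf', 'primn', 'range', 'secdf', 'secdn', 'urban']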
4 changes: 2 additions & 2 deletions use_cases/eluc/data/conversion.py
@@ -9,7 +9,7 @@
from data import constants

# TODO: Note: This table is not perfect and has some errors,
-# we should consider manually fixing them. I tried my best but
+# we should consider manually fixing them. I tried my best but
# I'm not 100% sure it's correct.
MANUAL_MAP = {
"INDO": 360,
@@ -57,7 +57,7 @@ def construct_countries_df():
# Replace all the bad codes with their real ones
for i in range(len(countries_df)):
old_abbrev = countries_df.iloc[i]["abbrevs"]
-if old_abbrev in MANUAL_MAP.keys() and MANUAL_MAP[old_abbrev] in codes_df["Numeric code"].unique():
+if old_abbrev in MANUAL_MAP and MANUAL_MAP[old_abbrev] in codes_df["Numeric code"].unique():
countries_df.iloc[i]["abbrevs"] = codes_df[codes_df["Numeric code"] == MANUAL_MAP[old_abbrev]]["Alpha-2 code"].iloc[0]

return countries_df
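A hypothetical spot check of the mapping above ("INDO" is remapped to numeric code 360, i.e. Indonesia, whose alpha-2 code is "ID"; column names as in the diff):

countries_df = construct_countries_df()
row = countries_df[countries_df["names"] == "Indonesia"]
print(row["abbrevs"].iloc[0])  # expected "ID" once the manual fix is applied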
47 changes: 26 additions & 21 deletions use_cases/eluc/data/eluc_data.py
@@ -42,9 +42,9 @@ def encode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
if min_val == max_val:
new_df[col] = 0
else:
-new_df[col] = (new_df[col] - self.fields[col]["range"][0]) / (self.fields[col]["range"][1] - self.fields[col]["range"][0])
+new_df[col] = (new_df[col] - min_val) / (max_val - min_val)
return new_df

def decode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Decodes a dataframe using the fields given in the constructor.
@@ -53,7 +53,9 @@ def decode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
new_df = df.copy()
for col in new_df.columns:
if col in self.fields:
-new_df[col] = new_df[col] * (self.fields[col]["range"][1] - self.fields[col]["range"][0]) + self.fields[col]["range"][0]
+min_val = self.fields[col]["range"][0]
+max_val = self.fields[col]["range"][1]
+new_df[col] = new_df[col] * (max_val - min_val) + min_val
return new_df
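To make the refactored min-max scaling concrete, a condensed, self-contained sketch of the encode/decode round trip (field ranges are illustrative, not the dataset's real bounds):

import pandas as pd

fields = {"crop": {"range": [0.0, 1.0]}, "ELUC": {"range": [-5.0, 5.0]}}
df = pd.DataFrame({"crop": [0.2, 0.8], "ELUC": [-2.5, 5.0]})

def encode_as_df(df: pd.DataFrame) -> pd.DataFrame:
    new_df = df.copy()
    for col in new_df.columns:
        min_val, max_val = fields[col]["range"]
        # Constant columns collapse to 0 to avoid division by zero
        new_df[col] = 0 if min_val == max_val else (new_df[col] - min_val) / (max_val - min_val)
    return new_df

def decode_as_df(df: pd.DataFrame) -> pd.DataFrame:
    new_df = df.copy()
    for col in new_df.columns:
        min_val, max_val = fields[col]["range"]
        new_df[col] = new_df[col] * (max_val - min_val) + min_val
    return new_df

assert decode_as_df(encode_as_df(df)).equals(df)  # lossless round trip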


@@ -87,22 +89,23 @@ def get_encoded_train(self):
if self.encoded_train_df is None:
self.encoded_train_df = self.encoder.encode_as_df(self.train_df)
return self.encoded_train_df

def get_encoded_test(self):
"""
Same as above but for test data.
"""
if self.encoded_test_df is None:
self.encoded_test_df = self.encoder.encode_as_df(self.test_df)
return self.encoded_test_df

def get_fields(self) -> dict:
"""
Creates fields json object for the data encoder/prescriptor.
"""
-fields_df = self.train_df[constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]].astype("float64")
-fields = dict()
-for col in constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]:
+cao_cols = constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]
+fields_df = self.train_df[cao_cols].astype("float64")
+fields = {}
+for col in cao_cols:
# Set range of land and diff land uses manually to their true ranges because they
# do not need to be scaled
if col in constants.LAND_USE_COLS:
@@ -132,28 +135,27 @@ def get_fields(self) -> dict:
"valued": "CONTINUOUS"
}

-return fields
+return fields
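For orientation, the fields object assembled above maps each context/action column plus ELUC to a range and a value type; an illustrative entry (bounds invented):

fields = {
    "crop": {"range": [0.0, 1.0], "valued": "CONTINUOUS"},
    "ELUC": {"range": [-38.2, 41.6], "valued": "CONTINUOUS"},  # bounds invented
}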

def push_to_hf(self, repo_path, commit_message, token=None):
"""
Pushes data to huggingface repo. Don't use this unless you're sure you want to update it!
:param repo_path: Path to huggingface repo.
"""

whole_df = pd.concat([self.train_df, self.test_df])
# We get the indices as columns anyways so we can drop them
whole_df = whole_df.drop(["lat", "lon", "time"], axis=1)
ds = Dataset.from_pandas(whole_df)
if not token:
token = os.getenv("HF_TOKEN")
ds.push_to_hub(repo_path, commit_message=commit_message, token=token)
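A hypothetical invocation of the push helper above (the repo path is made up, and a valid HF_TOKEN must be in the environment or passed explicitly; pushing overwrites the hosted dataset, so use with care):

data = ELUCData(start_year=1851, test_year=2012, end_year=2022)
data.push_to_hf("your-org/eluc-dataset", "Re-upload train/test data")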


class ELUCData(AbstractData):
"""
Loads ELUC data from HuggingFace repo and processes it.
"""

def __init__(self, start_year=1851, test_year=2012, end_year=2022, countries=None):
"""
If update_path is given, load raw data the old way using 2 files that are merged.
@@ -169,12 +171,13 @@ def __init__(self, start_year=1851, test_year=2012, end_year=2022, countries=None):

self.train_df = df.loc[start_year:test_year-1]
self.test_df = df.loc[test_year:end_year-1]

self.encoder = ELUCEncoder(self.get_fields())

def hf_to_df(self, hf_repo):
"""
-Loads dataset from huggingface, converts to pandas, then sets indices appropriately to time/lat/lon.
+Loads dataset from huggingface, converts to pandas, then sets indices
+appropriately to time/lat/lon.
Keep old time/lat/lon columns so we can use them as features later.
"""
ds = load_dataset(hf_repo)["train"]
@@ -194,7 +197,7 @@ def __init__(self, path, update_path, start_year=1851, test_year=2012, end_year=

self.train_df = df.loc[start_year:test_year-1]
self.test_df = df.loc[test_year:end_year-1]

self.encoder = ELUCEncoder(self.get_fields())

def import_data(self, path, update_path):
@@ -217,15 +220,17 @@ def import_data(self, path, update_path):
raw = raw.merge(eluc)

# Shift actions back a year
-raw_diffs = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per', 'pastr', 'primf', 'primn', 'range', 'secdf', 'secdn', 'urban']
+raw_diffs = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per',
+'pastr', 'primf', 'primn', 'range',
+'secdf', 'secdn', 'urban']
raw_diffs = [f"{col}_diff" for col in raw_diffs]
raw[raw_diffs] = raw[raw_diffs].shift(time=-1)

# Finds country for each cell using lat/lon coordinates
country_mask = regionmask.defined_regions.natural_earth_v5_0_0.countries_110.mask(raw)
raw["country"] = country_mask
return raw
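The "shift actions back a year" step relies on xarray's label-based shift; a toy sketch of the behavior on a single diff column:

import numpy as np
import xarray as xr

ds = xr.Dataset({"c3ann_diff": ("time", np.array([0.1, 0.2, 0.3, 0.4]))},
                coords={"time": [2000, 2001, 2002, 2003]})
shifted = ds.shift(time=-1)
# Each year now holds the following year's diff; the final year becomes NaN.
print(shifted["c3ann_diff"].values)  # [0.2 0.3 0.4 nan]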

def da_to_df(self, da: xr.DataArray, start_year=None, end_year=None, countries=None) -> pd.DataFrame:
"""
Converts an xarray DataArray to a pandas DataFrame.
@@ -259,10 +264,10 @@ def da_to_df(self, da: xr.DataArray, start_year=None, end_year=None, countries=None) -> pd.DataFrame:
# Merge crops into one column because BLUE model doesn't differentiate
df["crop"] = df[constants.CROP_COLS].sum(axis=1)
df["crop_diff"] = df[[f"{c}_diff" for c in constants.CROP_COLS]].sum(axis=1)

df['country_name'] = self.countries_df.loc[df['country'], 'names'].values

# Drop this column we used for preprocessing (?)
df = df.drop("mask", axis=1)

return df
4 changes: 2 additions & 2 deletions use_cases/eluc/data/torch_data.py
@@ -15,7 +15,7 @@ class TorchDataset(Dataset):
:param y: labels
"""
def __init__(self, X: np.ndarray, y: np.ndarray, device="cpu"):
-super().__init__()
+super().__init__()
self.X = torch.tensor(X, dtype=torch.float32, device=device)
self.y = torch.tensor(y, device=device)
assert len(self.X) == len(self.y), "X and y must have the same length"
@@ -24,4 +24,4 @@ def __len__(self):
return len(self.X)

def __getitem__(self, idx: int) -> tuple:
-return self.X[idx], self.y[idx]
+return self.X[idx], self.y[idx]
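For context, a minimal usage sketch of TorchDataset with a standard DataLoader (shapes arbitrary):

import numpy as np
from torch.utils.data import DataLoader

X = np.random.rand(100, 12).astype(np.float32)  # e.g. 12 land-use features
y = np.random.rand(100).astype(np.float32)
loader = DataLoader(TorchDataset(X, y), batch_size=32, shuffle=True)
for batch_X, batch_y in loader:
    print(batch_X.shape, batch_y.shape)  # torch.Size([32, 12]) torch.Size([32])
    break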
Empty file.
66 changes: 41 additions & 25 deletions use_cases/eluc/predictors/neural_network/neural_net_predictor.py
@@ -79,25 +79,37 @@ class NeuralNetPredictor(Predictor):
in order to take advantage of the linear relationship in the data.
Data is automatically standardized and the scaler is saved with the model.
"""
-def __init__(self, features=None, label=None, hidden_sizes=[4096], linear_skip=True, dropout=0, device="cpu",
-epochs=3, batch_size=2048, optim_params={}, train_pct=1, step_lr_params={"step_size": 1, "gamma": 0.1}):
-# Model setup params
+def __init__(self, features=None, label=None, hidden_sizes=[4096], linear_skip=True,
+dropout=0, device="mps", epochs=3, batch_size=2048, optim_params={},
+train_pct=1, step_lr_params={"step_size": 1, "gamma": 0.1}):

+self.features=None
+self.label=None
+
+self.set_params(features, label, hidden_sizes, linear_skip,
+dropout, device, epochs, batch_size, optim_params,
+train_pct, step_lr_params)
+
+self.model = None
+self.scaler = StandardScaler()
+
+def set_params(self, features, label, hidden_sizes, linear_skip,
+dropout, device, epochs, batch_size, optim_params,
+train_pct, step_lr_params):
+"""
+Set all the parameters for the neural network.
+"""
self.features = features
self.label = label
self.hidden_sizes = hidden_sizes
self.linear_skip = linear_skip
self.dropout = dropout
self.device = device

# Training params
-self.scaler = StandardScaler()
self.epochs = epochs
self.batch_size = batch_size
self.optim_params = optim_params
self.train_pct = train_pct
self.step_lr_params = step_lr_params


def load(self, path: str):
"""
@@ -107,11 +119,11 @@ def load(self, path: str):
load_path = Path(path)
if not load_path.exists():
raise FileNotFoundError(f"Path {path} does not exist.")

# Initialize model with config
-with open(load_path / "config.json", "r", encoding="utf-8") as f:
-config = json.load(f)
-self.__init__(**config)
+with open(load_path / "config.json", "r", encoding="utf-8") as file:
+config = json.load(file)
+self.set_params(**config)

self.model = ELUCNeuralNet(len(self.features), self.hidden_sizes, self.linear_skip, self.dropout)
self.model.load_state_dict(torch.load(load_path / "model.pt"))
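Switching load() from re-running __init__ to an explicit set_params keeps construction-time state such as the model and scaler from being reset; a hypothetical save/load round trip (fit/predict data and the save path are assumptions):

nn = NeuralNetPredictor(features=["crop", "crop_diff"], label="ELUC", hidden_sizes=[64])
nn.fit(X_train, y_train)           # X_train/y_train prepared elsewhere
nn.save("predictors/nn_test")      # hypothetical path

loaded = NeuralNetPredictor()
loaded.load("predictors/nn_test")  # config.json -> set_params, then weights + scaler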
@@ -144,13 +156,16 @@ def save(self, path: str):
"train_pct": self.train_pct,
"step_lr_params": self.step_lr_params
}
-with open(save_path / "config.json", "w", encoding="utf-8") as f:
-json.dump(config, f)
+with open(save_path / "config.json", "w", encoding="utf-8") as file:
+json.dump(config, file)
torch.save(self.model.state_dict(), save_path / "model.pt")
joblib.dump(self.scaler, save_path / "scaler.joblib")


-def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None, X_test=None, y_test=None, log_path=None, verbose=False) -> dict:
+def fit(self, X_train: pd.DataFrame, y_train: pd.Series,
+X_val=None, y_val=None,
+X_test=None, y_test=None,
+log_path=None, verbose=False) -> dict:
"""
Fits neural network to given data using predefined parameters and hyperparameters.
If no features were specified we use all the columns in X_train.
@@ -164,7 +179,8 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None,
:param y_test: test labels.
:param log_path: path to log training data to tensorboard.
:param verbose: whether to print progress bars.
-:return: dictionary of results from training containing time taken, best epoch, best loss, and test loss if applicable.
+:return: dictionary of results from training containing time taken, best epoch, best loss,
+and test loss if applicable.
"""
if not self.features:
self.features = X_train.columns.tolist()
@@ -174,7 +190,7 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None,
self.model.to(self.device)
self.model.train()

-s = time.time()
+start = time.time()

# Set up train set
X_train = self.scaler.fit_transform(X_train[self.features])
@@ -203,7 +219,7 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None,
result_dict = {}
best_model = None
best_loss = np.inf
-e = 0
+end = 0

step = 0
for epoch in range(self.epochs):
@@ -220,7 +236,7 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None,
step += 1
loss.backward()
optimizer.step()

# LR Decay
if self.step_lr_params:
scheduler.step()
@@ -235,25 +251,25 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None,
out = self.model(X)
loss = loss_fn(out.squeeze(), y.squeeze())
total += loss.item() * y.shape[0]

if log_path:
writer.add_scalar("val_loss", total / len(val_ds), step)

if total < best_loss:
best_model = copy.deepcopy(self.model.state_dict())
best_loss = total
-e = time.time()
+end = time.time()
result_dict["best_epoch"] = epoch
result_dict["best_loss"] = total / len(val_ds)
result_dict["time"] = e - s
result_dict["time"] = end - start

print(f"epoch {epoch} mae {total / len(val_ds)}")

if best_model:
self.model.load_state_dict(best_model)
else:
-e = time.time()
-result_dict["time"] = e - s
+end = time.time()
+result_dict["time"] = end - start

# If we provide a test dataset
if X_test is not None and y_test is not None:
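The renamed s/e timing variables live inside a fairly standard keep-the-best-weights loop; a stripped-down sketch of that pattern (helper names hypothetical):

import copy
import time

def train_with_checkpointing(model, optimizer, scheduler, epochs, train_one_epoch, validate):
    """Sketch: track wall-clock time and restore the best validation weights."""
    start = time.time()
    best_loss, best_state, result = float("inf"), None, {}
    for epoch in range(epochs):
        train_one_epoch(model, optimizer)  # hypothetical helper
        scheduler.step()                   # StepLR-style decay once per epoch
        val_loss = validate(model)         # hypothetical helper
        if val_loss < best_loss:
            best_loss = val_loss
            best_state = copy.deepcopy(model.state_dict())
            result = {"best_epoch": epoch, "best_loss": val_loss,
                      "time": time.time() - start}
    if best_state is not None:
        model.load_state_dict(best_state)
    else:
        result["time"] = time.time() - start
    return result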
2 changes: 1 addition & 1 deletion use_cases/eluc/predictors/predictor.py
@@ -51,4 +51,4 @@ def load(self, path: str):
"""
Loads a model from a path.
:param path: path to the model
"""
"""
Empty file.
