
Commit

Merged lint branch in so we pass lint test?
danyoungday committed Mar 22, 2024
2 parents 0fe5d78 + 4b85488 commit 2a86f5b
Showing 24 changed files with 165 additions and 130 deletions.
10 changes: 10 additions & 0 deletions use_cases/eluc/.pylintrc
@@ -0,0 +1,10 @@
[MASTER]
ignore=demo

jobs=0

max-line-length=120

suggestion-mode=yes

good-names=X_train, X_val, X_test, y_train, y_val, y_test, X, Y, y, X_test_scaled
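To reproduce this lint run locally, here is a minimal sketch using pylint's programmatic entry point (paths assume the repository root; attribute names as in recent pylint releases):

from pylint.lint import Run

# Lint the eluc package against the new rcfile; exit=False keeps the
# interpreter alive so the results object can be inspected.
results = Run(["--rcfile=use_cases/eluc/.pylintrc", "use_cases/eluc"], exit=False)
print(results.linter.stats.global_note)  # overall pylint score out of 10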
2 changes: 1 addition & 1 deletion use_cases/eluc/README.md
@@ -23,7 +23,7 @@ BLUE simulations with committed emissions could be used to estimate the long-ter
"Committed emissions" means all the emissions that are caused by a land-use change event are attributed to the year
of the event.
BLUE (bookkeeping of land use emissions) is a bookkeeping model that attributes carbon fluxes to land use activities.
-See [BLUE: Bookkeeping of land use emissions](https://doi.org/10.1002/2014GB004997) for more details.
+See [BLUE: Bookkeeping of land use emissions](https://doi.org/10.1002/2014GB004997) for more details.

### LUC

7 changes: 4 additions & 3 deletions use_cases/eluc/data/constants.py
@@ -9,8 +9,8 @@
CODES_PATH = "data/codes.csv"

# Different variations of land-use change columns
-LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per',
-'pastr', 'primf', 'primn',
+LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per',
+'pastr', 'primf', 'primn',
'range', 'secdf', 'secdn', 'urban']
CROP_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per']
LAND_USE_COLS = ["crop"] + [col for col in LAND_USE_COLS if col not in CROP_COLS]
@@ -29,7 +29,8 @@

# ["United Kingdom", "France", "Germany", "Netherlands", "Belgium", "Switzerland", "Ireland"]
EU_COUNTRIES = ["GB", "FR", "DE", "NL", "BE", "CH", "IE"]
# ["Brazil", "Bolivia", "Paraguay", "Peru", "Ecuador", "Colombia", "Venezuela", "Guyana", "Suriname", "Uruguay", "Argentina", "Chile"]
# ["Brazil", "Bolivia", "Paraguay", "Peru", "Ecuador", "Colombia",
# "Venezuela", "Guyana", "Suriname", "Uruguay", "Argentina", "Chile"]
SA_COUNTRIES = ["BR", "BO", "PY", "PE", "EC", "CO", "VE", "GY", "SR", "UY", "AR", "CL"]
# ["United States"]
US_COUNTRIES = ["US"]
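As a quick check on the crop-merge in the first hunk above, evaluating the two assignments gives:

LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per', 'c4ann', 'c4per',
                 'pastr', 'primf', 'primn',
                 'range', 'secdf', 'secdn', 'urban']
CROP_COLS = ['c3ann', 'c3nfx', 'c3per', 'c4ann', 'c4per']
LAND_USE_COLS = ["crop"] + [col for col in LAND_USE_COLS if col not in CROP_COLS]
# -> ['crop', 'pastr', 'primf', 'primn', 'range', 'secdf', 'secdn', 'urban']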
4 changes: 2 additions & 2 deletions use_cases/eluc/data/conversion.py
@@ -9,7 +9,7 @@
from data import constants

# TODO: Note: This table is not perfect and has some errors,
-# we should consider manually fixing them. I tried my best but
+# we should consider manually fixing them. I tried my best but
# I'm not 100% sure it's correct.
MANUAL_MAP = {
"INDO": 360,
@@ -57,7 +57,7 @@ def construct_countries_df():
# Replace all the bad codes with their real ones
for i in range(len(countries_df)):
old_abbrev = countries_df.iloc[i]["abbrevs"]
-if old_abbrev in MANUAL_MAP.keys() and MANUAL_MAP[old_abbrev] in codes_df["Numeric code"].unique():
+if old_abbrev in MANUAL_MAP and MANUAL_MAP[old_abbrev] in codes_df["Numeric code"].unique():
countries_df.iloc[i]["abbrevs"] = codes_df[codes_df["Numeric code"] == MANUAL_MAP[old_abbrev]]["Alpha-2 code"].iloc[0]

return countries_df
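A hypothetical spot check of the mapping above ("INDO" is remapped to numeric code 360, i.e. Indonesia, whose alpha-2 code is "ID"; column names as in the diff):

countries_df = construct_countries_df()
row = countries_df[countries_df["names"] == "Indonesia"]
print(row["abbrevs"].iloc[0])  # expected "ID" once the manual fix is applied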
47 changes: 26 additions & 21 deletions use_cases/eluc/data/eluc_data.py
@@ -42,9 +42,9 @@ def encode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
if min_val == max_val:
new_df[col] = 0
else:
-new_df[col] = (new_df[col] - self.fields[col]["range"][0]) / (self.fields[col]["range"][1] - self.fields[col]["range"][0])
+new_df[col] = (new_df[col] - min_val) / (max_val - min_val)
return new_df

def decode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Decodes a dataframe using the fields given in the constructor.
@@ -53,7 +53,9 @@ def decode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
new_df = df.copy()
for col in new_df.columns:
if col in self.fields:
-new_df[col] = new_df[col] * (self.fields[col]["range"][1] - self.fields[col]["range"][0]) + self.fields[col]["range"][0]
+min_val = self.fields[col]["range"][0]
+max_val = self.fields[col]["range"][1]
+new_df[col] = new_df[col] * (max_val - min_val) + min_val
return new_df
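To make the refactored min-max scaling concrete, a condensed, self-contained sketch of the encode/decode round trip (field ranges are illustrative, not the dataset's real bounds):

import pandas as pd

fields = {"crop": {"range": [0.0, 1.0]}, "ELUC": {"range": [-5.0, 5.0]}}
df = pd.DataFrame({"crop": [0.2, 0.8], "ELUC": [-2.5, 5.0]})

def encode_as_df(df: pd.DataFrame) -> pd.DataFrame:
    new_df = df.copy()
    for col in new_df.columns:
        min_val, max_val = fields[col]["range"]
        # Constant columns collapse to 0 to avoid division by zero
        new_df[col] = 0 if min_val == max_val else (new_df[col] - min_val) / (max_val - min_val)
    return new_df

def decode_as_df(df: pd.DataFrame) -> pd.DataFrame:
    new_df = df.copy()
    for col in new_df.columns:
        min_val, max_val = fields[col]["range"]
        new_df[col] = new_df[col] * (max_val - min_val) + min_val
    return new_df

assert decode_as_df(encode_as_df(df)).equals(df)  # lossless round trip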


@@ -87,22 +89,23 @@ def get_encoded_train(self):
if self.encoded_train_df is None:
self.encoded_train_df = self.encoder.encode_as_df(self.train_df)
return self.encoded_train_df

def get_encoded_test(self):
"""
Same as above but for test data.
"""
if self.encoded_test_df is None:
self.encoded_test_df = self.encoder.encode_as_df(self.test_df)
return self.encoded_test_df

def get_fields(self) -> dict:
"""
Creates fields json object for the data encoder/prescriptor.
"""
-fields_df = self.train_df[constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]].astype("float64")
-fields = dict()
-for col in constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]:
+cao_cols = constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]
+fields_df = self.train_df[cao_cols].astype("float64")
+fields = {}
+for col in cao_cols:
# Set range of land and diff land uses manually to their true ranges because they
# do not need to be scaled
if col in constants.LAND_USE_COLS:
@@ -132,28 +135,27 @@ def get_fields(self) -> dict:
"valued": "CONTINUOUS"
}

-return fields
+return fields
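For orientation, the fields object assembled above maps each context/action column plus ELUC to a range and a value type; an illustrative entry (bounds invented):

fields = {
    "crop": {"range": [0.0, 1.0], "valued": "CONTINUOUS"},
    "ELUC": {"range": [-38.2, 41.6], "valued": "CONTINUOUS"},  # bounds invented
}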

def push_to_hf(self, repo_path, commit_message, token=None):
"""
Pushes data to huggingface repo. Don't use this unless you're sure you want to update it!
:param repo_path: Path to huggingface repo.
"""

whole_df = pd.concat([self.train_df, self.test_df])
# We get the indices as columns anyways so we can drop them
whole_df = whole_df.drop(["lat", "lon", "time"], axis=1)
ds = Dataset.from_pandas(whole_df)
if not token:
token = os.getenv("HF_TOKEN")
ds.push_to_hub(repo_path, commit_message=commit_message, token=token)
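A hypothetical invocation of the push helper above (the repo path is made up, and a valid HF_TOKEN must be in the environment or passed explicitly; pushing overwrites the hosted dataset, so use with care):

data = ELUCData(start_year=1851, test_year=2012, end_year=2022)
data.push_to_hf("your-org/eluc-dataset", "Re-upload train/test data")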


class ELUCData(AbstractData):
"""
Loads ELUC data from HuggingFace repo and processes it.
"""

def __init__(self, start_year=1851, test_year=2012, end_year=2022, countries=None):
"""
If update_path is given, load raw data the old way using 2 files that are merged.
@@ -169,12 +171,13 @@ def __init__(self, start_year=1851, test_year=2012, end_year=2022, countries=None):

self.train_df = df.loc[start_year:test_year-1]
self.test_df = df.loc[test_year:end_year-1]

self.encoder = ELUCEncoder(self.get_fields())

def hf_to_df(self, hf_repo):
"""
-Loads dataset from huggingface, converts to pandas, then sets indices appropriately to time/lat/lon.
+Loads dataset from huggingface, converts to pandas, then sets indices
+appropriately to time/lat/lon.
Keep old time/lat/lon columns so we can use them as features later.
"""
ds = load_dataset(hf_repo)["train"]
@@ -194,7 +197,7 @@ def __init__(self, path, update_path, start_year=1851, test_year=2012, end_year=

self.train_df = df.loc[start_year:test_year-1]
self.test_df = df.loc[test_year:end_year-1]

self.encoder = ELUCEncoder(self.get_fields())

def import_data(self, path, update_path):
@@ -217,15 +220,17 @@ def import_data(self, path, update_path):
raw = raw.merge(eluc)

# Shift actions back a year
-raw_diffs = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per', 'pastr', 'primf', 'primn', 'range', 'secdf', 'secdn', 'urban']
+raw_diffs = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per',
+'pastr', 'primf', 'primn', 'range',
+'secdf', 'secdn', 'urban']
raw_diffs = [f"{col}_diff" for col in raw_diffs]
raw[raw_diffs] = raw[raw_diffs].shift(time=-1)

# Finds country for each cell using lat/lon coordinates
country_mask = regionmask.defined_regions.natural_earth_v5_0_0.countries_110.mask(raw)
raw["country"] = country_mask
return raw
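The "shift actions back a year" step relies on xarray's label-based shift; a toy sketch of the behavior on a single diff column:

import numpy as np
import xarray as xr

ds = xr.Dataset({"c3ann_diff": ("time", np.array([0.1, 0.2, 0.3, 0.4]))},
                coords={"time": [2000, 2001, 2002, 2003]})
shifted = ds.shift(time=-1)
# Each year now holds the following year's diff; the final year becomes NaN.
print(shifted["c3ann_diff"].values)  # [0.2 0.3 0.4 nan]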

def da_to_df(self, da: xr.DataArray, start_year=None, end_year=None, countries=None) -> pd.DataFrame:
"""
Converts an xarray DataArray to a pandas DataFrame.
@@ -259,10 +264,10 @@ def da_to_df(self, da: xr.DataArray, start_year=None, end_year=None, countries=None) -> pd.DataFrame:
# Merge crops into one column because BLUE model doesn't differentiate
df["crop"] = df[constants.CROP_COLS].sum(axis=1)
df["crop_diff"] = df[[f"{c}_diff" for c in constants.CROP_COLS]].sum(axis=1)

df['country_name'] = self.countries_df.loc[df['country'], 'names'].values

# Drop this column we used for preprocessing (?)
df = df.drop("mask", axis=1)

return df
4 changes: 2 additions & 2 deletions use_cases/eluc/data/torch_data.py
@@ -15,7 +15,7 @@ class TorchDataset(Dataset):
:param y: labels
"""
def __init__(self, X: np.ndarray, y: np.ndarray, device="cpu"):
-super().__init__()
+super().__init__()
self.X = torch.tensor(X, dtype=torch.float32, device=device)
self.y = torch.tensor(y, device=device)
assert len(self.X) == len(self.y), "X and y must have the same length"
@@ -24,4 +24,4 @@ def __len__(self):
return len(self.X)

def __getitem__(self, idx: int) -> tuple:
-return self.X[idx], self.y[idx]
+return self.X[idx], self.y[idx]
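For context, a minimal usage sketch of TorchDataset with a standard DataLoader (shapes arbitrary):

import numpy as np
from torch.utils.data import DataLoader

X = np.random.rand(100, 12).astype(np.float32)  # e.g. 12 land-use features
y = np.random.rand(100).astype(np.float32)
loader = DataLoader(TorchDataset(X, y), batch_size=32, shuffle=True)
for batch_X, batch_y in loader:
    print(batch_X.shape, batch_y.shape)  # torch.Size([32, 12]) torch.Size([32])
    break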
Empty file.
66 changes: 41 additions & 25 deletions use_cases/eluc/predictors/neural_network/neural_net_predictor.py
@@ -79,25 +79,37 @@ class NeuralNetPredictor(Predictor):
in order to take advantage of the linear relationship in the data.
Data is automatically standardized and the scaler is saved with the model.
"""
-def __init__(self, features=None, label=None, hidden_sizes=[4096], linear_skip=True, dropout=0, device="cpu",
-epochs=3, batch_size=2048, optim_params={}, train_pct=1, step_lr_params={"step_size": 1, "gamma": 0.1}):
-# Model setup params
+def __init__(self, features=None, label=None, hidden_sizes=[4096], linear_skip=True,
+dropout=0, device="mps", epochs=3, batch_size=2048, optim_params={},
+train_pct=1, step_lr_params={"step_size": 1, "gamma": 0.1}):

+self.features=None
+self.label=None
+
+self.set_params(features, label, hidden_sizes, linear_skip,
+dropout, device, epochs, batch_size, optim_params,
+train_pct, step_lr_params)
+
+self.model = None
+self.scaler = StandardScaler()
+
+def set_params(self, features, label, hidden_sizes, linear_skip,
+dropout, device, epochs, batch_size, optim_params,
+train_pct, step_lr_params):
+"""
+Set all the parameters for the neural network.
+"""
self.features = features
self.label = label
self.hidden_sizes = hidden_sizes
self.linear_skip = linear_skip
self.dropout = dropout
self.device = device

# Training params
-self.scaler = StandardScaler()
self.epochs = epochs
self.batch_size = batch_size
self.optim_params = optim_params
self.train_pct = train_pct
self.step_lr_params = step_lr_params


def load(self, path: str):
"""
@@ -107,11 +119,11 @@ def load(self, path: str):
load_path = Path(path)
if not load_path.exists():
raise FileNotFoundError(f"Path {path} does not exist.")

# Initialize model with config
-with open(load_path / "config.json", "r", encoding="utf-8") as f:
-config = json.load(f)
-self.__init__(**config)
+with open(load_path / "config.json", "r", encoding="utf-8") as file:
+config = json.load(file)
+self.set_params(**config)

self.model = ELUCNeuralNet(len(self.features), self.hidden_sizes, self.linear_skip, self.dropout)
self.model.load_state_dict(torch.load(load_path / "model.pt"))
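Switching load() from re-running __init__ to an explicit set_params keeps construction-time state such as the model and scaler from being reset; a hypothetical save/load round trip (fit/predict data and the save path are assumptions):

nn = NeuralNetPredictor(features=["crop", "crop_diff"], label="ELUC", hidden_sizes=[64])
nn.fit(X_train, y_train)           # X_train/y_train prepared elsewhere
nn.save("predictors/nn_test")      # hypothetical path

loaded = NeuralNetPredictor()
loaded.load("predictors/nn_test")  # config.json -> set_params, then weights + scaler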
@@ -144,13 +156,16 @@ def save(self, path: str):
"train_pct": self.train_pct,
"step_lr_params": self.step_lr_params
}
-with open(save_path / "config.json", "w", encoding="utf-8") as f:
-json.dump(config, f)
+with open(save_path / "config.json", "w", encoding="utf-8") as file:
+json.dump(config, file)
torch.save(self.model.state_dict(), save_path / "model.pt")
joblib.dump(self.scaler, save_path / "scaler.joblib")


-def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None, X_test=None, y_test=None, log_path=None, verbose=False) -> dict:
+def fit(self, X_train: pd.DataFrame, y_train: pd.Series,
+X_val=None, y_val=None,
+X_test=None, y_test=None,
+log_path=None, verbose=False) -> dict:
"""
Fits neural network to given data using predefined parameters and hyperparameters.
If no features were specified we use all the columns in X_train.
@@ -164,7 +179,8 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None,
:param y_test: test labels.
:param log_path: path to log training data to tensorboard.
:param verbose: whether to print progress bars.
-:return: dictionary of results from training containing time taken, best epoch, best loss, and test loss if applicable.
+:return: dictionary of results from training containing time taken, best epoch, best loss,
+and test loss if applicable.
"""
if not self.features:
self.features = X_train.columns.tolist()
@@ -174,7 +190,7 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None,
self.model.to(self.device)
self.model.train()

-s = time.time()
+start = time.time()

# Set up train set
X_train = self.scaler.fit_transform(X_train[self.features])
@@ -203,7 +219,7 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None,
result_dict = {}
best_model = None
best_loss = np.inf
-e = 0
+end = 0

step = 0
for epoch in range(self.epochs):
@@ -220,7 +236,7 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None,
step += 1
loss.backward()
optimizer.step()

# LR Decay
if self.step_lr_params:
scheduler.step()
@@ -235,25 +251,25 @@ def fit(self, X_train: pd.DataFrame, y_train: pd.Series, X_val=None, y_val=None,
out = self.model(X)
loss = loss_fn(out.squeeze(), y.squeeze())
total += loss.item() * y.shape[0]

if log_path:
writer.add_scalar("val_loss", total / len(val_ds), step)

if total < best_loss:
best_model = copy.deepcopy(self.model.state_dict())
best_loss = total
-e = time.time()
+end = time.time()
result_dict["best_epoch"] = epoch
result_dict["best_loss"] = total / len(val_ds)
result_dict["time"] = e - s
result_dict["time"] = end - start

print(f"epoch {epoch} mae {total / len(val_ds)}")

if best_model:
self.model.load_state_dict(best_model)
else:
-e = time.time()
-result_dict["time"] = e - s
+end = time.time()
+result_dict["time"] = end - start

# If we provide a test dataset
if X_test is not None and y_test is not None:
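The renamed s/e timing variables live inside a fairly standard keep-the-best-weights loop; a stripped-down sketch of that pattern (helper names hypothetical):

import copy
import time

def train_with_checkpointing(model, optimizer, scheduler, epochs, train_one_epoch, validate):
    """Sketch: track wall-clock time and restore the best validation weights."""
    start = time.time()
    best_loss, best_state, result = float("inf"), None, {}
    for epoch in range(epochs):
        train_one_epoch(model, optimizer)  # hypothetical helper
        scheduler.step()                   # StepLR-style decay once per epoch
        val_loss = validate(model)         # hypothetical helper
        if val_loss < best_loss:
            best_loss = val_loss
            best_state = copy.deepcopy(model.state_dict())
            result = {"best_epoch": epoch, "best_loss": val_loss,
                      "time": time.time() - start}
    if best_state is not None:
        model.load_state_dict(best_state)
    else:
        result["time"] = time.time() - start
    return result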
2 changes: 1 addition & 1 deletion use_cases/eluc/predictors/predictor.py
@@ -51,4 +51,4 @@ def load(self, path: str):
"""
Loads a model from a path.
:param path: path to the model
"""
"""
Empty file.
