Create yaml file for ELUC use case CI #77

Merged: 12 commits, Mar 22, 2024
36 changes: 36 additions & 0 deletions .github/workflows/eluc.yml
@@ -0,0 +1,36 @@
# This runs the unit tests for the ELUC use case

name: ELUC Use Case

on:
  push:
    branches: [ "main" ]
Member:
Can we run it on all branches? We shouldn't be able to merge if pylint or the tests fail.

Collaborator (Author):
Currently it runs whenever a PR goes into main.
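For comparison, a sketch of the broader trigger the reviewer asks about, not what this PR merges: leaving pull_request without a branch filter runs the checks on pull requests targeting any branch, which is what branch protection needs.

```yaml
# Sketch only: an unfiltered pull_request trigger fires for PRs
# against any branch; pushes still only trigger on main.
on:
  push:
    branches: [ "main" ]
  pull_request:
```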

  pull_request:
    branches: [ "main" ]

jobs:
  build:
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: ./use_cases/eluc
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python 3.10
        uses: actions/setup-python@v3
        with:
          python-version: "3.10"
      - name: Set PYTHONPATH
        run: echo "PYTHONPATH=$PWD" >> $GITHUB_ENV
      - name: Test PYTHONPATH
        run: printenv PYTHONPATH
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pylint
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with PyLint
        run: pylint --ignore="demo" --recursive=y --fail-under=9 ./*
      - name: Run unit tests
        run: python -m unittest
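For contributors who want to reproduce these checks before pushing, a rough local equivalent of the job's steps (a sketch, assuming a POSIX shell at the repository root):

```bash
# Rough local equivalent of the CI job above; assumes you are at the
# repository root and mirrors the workflow's run steps.
cd use_cases/eluc
export PYTHONPATH="$PWD"
python -m pip install --upgrade pip
pip install pylint
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
pylint --ignore="demo" --recursive=y --fail-under=9 ./*
python -m unittest
```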

10 changes: 10 additions & 0 deletions use_cases/eluc/.pylintrc
@@ -0,0 +1,10 @@
[MASTER]
ignore=demo

jobs=0

max-line-length=120

suggestion-mode=yes

good-names=X_train, X_val, X_test, y_train, y_val, y_test, X, Y, y, X_test_scaled
2 changes: 1 addition & 1 deletion use_cases/eluc/README.md
@@ -23,7 +23,7 @@ BLUE simulations with committed emissions could be used to estimate the long-ter
"Committed emissions" means all the emissions that are caused by a land-use change event are attributed to the year
of the event.
BLUE (bookkeeping of land use emissions) is a bookkeeping model that attributes carbon fluxes to land use activities.
See [BLUE: Bookkeeping of land use emissions](https://doi.org/10.1002/2014GB004997) for more details.
See [BLUE: Bookkeeping of land use emissions](https://doi.org/10.1002/2014GB004997) for more details.

### LUC

7 changes: 4 additions & 3 deletions use_cases/eluc/data/constants.py
@@ -9,8 +9,8 @@
CODES_PATH = "data/codes.csv"

# Different variations of land-use change columns
LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per',
'pastr', 'primf', 'primn',
LAND_USE_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per',
'pastr', 'primf', 'primn',
'range', 'secdf', 'secdn', 'urban']
CROP_COLS = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per']
LAND_USE_COLS = ["crop"] + [col for col in LAND_USE_COLS if col not in CROP_COLS]
@@ -29,7 +29,8 @@

# ["United Kingdom", "France", "Germany", "Netherlands", "Belgium", "Switzerland", "Ireland"]
EU_COUNTRIES = ["GB", "FR", "DE", "NL", "BE", "CH", "IE"]
# ["Brazil", "Bolivia", "Paraguay", "Peru", "Ecuador", "Colombia", "Venezuela", "Guyana", "Suriname", "Uruguay", "Argentina", "Chile"]
# ["Brazil", "Bolivia", "Paraguay", "Peru", "Ecuador", "Colombia",
# "Venezuela", "Guyana", "Suriname", "Uruguay", "Argentina", "Chile"]
SA_COUNTRIES = ["BR", "BO", "PY", "PE", "EC", "CO", "VE", "GY", "SR", "UY", "AR", "CL"]
# ["United States"]
US_COUNTRIES = ["US"]
4 changes: 2 additions & 2 deletions use_cases/eluc/data/conversion.py
@@ -9,7 +9,7 @@
from data import constants

# TODO: Note: This table is not perfect and has some errors,
# we should consider manually fixing them. I tried my best but
# we should consider manually fixing them. I tried my best but
# I'm not 100% sure it's correct.
MANUAL_MAP = {
"INDO": 360,
@@ -57,7 +57,7 @@ def construct_countries_df():
# Replace all the bad codes with their real ones
for i in range(len(countries_df)):
old_abbrev = countries_df.iloc[i]["abbrevs"]
if old_abbrev in MANUAL_MAP.keys() and MANUAL_MAP[old_abbrev] in codes_df["Numeric code"].unique():
if old_abbrev in MANUAL_MAP and MANUAL_MAP[old_abbrev] in codes_df["Numeric code"].unique():
countries_df.iloc[i]["abbrevs"] = codes_df[codes_df["Numeric code"] == MANUAL_MAP[old_abbrev]]["Alpha-2 code"].iloc[0]

return countries_df
47 changes: 26 additions & 21 deletions use_cases/eluc/data/eluc_data.py
@@ -42,9 +42,9 @@ def encode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
if min_val == max_val:
new_df[col] = 0
else:
new_df[col] = (new_df[col] - self.fields[col]["range"][0]) / (self.fields[col]["range"][1] - self.fields[col]["range"][0])
new_df[col] = (new_df[col] - min_val) / (max_val - min_val)
return new_df

def decode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Decodes a dataframe using the fields given in the constructor.
@@ -53,7 +53,9 @@ def decode_as_df(self, df: pd.DataFrame) -> pd.DataFrame:
new_df = df.copy()
for col in new_df.columns:
if col in self.fields:
new_df[col] = new_df[col] * (self.fields[col]["range"][1] - self.fields[col]["range"][0]) + self.fields[col]["range"][0]
min_val = self.fields[col]["range"][0]
max_val = self.fields[col]["range"][1]
new_df[col] = new_df[col] * (max_val - min_val) + min_val
return new_df
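The min-max scaling that encode_as_df and decode_as_df implement round-trips exactly; a minimal self-contained sketch, with an invented field name and range:

```python
# Minimal sketch of the min-max round trip. The field name "ELUC" and
# its range here are illustrative values, not the project's real ones.
import pandas as pd

fields = {"ELUC": {"range": [-5.0, 5.0]}}
df = pd.DataFrame({"ELUC": [-5.0, 0.0, 5.0]})

min_val, max_val = fields["ELUC"]["range"]
encoded = (df["ELUC"] - min_val) / (max_val - min_val)  # 0.0, 0.5, 1.0
decoded = encoded * (max_val - min_val) + min_val       # back to the originals
assert (decoded == df["ELUC"]).all()
```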


@@ -87,22 +89,23 @@ def get_encoded_train(self):
if self.encoded_train_df is None:
self.encoded_train_df = self.encoder.encode_as_df(self.train_df)
return self.encoded_train_df

def get_encoded_test(self):
"""
Same as above but for test data.
"""
if self.encoded_test_df is None:
self.encoded_test_df = self.encoder.encode_as_df(self.test_df)
return self.encoded_test_df

def get_fields(self) -> dict:
"""
Creates fields json object for the data encoder/prescriptor.
"""
fields_df = self.train_df[constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]].astype("float64")
fields = dict()
for col in constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]:
cao_cols = constants.CAO_MAPPING["context"] + constants.CAO_MAPPING["actions"] + ["ELUC"]
fields_df = self.train_df[cao_cols].astype("float64")
fields = {}
for col in cao_cols:
# Set range of land and diff land uses manually to their true ranges because they
# do not need to be scaled
if col in constants.LAND_USE_COLS:
@@ -132,28 +135,27 @@
"valued": "CONTINUOUS"
}

return fields
return fields
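For orientation, a hedged sketch of what a consumer of get_fields() can rely on. Only the "range" and "valued" keys are visible in this diff; the field name and numbers below are invented:

```python
# Invented example of the fields dict shape; only "range" and "valued"
# are confirmed by the hunks above, any other keys are unknown here.
fields = {"ELUC": {"range": [-8.0, 12.0], "valued": "CONTINUOUS"}}

for name, field in fields.items():
    low, high = field["range"]
    print(f"{name}: range [{low}, {high}], {field['valued']}")
```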

def push_to_hf(self, repo_path, commit_message, token=None):
"""
Pushes data to huggingface repo. Don't use this unless you're sure you want to update it!
:param repo_path: Path to huggingface repo.
"""

whole_df = pd.concat([self.train_df, self.test_df])
# We get the indices as columns anyways so we can drop them
whole_df = whole_df.drop(["lat", "lon", "time"], axis=1)
ds = Dataset.from_pandas(whole_df)
if not token:
token = os.getenv("HF_TOKEN")
ds.push_to_hub(repo_path, commit_message=commit_message, token=token)
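A hedged usage sketch for push_to_hf: the import path follows this diff's module layout, while the repo path and commit message are placeholders. Note that this really does push if a valid token is available.

```python
# Hypothetical usage. The import assumes the data/ package layout shown
# in this PR; "your-org/eluc-dataset" is a placeholder repo path.
from data.eluc_data import ELUCData

data = ELUCData()  # defaults: start_year=1851, test_year=2012, end_year=2022
# Reads HF_TOKEN from the environment if no token is passed explicitly.
data.push_to_hf("your-org/eluc-dataset", "Update ELUC dataset")
```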


class ELUCData(AbstractData):
"""
Loads ELUC data from HuggingFace repo and processes it.
"""

def __init__(self, start_year=1851, test_year=2012, end_year=2022, countries=None):
"""
If update_path is given, load raw data the old way using 2 files that are merged.
@@ -169,12 +171,13 @@ def __init__(self, start_year=1851, test_year=2012, end_year=2022, countries=None):

self.train_df = df.loc[start_year:test_year-1]
self.test_df = df.loc[test_year:end_year-1]

self.encoder = ELUCEncoder(self.get_fields())

def hf_to_df(self, hf_repo):
"""
Loads dataset from huggingface, converts to pandas, then sets indices appropriately to time/lat/lon.
Loads dataset from huggingface, converts to pandas, then sets indices
appropriately to time/lat/lon.
Keep old time/lat/lon columns so we can use them as features later.
"""
ds = load_dataset(hf_repo)["train"]
@@ -194,7 +197,7 @@ def __init__(self, path, update_path, start_year=1851, test_year=2012, end_year=

self.train_df = df.loc[start_year:test_year-1]
self.test_df = df.loc[test_year:end_year-1]

self.encoder = ELUCEncoder(self.get_fields())

def import_data(self, path, update_path):
@@ -217,15 +220,17 @@ def import_data(self, path, update_path):
raw = raw.merge(eluc)

# Shift actions back a year
raw_diffs = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per', 'pastr', 'primf', 'primn', 'range', 'secdf', 'secdn', 'urban']
raw_diffs = ['c3ann', 'c3nfx', 'c3per','c4ann', 'c4per',
'pastr', 'primf', 'primn', 'range',
'secdf', 'secdn', 'urban']
raw_diffs = [f"{col}_diff" for col in raw_diffs]
raw[raw_diffs] = raw[raw_diffs].shift(time=-1)

# Finds country for each cell using lat/lon coordinates
country_mask = regionmask.defined_regions.natural_earth_v5_0_0.countries_110.mask(raw)
raw["country"] = country_mask
return raw
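To illustrate the "shift actions back a year" step above, a tiny self-contained xarray sketch with synthetic values:

```python
import xarray as xr

# shift(time=-1) moves each value one step earlier along "time";
# the final slot becomes NaN, mirroring how the last year loses its action.
da = xr.DataArray([1.0, 2.0, 3.0], dims="time")
print(da.shift(time=-1).values)  # [ 2.  3. nan]
```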

def da_to_df(self, da: xr.DataArray, start_year=None, end_year=None, countries=None) -> pd.DataFrame:
"""
Converts an xarray DataArray to a pandas DataFrame.
@@ -259,10 +264,10 @@ def da_to_df(self, da: xr.DataArray, start_year=None, end_year=None, countries=None) -> pd.DataFrame:
# Merge crops into one column because BLUE model doesn't differentiate
df["crop"] = df[constants.CROP_COLS].sum(axis=1)
df["crop_diff"] = df[[f"{c}_diff" for c in constants.CROP_COLS]].sum(axis=1)

df['country_name'] = self.countries_df.loc[df['country'], 'names'].values

# Drop this column we used for preprocessing (?)
df = df.drop("mask", axis=1)

return df
4 changes: 2 additions & 2 deletions use_cases/eluc/data/torch_data.py
@@ -15,7 +15,7 @@ class TorchDataset(Dataset):
:param y: labels
"""
def __init__(self, X: np.ndarray, y: np.ndarray, device="cpu"):
super().__init__()
super().__init__()
self.X = torch.tensor(X, dtype=torch.float32, device=device)
self.y = torch.tensor(y, device=device)
assert len(self.X) == len(self.y), "X and y must have the same length"
@@ -24,4 +24,4 @@ def __len__(self):
return len(self.X)

def __getitem__(self, idx: int) -> tuple:
return self.X[idx], self.y[idx]
return self.X[idx], self.y[idx]
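A hedged usage sketch for TorchDataset with synthetic arrays; the import path follows this diff's module layout, and the shapes and batch size are illustrative:

```python
# Hypothetical usage; the import assumes the data/ package layout in this PR.
import numpy as np
from torch.utils.data import DataLoader
from data.torch_data import TorchDataset

X = np.random.rand(16, 4).astype(np.float32)  # synthetic features
y = np.random.rand(16).astype(np.float32)     # synthetic labels

ds = TorchDataset(X, y)
loader = DataLoader(ds, batch_size=4, shuffle=True)
for X_batch, y_batch in loader:
    print(X_batch.shape, y_batch.shape)  # torch.Size([4, 4]) torch.Size([4])
```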