From 173371c144aec27a2fa0db0c4dbf8ed1b4c39103 Mon Sep 17 00:00:00 2001
From: Jacob Bieker
Date: Fri, 2 Jun 2023 10:56:42 +0100
Subject: [PATCH 1/5] Add LSTM model

---
 model/lstm.py | 143 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 model/lstm.py

diff --git a/model/lstm.py b/model/lstm.py
new file mode 100644
index 0000000..d2bdb70
--- /dev/null
+++ b/model/lstm.py
@@ -0,0 +1,143 @@
+"""
+You will want to install the following packages:
+xarray
+pytorch
+xbatcher
+"""
+import numpy as np
+import xarray as xr
+import torch
+import xbatcher
+import regionmask
+import os
+import einops
+
+
+#f = "zip:///::https://huggingface.co/datasets/jacobbieker/project-resilience/resolve/main/merged_aggregated_dataset.zarr.zip"
+#dataset = xr.open_dataset(f, engine='zarr', chunks={})
+#if not os.path.exists("merged_aggregated_dataset.zarr"):
+#    dataset.to_zarr("merged_aggregated_dataset.zarr", consolidated=True, compute=True)
+dataset = xr.open_zarr("../data/merged_aggregated_dataset_1850_2022.zarr.zip", consolidated=True)
+dataset['ELUC'] = dataset['ELUC'].shift(time=1)
+dataset['ELUC_diff'] = dataset['ELUC_diff'].shift(time=1)
+dataset['time'] = dataset.time - 1
+mask = dataset['ELUC_diff'].isnull()
+dataset = dataset.where(~mask, drop=True)
+# Train on countries 0-80 and test on the rest, and also hold out the last 15 years of data for testing
+country_mask = regionmask.defined_regions.natural_earth_v5_0_0.countries_110.mask(dataset)
+LAND_FEATURES = ['c3ann', 'c3nfx', 'c3per', 'c4ann', 'c4per',
+                 'pastr', 'primf', 'primn', 'range', 'secdf', 'secdn', 'urban', 'cell_area']
+
+LAND_DIFF_FEATURES = ['c3ann_diff', 'c3nfx_diff', 'c3per_diff', 'c4ann_diff', 'c4per_diff',
+                      'pastr_diff', 'primf_diff', 'primn_diff', 'range_diff', 'secdf_diff', 'secdn_diff', 'urban_diff']
+
+FEATURES = LAND_FEATURES + LAND_DIFF_FEATURES
+
+LABEL = 'ELUC'
+#print(country_mask)
+
+#print(train_da)
+#print(test_da)
+
+# Simple LSTM model
+class LSTMModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.lstm = torch.nn.LSTM(28, 128, num_layers=2, batch_first=True)
+        self.average = torch.nn.AdaptiveAvgPool1d(1)
+        self.linear = torch.nn.Linear(128, 1)
+
+    def forward(self, x):
+        x, _ = self.lstm(x)
+        x = torch.relu(x)
+        x = self.linear(x)
+        x = torch.relu(x)
+        x = torch.squeeze(x)
+        x = self.average(x)
+        return x
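# A quick shape walkthrough of the forward pass above, as a sketch; the sizes
# assume the 12x10 random lat/lon grid and 9-step history window used in the
# training loop below:
#
#     x = torch.randn(120, 9, 28)  # (lat*lon, time, stacked variables)
#     x, _ = lstm(x)               # -> (120, 9, 128)
#     x = linear(x)                # -> (120, 9, 1)
#     x = torch.squeeze(x)         # -> (120, 9)
#     x = average(x)               # AdaptiveAvgPool1d(1) pools over time -> (120, 1)
#
# Note that torch.squeeze drops every size-1 dimension, so a batch holding a
# single grid cell would collapse to shape (9,) and the pooling layer would
# reject it; the batches built below always contain many cells, so this case
# is never hit.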
+
+
+def train_country_lstm(country_code):
+    test_da = dataset.where(country_mask == country_code, drop=True).where(dataset.time > 2007, drop=True).load()
+    train_da = dataset.where(country_mask == country_code, drop=True).where(dataset.time <= 2007, drop=True).load()  # 143 is the code for the UK
+    model = LSTMModel().cuda()
+    crit = torch.nn.MSELoss()
+    stat = torch.nn.L1Loss()
+    optim = torch.optim.Adam(model.parameters(), lr=0.001)
+
+    train_times = train_da.time.values[11:]
+    for epoch in range(25):
+        np.random.shuffle(train_times)
+        for time in train_times:
+            # Select random lat/lon points for the batch
+            example = train_da.isel(lat=np.random.randint(0, len(train_da.lat.values), 12), lon=np.random.randint(0, len(train_da.lon.values), 10)).sel(time=slice(time-10,time))
+            #country_mask = regionmask.defined_regions.natural_earth_v5_0_0.countries_110.mask(example)
+            # example = example.where(country_mask <= 80, drop=True)
+            # stack all data variables in example into a single data array
+            data_names = list(example.data_vars.keys())
+            example = xr.concat([example[var] for var in example.data_vars], dim='variable')
+            example = example.assign_coords(variable=data_names)
+            example = example.transpose('time', 'lat', 'lon', 'variable')
+            target = example.sel(variable='ELUC').isel(time=9)
+            train = example.isel(time=slice(0,9))
+            #assert np.isfinite(example['ELUC'].values).all()
+            # Convert infinite values and NaN to 0.0
+            train = train.fillna(0.0)
+            target = target.fillna(0.0)
+            torch_example = torch.from_numpy(np.nan_to_num(train.values, posinf=0.0, neginf=0.0))
+            # Flatten with einops to get the shape (batch_size, time, features)
+            torch_example = einops.rearrange(torch_example, 'time lat lon features -> (lat lon) time features')
+            torch_target = torch.unsqueeze(torch.from_numpy(np.nan_to_num(target.values, posinf=0.0, neginf=0.0)), dim=-1)
+            # Flatten with einops to get the shape (batch_size, time, features)
+            torch_target = einops.rearrange(torch_target, 'lat lon features -> (lat lon) features')
+            # Run the model and pass back the loss
+            # zero the parameter gradients
+            optim.zero_grad()
+            # forward + backward + optimize
+            outputs = model(torch_example.cuda())
+            loss = crit(outputs, torch_target.cuda())
+            loss.backward()
+            optim.step()
+            #print(loss.item())
+
+    with torch.no_grad():
+        model.eval()
+        test_mse = 0.0
+        test_mae = 0.0
+        num = 0
+        test_times = test_da.time.values[11:]
+        for time in test_times:
+            for lon in range(0, len(test_da.lon.values), 10):
+                for lat in range(0, len(test_da.lat.values), 10):
+                    example = test_da.isel(lat=slice(lat,lat+10), lon=slice(lon,lon+10)).sel(time=slice(time-10,time))
+                    data_names = list(example.data_vars.keys())
+                    example = xr.concat([example[var] for var in example.data_vars], dim='variable')
+                    example = example.assign_coords(variable=data_names)
+                    example = example.transpose('time', 'lat', 'lon', 'variable')
+                    target = example.sel(variable='ELUC').isel(time=9)
+                    train = example.isel(time=slice(0,9))
+                    #assert np.isfinite(example['ELUC'].values).all()
+                    # Convert infinite values and NaN to 0.0
+                    train = train.fillna(0.0)
+                    target = target.fillna(0.0)
+                    torch_example = torch.from_numpy(np.nan_to_num(train.values, posinf=0.0, neginf=0.0))
+                    # Flatten with einops to get the shape (batch_size, time, features)
+                    torch_example = einops.rearrange(torch_example, 'time lat lon features -> (lat lon) time features')
+                    torch_target = torch.unsqueeze(torch.from_numpy(np.nan_to_num(target.values, posinf=0.0, neginf=0.0)), dim=-1)
+                    # Flatten with einops to get the shape (batch_size, time, features)
+                    torch_target = einops.rearrange(torch_target, 'lat lon features -> (lat lon) features')
+                    num += torch_target.shape[0]
+                    outputs = model(torch_example.cuda())
+                    loss = crit(outputs.cpu(), torch_target)
+                    mae_loss = stat(outputs.cpu(), torch_target)
+                    #print(loss.item())
+                    test_mse += loss.item()
+                    test_mae += mae_loss.item()
+                    # Already averaged, so just need to track num for time, lat, lon
+                    num += 1
+        print(f"Test {country_code} MSE: {test_mse/num}")
+        print(f"Test {country_code} MAE: {test_mae/num}")
+    torch.save(model, f"lstm_model_{country_code}.pt")
+
+train_country_lstm(143)  # UK
+train_country_lstm(29)  # Brazil
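One subtlety in the windowing of this first patch: xarray's sel(time=slice(time - 10, time)) is inclusive on both ends, so each sample actually carries 11 yearly steps. A minimal sketch of how each window is split, using the same names as the loop above:

    # 'example' holds 11 inclusive steps: time-10 ... time
    train = example.isel(time=slice(0, 9))              # steps 0-8: the 9-year history fed to the LSTM
    target = example.sel(variable='ELUC').isel(time=9)  # step 9: the following year's ELUC
    # step 10, the final year of the window, is never used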
"zip:///::https://huggingface.co/datasets/jacobbieker/project-resilience/resolve/main/merged_aggregated_dataset.zarr.zip" @@ -34,18 +35,21 @@ FEATURES = LAND_FEATURES + LAND_DIFF_FEATURES LABEL = 'ELUC' + +ALL_FEATURES = FEATURES + [LABEL] +dataset = dataset[ALL_FEATURES] #print(country_mask) #print(train_da) #print(test_da) # Simple LSTM model -class LSTMModel(torch.nn.Module): - def __init__(self): +class LSTMModel(torch.nn.Module, PyTorchModelHubMixin): + def __init__(self, layers=2, hidden_dim=128, input_dim=26, output_dim=1): super().__init__() - self.lstm = torch.nn.LSTM(28, 128, num_layers=2, batch_first=True) + self.lstm = torch.nn.LSTM(input_dim, hidden_dim, num_layers=layers, batch_first=True) self.average = torch.nn.AdaptiveAvgPool1d(1) - self.linear = torch.nn.Linear(128, 1) + self.linear = torch.nn.Linear(hidden_dim, output_dim) def forward(self, x): x, _ = self.lstm(x) @@ -57,16 +61,23 @@ def forward(self, x): return x +model_config = {"layers": 2, "hidden_dim": 128, "input_dim": 26, "output_dim": 1} def train_country_lstm(country_code): - test_da = dataset.where(country_mask == country_code, drop=True).where(dataset.time > 2007, drop=True).load() - train_da = dataset.where(country_mask == country_code, drop=True).where(dataset.time <= 2007, drop=True).load() # 143 is the code for the UK - model = LSTMModel().cuda() + if len(country_code) == 1: + test_da = dataset.where(country_mask == country_code[0], drop=True).where(dataset.time > 2007, drop=True).load() + train_da = dataset.where(country_mask == country_code[0], drop=True).where(dataset.time <= 2007, drop=True).load() # 143 is the code for the UK + else: + c_mask = xr.DataArray(np.in1d(country_mask, country_code).reshape(country_mask.shape), + dims=country_mask.dims, coords=country_mask.coords) + test_da = dataset.where(c_mask, drop=True).where(dataset.time > 2007, drop=True).load() + train_da = dataset.where(c_mask, drop=True).where(dataset.time <= 2007, drop=True).load() # 143 is the code for the UK + model = LSTMModel(**model_config).cuda() crit = torch.nn.MSELoss() stat = torch.nn.L1Loss() optim = torch.optim.Adam(model.parameters(), lr=0.001) train_times = train_da.time.values[11:] - for epoch in range(25): + for epoch in range(50): np.random.shuffle(train_times) for time in train_times: # Select random lat/lons ones in batch @@ -137,7 +148,15 @@ def train_country_lstm(country_code): num += 1 print(f"Test {country_code} MSE: {test_mse/num}") print(f"Test {country_code} MAE: {test_mae/num}") - torch.save(model, f"lstm_model_{country_code}.pt") + names = [f'{c}_' for c in country_code] + # Join the names together + name = ''.join(names) + torch.save(model, f"lstm_model_{name}.pt") + # Save the weights to huggingface + model.save_pretrained(f"lstm_model_{name}", push_to_hub=True, repo_name=f"project_resilience_lstm_model_{name}", use_auth_token=None, config=model_config) -train_country_lstm(143) # UK -train_country_lstm(29) # Brazil +train_country_lstm([143, 29]) # UK and India +train_country_lstm([143]) # UK +train_country_lstm([29]) # Brazil +# Check ELUC when no change +# Send over weights, put them on HF and integrate that as well as config options From 080a265eea21a95d67ff25fc9136d37cb0b41b36 Mon Sep 17 00:00:00 2001 From: Jacob Bieker Date: Sun, 4 Jun 2023 16:25:13 +0100 Subject: [PATCH 3/5] Minor fix --- model/lstm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model/lstm.py b/model/lstm.py index 46f5bc4..7e8655d 100644 --- a/model/lstm.py +++ b/model/lstm.py @@ -153,7 +153,7 @@ def 
From 080a265eea21a95d67ff25fc9136d37cb0b41b36 Mon Sep 17 00:00:00 2001
From: Jacob Bieker
Date: Sun, 4 Jun 2023 16:25:13 +0100
Subject: [PATCH 3/5] Minor fix

---
 model/lstm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model/lstm.py b/model/lstm.py
index 46f5bc4..7e8655d 100644
--- a/model/lstm.py
+++ b/model/lstm.py
@@ -153,7 +153,7 @@ def train_country_lstm(country_code):
     name = ''.join(names)
     torch.save(model, f"lstm_model_{name}.pt")
     # Save the weights to huggingface
-    model.save_pretrained(f"lstm_model_{name}", push_to_hub=True, repo_name=f"project_resilience_lstm_model_{name}", use_auth_token=None, config=model_config)
+    model.save_pretrained(f"project_resilience_lstm_model_{name}", push_to_hub=True, config=model_config)
 
 train_country_lstm([143, 29])  # UK and Brazil
 train_country_lstm([143])  # UK

From 95785f631666e02a6663ea8755a3dc074507e123 Mon Sep 17 00:00:00 2001
From: Jacob Bieker
Date: Sun, 4 Jun 2023 16:50:26 +0100
Subject: [PATCH 4/5] Auto set device

---
 model/lstm.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/model/lstm.py b/model/lstm.py
index 7e8655d..4d350a7 100644
--- a/model/lstm.py
+++ b/model/lstm.py
@@ -62,6 +62,7 @@ def forward(self, x):
 
 
 model_config = {"layers": 2, "hidden_dim": 128, "input_dim": 26, "output_dim": 1}
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def train_country_lstm(country_code):
     if len(country_code) == 1:
         test_da = dataset.where(country_mask == country_code[0], drop=True).where(dataset.time > 2007, drop=True).load()
@@ -71,7 +72,7 @@ def train_country_lstm(country_code):
                               dims=country_mask.dims, coords=country_mask.coords)
         test_da = dataset.where(c_mask, drop=True).where(dataset.time > 2007, drop=True).load()
         train_da = dataset.where(c_mask, drop=True).where(dataset.time <= 2007, drop=True).load()  # 143 is the code for the UK
-    model = LSTMModel(**model_config).cuda()
+    model = LSTMModel(**model_config).to(device)
     crit = torch.nn.MSELoss()
     stat = torch.nn.L1Loss()
     optim = torch.optim.Adam(model.parameters(), lr=0.001)
@@ -105,8 +106,8 @@ def train_country_lstm(country_code):
             # zero the parameter gradients
             optim.zero_grad()
             # forward + backward + optimize
-            outputs = model(torch_example.cuda())
-            loss = crit(outputs, torch_target.cuda())
+            outputs = model(torch_example.to(device))
+            loss = crit(outputs, torch_target.to(device))
             loss.backward()
             optim.step()
             #print(loss.item())
@@ -138,7 +139,7 @@ def train_country_lstm(country_code):
             # Flatten with einops to get the shape (batch_size, time, features)
             torch_target = einops.rearrange(torch_target, 'lat lon features -> (lat lon) features')
             num += torch_target.shape[0]
-            outputs = model(torch_example.cuda())
+            outputs = model(torch_example.to(device))
             loss = crit(outputs.cpu(), torch_target)
             mae_loss = stat(outputs.cpu(), torch_target)
             #print(loss.item())
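One caveat with the torch.save calls above: they pickle the whole module rather than just a state dict, so loading the .pt file later requires the LSTMModel class to be importable under the same module path. A minimal loading sketch, with an illustrative filename:

    import torch

    model = torch.load("lstm_model_143_.pt", map_location="cpu")
    model.eval()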
From 3c09283ce429411e4de0a1d99f84b67e686cc28c Mon Sep 17 00:00:00 2001
From: Jacob Bieker
Date: Fri, 9 Jun 2023 14:16:56 +0100
Subject: [PATCH 5/5] Update script for testing with no change

---
 model/lstm.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 73 insertions(+), 8 deletions(-)

diff --git a/model/lstm.py b/model/lstm.py
index 4d350a7..e8c29ff 100644
--- a/model/lstm.py
+++ b/model/lstm.py
@@ -45,7 +45,7 @@
 
 # Simple LSTM model
 class LSTMModel(torch.nn.Module, PyTorchModelHubMixin):
-    def __init__(self, layers=2, hidden_dim=128, input_dim=26, output_dim=1):
+    def __init__(self, layers=2, hidden_dim=128, input_dim=26, output_dim=1, **kwargs):
         super().__init__()
         self.lstm = torch.nn.LSTM(input_dim, hidden_dim, num_layers=layers, batch_first=True)
         self.average = torch.nn.AdaptiveAvgPool1d(1)
@@ -149,15 +149,80 @@ def train_country_lstm(country_code):
             num += 1
         print(f"Test {country_code} MSE: {test_mse/num}")
         print(f"Test {country_code} MAE: {test_mae/num}")
-    names = [f'{c}_' for c in country_code]
+    names = [f'_{c}' for c in country_code]
     # Join the names together
     name = ''.join(names)
-    torch.save(model, f"lstm_model_{name}.pt")
+    torch.save(model, f"lstm_model{name}.pt")
     # Save the weights to huggingface
-    model.save_pretrained(f"project_resilience_lstm_model_{name}", push_to_hub=True, config=model_config)
+    model.save_pretrained(f"project_resilience_lstm_model{name}", push_to_hub=True, config=model_config)
 
-train_country_lstm([143, 29])  # UK and Brazil
-train_country_lstm([143])  # UK
-train_country_lstm([29])  # Brazil
+def test_no_change(country_code):
+    if len(country_code) == 1:
+        test_da = dataset.where(country_mask == country_code[0], drop=True).where(dataset.time > 2007, drop=True).load()
+        #train_da = dataset.where(country_mask == country_code[0], drop=True).where(dataset.time <= 2007, drop=True).load()  # 143 is the code for the UK
+    else:
+        c_mask = xr.DataArray(np.in1d(country_mask, country_code).reshape(country_mask.shape),
+                              dims=country_mask.dims, coords=country_mask.coords)
+        test_da = dataset.where(c_mask, drop=True).where(dataset.time > 2007, drop=True).load()
+        #train_da = dataset.where(c_mask, drop=True).where(dataset.time <= 2007, drop=True).load()  # 143 is the code for the UK
+    #model = LSTMModel(**model_config).to(device)
+    model = LSTMModel.from_pretrained("jacobbieker/project_resilience_lstm_model_143_29_").to(device)
+    crit = torch.nn.MSELoss()
+    stat = torch.nn.L1Loss()
+
+    with torch.no_grad():
+        model.eval()
+        test_mse = 0.0
+        test_mae = 0.0
+        num = 0
+        test_times = test_da.time.values[11:]
+        for time in test_times:
+            for lon in range(0, len(test_da.lon.values), 10):
+                for lat in range(0, len(test_da.lat.values), 10):
+                    example = test_da.isel(lat=slice(lat,lat+10), lon=slice(lon,lon+10)).sel(time=slice(time-10,time))
+                    data_names = list(example.data_vars.keys())
+                    example = xr.concat([example[var] for var in example.data_vars], dim='variable')
+                    example = example.assign_coords(variable=data_names)
+                    example = example.transpose('time', 'lat', 'lon', 'variable')
+                    target = example.sel(variable='ELUC').isel(time=8)  # Set it to the ELUC of step 8, the same as for the train sample, so no change
+                    train = example.isel(time=slice(8,9))  # Select only step 8, but keep the array 4D
+                    #assert np.isfinite(example['ELUC'].values).all()
+                    # Convert infinite values and NaN to 0.0
+                    train = train.fillna(0.0)
+                    target = target.fillna(0.0)
+                    #exit()
+                    # Set the target to be the same as train and set all values of train other than current to the same
+                    #target = train.isel(time=8)
+                    #train = train.isel(time=8)
+                    train = np.repeat(train.values, 9, axis=0)  # Repeat so there is no change across the 9 years of history
+                    torch_example = torch.from_numpy(np.nan_to_num(train, posinf=0.0, neginf=0.0))
+                    # Flatten with einops to get the shape (batch_size, time, features)
+                    torch_example = einops.rearrange(torch_example, 'time lat lon features -> (lat lon) time features')
+                    torch_target = torch.unsqueeze(torch.from_numpy(np.nan_to_num(target.values, posinf=0.0, neginf=0.0)), dim=-1)
+                    # Flatten with einops to get the shape (batch_size, time, features)
+                    torch_target = einops.rearrange(torch_target, 'lat lon features -> (lat lon) features')
+                    num += torch_target.shape[0]
+                    outputs = model(torch_example.to(device))
+                    loss = crit(outputs.cpu(), torch_target)
+                    mae_loss = stat(outputs.cpu(), torch_target)
+                    #print(loss.item())
+                    test_mse += loss.item()
+                    test_mae += mae_loss.item()
+                    # Already averaged, so just need to track num for time, lat, lon
+                    num += 1
{test_mse/num}") + print(f"Test {country_code} MAE: {test_mae/num}") + #names = [f'_{c}' for c in country_code] + # Join the names together + #name = ''.join(names) + #torch.save(model, f"lstm_model{name}.pt") + # Save the weights to huggingface + #model.save_pretrained(f"project_resilience_lstm_model{name}", push_to_hub=True, config=model_config) + + +#train_country_lstm([143, 29]) # UK and India +#train_country_lstm([143]) # UK +#train_country_lstm([29]) # Brazil +#train_country_lstm(list(range(80))) # First 80 countries # Check ELUC when no change -# Send over weights, put them on HF and integrate that as well as config options +test_no_change([143, 29])