Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

preprocess/icdc #91

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions scripts/drafts/icdc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# The classes in src/preprocess/icdc.py all carry the `Preprocessor` suffix;
# importing the un-suffixed names fails with ImportError.
from src.preprocess.icdc import (
    ESACCISoilMoisturePreprocessor,
    LAIModisAvhrrPreprocessor,
    ModisNDVIPreprocessor,
)


def modis_ndvi():
    """Run the MODIS NDVI preprocessor with its default arguments."""
    processor = ModisNDVIPreprocessor()
    processor.preprocess()


def cci_soil_moisture():
    """Run the ESA CCI soil moisture preprocessor with its default arguments."""
    processor = ESACCISoilMoisturePreprocessor()
    processor.preprocess()


def modis_lai():
    """Run the AVHRR/MODIS LAI preprocessor with its default arguments."""
    processor = LAIModisAvhrrPreprocessor()
    processor.preprocess()


if __name__ == '__main__':
    modis_ndvi()
    cci_soil_moisture()
    modis_lai()
35 changes: 27 additions & 8 deletions scripts/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

from src.preprocess.admin_boundaries import KenyaAdminPreprocessor


def process_vci_2018():
# if the working directory is already ml_drought, we don't need ../data
if Path('.').absolute().as_posix().split('/')[-1] == 'ml_drought':
Expand Down Expand Up @@ -127,11 +126,31 @@ def preprocess_era5():
processor.preprocess(subset_str='kenya', regrid=regrid_path)


def preprocess_icdc():
    """Preprocess the ICDC MODIS NDVI and MODIS LST datasets.

    Both datasets are subset with ``subset_str='ethiopia_safe'``. The data
    folder is ``data`` when the current working directory is named
    ``ml_drought`` and ``../data`` otherwise.
    """
    from src.preprocess.icdc import (
        ModisLSTPreprocessor,
        ModisNDVIPreprocessor,
    )

    cwd_name = Path('.').absolute().as_posix().split('/')[-1]
    data_path = Path('data') if cwd_name == 'ml_drought' else Path('../data')

    # NDVI first, then LST: the same order as the explicit calls it replaces.
    for preprocessor_cls in (ModisNDVIPreprocessor, ModisLSTPreprocessor):
        preprocessor_cls(data_path).preprocess(subset_str='ethiopia_safe')

if __name__ == '__main__':
process_vci_2018()
process_precip_2018()
process_era5POS_2018()
process_gleam()
process_esa_cci_landcover()
preprocess_srtm()
preprocess_era5()
# process_vci_2018()
# process_precip_2018()
# process_era5POS_2018()
# process_gleam()
# process_esa_cci_landcover()
# preprocess_srtm()
# preprocess_era5()
preprocess_icdc()
247 changes: 247 additions & 0 deletions src/preprocess/icdc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
from pathlib import Path
import xarray as xr
from shutil import rmtree
from typing import Optional, List

from .base import BasePreProcessor


class ICDCPreprocessor(BasePreProcessor):
    """Base preprocessor for data stored on ICDC (SPECIFIC to Uni Server).

    Subclasses set ``source`` and ``dataset``; the netCDF files are looked up
    below ``/pool/data/ICDC/{source}/{dataset}/DATA``.
    """
    variable: str  # the name of the variable on icdc
    source: str  # {'land', 'atmosphere', 'climate_indices', 'ocean', 'ice_and_snow'}
    dataset: str  # name of the dataset directory below the source directory

    def __init__(self, data_folder: Path = Path('data')) -> None:
        super().__init__(data_folder)
        self.icdc_data_dir = Path(f'/pool/data/ICDC/{self.source}/')

    def get_icdc_filepaths(self) -> List[Path]:
        """Collect the netCDF files belonging to this dataset.

        Three directory layouts are tried in order; the first one that yields
        at least one file is used:

        1. ``DATA/<subdirectory>/*.nc``
        2. ``DATA/*.nc``
        3. ``DATA/MONTHLY/**/*.nc``

        Returns
        -------
        List[Path]
            The matching files, sorted so the processing order is
            deterministic. Empty if no layout matches.
        """
        data_dir = self.icdc_data_dir / self.dataset / 'DATA'

        filepaths: List[Path] = []
        for subdir in data_dir.iterdir():
            if subdir.is_dir():
                filepaths.extend(subdir.glob('*.nc'))

        if not filepaths:
            filepaths.extend(data_dir.glob('*.nc'))

        if not filepaths:
            # HACKY: for the lst dataset
            filepaths.extend(data_dir.glob('MONTHLY/**/*.nc'))

        return sorted(filepaths)

    @staticmethod
    def create_filename(netcdf_filename: str,
                        subset_name: Optional[str] = None) -> str:
        """Build the name of the interim file.

        The last three characters of ``netcdf_filename`` (expected to be
        ``.nc``) are stripped; the result is ``{stem}_{subset_name}.nc`` when
        a subset name is given and ``{stem}.nc`` otherwise.
        """
        filename_stem = netcdf_filename[:-3]
        if subset_name is None:
            return f'{filename_stem}.nc'
        return f'{filename_stem}_{subset_name}.nc'

    def _preprocess_single(self, netcdf_filepath: Path,
                           subset_str: Optional[str] = 'kenya',
                           regrid: Optional[xr.Dataset] = None) -> None:
        """Run the Preprocessing steps for the data stored on ICDC
        https://icdc.cen.uni-hamburg.de/1/daten.html

        Process:
        -------
        * chop out ROI
        * create new dataset with regrid dimensions
        * Save the output file to new folder

        Raises
        ------
        AssertionError
            If ``netcdf_filepath`` does not end in ``.nc``.
        """
        print(f'Starting work on {netcdf_filepath.name}')

        # Validate the name up front so a bad path fails before any work is done
        assert netcdf_filepath.name[-3:] == '.nc', \
            f'filepath name should be a .nc file. Currently: {netcdf_filepath.name}'

        # 1. read in the dataset; the context manager closes the file handle
        #    once the interim file has been written
        with xr.open_dataset(netcdf_filepath) as raw_ds:
            ds = raw_ds

            # 2. chop out the region of interest
            if subset_str is not None:
                try:
                    ds = self.chop_roi(ds, subset_str, inverse_lat=True)
                except AssertionError:
                    # retry with the other latitude ordering
                    ds = self.chop_roi(ds, subset_str, inverse_lat=False)

            # 3. regrid onto the reference grid, if one was supplied
            if regrid is not None:
                ds = self.regrid(ds, regrid)

            # 4. create the filepath and save to that location
            filename = self.create_filename(
                netcdf_filepath.name,
                subset_name=subset_str
            )
            print(f"Saving to {self.interim}/{filename}")
            ds.to_netcdf(self.interim / filename)

        print(f"** Done for {self.dataset} {netcdf_filepath.name} **")

    def preprocess(self, subset_str: Optional[str] = 'kenya',
                   regrid: Optional[Path] = None,
                   resample_time: Optional[str] = 'M',
                   upsampling: bool = False,
                   cleanup: bool = False) -> None:
        """ Preprocess all of the dataset's .nc files found on ICDC and
        merge the interim results.

        Arguments
        ----------
        subset_str: Optional[str] = 'kenya'
            Name of the region to subset when preprocessing. If None, no
            subsetting happens
        regrid: Optional[Path] = None
            If a Path is passed, the files will be regridded to have the same
            grid as the dataset at that Path. If None, no regridding happens
        resample_time: str = 'M'
            Passed on to `merge_files` together with `upsampling`
        upsampling: bool = False
            Passed on to `merge_files` together with `resample_time`
        cleanup: bool = False
            If true, delete interim files created by the class
        """
        nc_files = self.get_icdc_filepaths()

        # Keep the caller's Path and the loaded reference grid in separate names
        regrid_ds = None
        if regrid is not None:
            regrid_ds = self.load_reference_grid(regrid)

        for file in nc_files:
            self._preprocess_single(file, subset_str, regrid_ds)

        # merge all of the timesteps
        self.merge_files(subset_str, resample_time, upsampling)

        if cleanup:
            rmtree(self.interim)


class ESACCISoilMoisturePreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/esa_cci_soilmoisture``."""
    dataset = 'esa_cci_soilmoisture'
    source = 'land'


class LAIModisAvhrrPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/avhrr_modis_lai``."""
    dataset = 'avhrr_modis_lai'
    source = 'land'


class ModisNDVIPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/modis_aqua_vegetationindex``."""
    dataset = 'modis_aqua_vegetationindex'
    source = 'land'


class AMSRESoilMoisturePreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/amsre_soilmoisture``."""
    dataset = 'amsre_soilmoisture'
    source = 'land'


class ASCATSoilMoisturePreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/ascat_soilmoisture``."""
    dataset = 'ascat_soilmoisture'
    source = 'land'


class EUMetsatAlbedoPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/eumetsat_albedo``."""
    dataset = 'eumetsat_albedo'
    source = 'land'


class EUMetSatAlbedo2Preprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/eumetsat_clara2_surfacealbedo``."""
    dataset = 'eumetsat_clara2_surfacealbedo'
    source = 'land'


class EUMetSatRadiationPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/eumetsat_clara2_surfaceradiation``."""
    dataset = 'eumetsat_clara2_surfaceradiation'
    source = 'land'


class EUMetSatIrradiancePreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/eumetsat_surfacesolarirradiance``."""
    dataset = 'eumetsat_surfacesolarirradiance'
    source = 'land'


class SpotFAPARPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/fapar_spot_proba_v``."""
    dataset = 'fapar_spot_proba_v'
    source = 'land'


class GLEAMEvaporationPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/gleam_evaporation``."""
    dataset = 'gleam_evaporation'
    source = 'land'


class SpotLaiPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/lai_spot_proba_v``."""
    dataset = 'lai_spot_proba_v'
    source = 'land'


class SpotLSAlbedoPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/land_surface_albedo_spot``."""
    dataset = 'land_surface_albedo_spot'
    source = 'land'


class ModisAlbedoPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/modis_albedo``."""
    dataset = 'modis_albedo'
    source = 'land'


class ModisForestCoverPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/modis_forestcoverfraction``."""
    dataset = 'modis_forestcoverfraction'
    source = 'land'


class ModisLandcoverPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/modis_landcover``."""
    dataset = 'modis_landcover'
    source = 'land'


class ModisLatLonPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/modis_latlon``."""
    dataset = 'modis_latlon'
    source = 'land'


class ModisLSTClimatologyPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/modis_lst_climatology``."""
    dataset = 'modis_lst_climatology'
    source = 'land'


class ModisNPPPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/modis_primary_production``."""
    dataset = 'modis_primary_production'
    source = 'land'


class ModisSRTMPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/modis-srtm_landwaterdistribution``."""
    dataset = 'modis-srtm_landwaterdistribution'
    source = 'land'


class ModisLSTPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/modis_terra_landsurfacetemperature``."""
    dataset = 'modis_terra_landsurfacetemperature'
    source = 'land'


class SMOSSoilMoisturePreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/smos_soilmoisture``."""
    dataset = 'smos_soilmoisture'
    source = 'land'


class TopographyPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/topography``."""
    dataset = 'topography'
    source = 'land'


class SpotVegetationCoverFractionPreprocessor(ICDCPreprocessor):
    """Preprocessor for the ICDC dataset directory ``land/vegetationcoverfraction_spot_proba_v``."""
    dataset = 'vegetationcoverfraction_spot_proba_v'
    source = 'land'
14 changes: 12 additions & 2 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,17 @@ def get_kenya() -> Region:


def get_ethiopia() -> Region:
    """Return the bounding box of the 'ethiopia' region."""
    # A single return: the duplicated second statement was unreachable.
    return Region(
        name='ethiopia', lonmin=32.9975838, lonmax=47.9823797,
        latmin=3.397448, latmax=14.8940537
    )


def get_ethiopia_safe() -> Region:
    """Return the 'ethiopia_safe' region: longitude 30 to 50, latitude 2 to 15."""
    west, east = 30, 50
    south, north = 2, 15
    return Region(
        name='ethiopia_safe',
        lonmin=west,
        lonmax=east,
        latmin=south,
        latmax=north,
    )


def get_east_africa() -> Region:
Expand Down Expand Up @@ -181,5 +190,6 @@ def drop_nans_and_flatten(dataArray: xr.DataArray) -> np.ndarray:
# Maps a region name (as passed e.g. via `subset_str`) to its Region.
# Every Region is built eagerly when this module is imported.
region_lookup = {
    "kenya": get_kenya(),
    "ethiopia": get_ethiopia(),
    "ethiopia_safe": get_ethiopia_safe(),
    "east_africa": get_east_africa(),
}
Loading