diff --git a/notebooks/2.0-test-normalization.ipynb b/notebooks/2.0-test-normalization.ipynb new file mode 100644 index 0000000..d703dce --- /dev/null +++ b/notebooks/2.0-test-normalization.ipynb @@ -0,0 +1,8697 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/freischem/.conda/envs/rs_tools_iti/lib/python3.11/site-packages/goes2go/data.py:519: FutureWarning: 'H' is deprecated and will be removed in a future version. Please use 'h' instead of 'H'.\n", + " within=pd.to_timedelta(config[\"nearesttime\"].get(\"within\", \"1H\")),\n", + "/home/freischem/.conda/envs/rs_tools_iti/lib/python3.11/site-packages/goes2go/NEW.py:188: FutureWarning: 'H' is deprecated and will be removed in a future version. Please use 'h' instead of 'H'.\n", + " within=pd.to_timedelta(config[\"nearesttime\"].get(\"within\", \"1H\")),\n" + ] + } + ], + "source": [ + "import autoroot\n", + "from rs_tools._src.preprocessing.normalize import normalize\n", + "\n", + "%matplotlib inline\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "goes_files = os.listdir('/mnt/disks/data/miniset/goes16/geoprocessed/')\n", + "goes_files = ['/mnt/disks/data/miniset/goes16/geoprocessed/' + f for f in goes_files]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# f = goes_files[0].split('/')[-1]\n", + "# f" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# ls /mnt/disks/data/miniset/goes16/geoprocessed/" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# get_list_filenames('/mnt/disks/data/miniset/goes16/geoprocessed/', ext='nc')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# from iti.data.geo_utils import get_split, get_list_filenames\n", + "\n", + "\n", + "# splits_dict = { \n", + "# \"train\": {\"years\": [2020], \"months\": [10], \"days\": list(range(1,20))},\n", + "# \"val\": {\"years\": [2020], \"months\": [10], \"days\": list(range(20,32))},\n", + "# }\n", + "\n", + "# get_split(goes_files, splits_dict['train'])\n", + " \n", + "\n", + "# # def get_files(self):\n", + "# # # Get filenames from data_dir\n", + "# # files = get_list_filenames(data_path=self.data_dir, ext=self.ext)\n", + "# # # split files based on split criteria\n", + "# # files = get_split(files=files, split_dict=self.splits_dict)\n", + "# # return files\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "# ds_goes_files = xr.open_mfdataset(goes_files[:100])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "goes_time = xr.open_dataset(goes_files[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
<xarray.DataArray 'Rad' ()> Size: 4B\n", + "array(20.26424, dtype=float32)\n", + "Coordinates:\n", + " band int8 1B 16
<xarray.Dataset> Size: 272B\n", + "Dimensions: (band_wavelength: 16, band: 16)\n", + "Coordinates:\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " mean (band) float32 64B 144.0 74.45 43.39 ... 89.34 96.57 85.23\n", + " std (band) float64 128B 97.21 80.72 53.49 ... 23.48 23.76 17.25
<xarray.Dataset> Size: 220B\n", + "Dimensions: (band_wavelength: 3, band: 16)\n", + "Coordinates:\n", + " * band_wavelength (band_wavelength) float32 12B 0.47 1.38 1.61\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " mean (band) float32 64B 144.0 74.45 43.39 ... 89.34 96.57 85.23\n", + " std (band) float64 128B 97.21 80.72 53.49 ... 23.48 23.76 17.25
<xarray.Dataset> Size: 484B\n", + "Dimensions: (band: 11, band_wavelength: 11)\n", + "Coordinates:\n", + " * band (band) <U6 264B 'IR_016' 'IR_039' ... 'WV_062' 'WV_073'\n", + " * band_wavelength (band_wavelength) float64 88B 1.64 3.92 8.7 ... 6.25 7.35\n", + "Data variables:\n", + " mean (band) float32 44B 1.814 0.6359 51.78 ... 3.091 3.14 13.84\n", + " std (band) float64 88B 1.735 0.2418 16.02 ... 0.9149 3.935
<xarray.Dataset> Size: 440B\n", + "Dimensions: (band: 11, band_wavelength: 11)\n", + "Coordinates:\n", + " * band (band) <U6 264B 'IR_016' 'IR_039' ... 'WV_062' 'WV_073'\n", + " * band_wavelength (band_wavelength) float64 88B 1.64 3.92 8.7 ... 6.25 7.35\n", + "Data variables:\n", + " mean (band) float32 44B 1.814 0.6359 51.78 ... 3.091 3.14 13.84\n", + " std (band) float32 44B 5.322 0.1253 417.6 ... 30.0 1.242 24.09" + ], + "text/plain": [ + "
<xarray.Dataset> Size: 275MB\n", + "Dimensions: (x: 504, y: 3687, time: 1, band_wavelength: 16, band: 16)\n", + "Coordinates:\n", + " * x (x) float32 2kB 3.108e+06 3.109e+06 ... 3.611e+06 3.612e+06\n", + " * y (y) float32 15kB 501.3 1.503e+03 ... 3.693e+06 3.694e+06\n", + " * time (time) <U16 64B '2020-10-27 13:05'\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + " latitude (y, x) float64 15MB ...\n", + " longitude (y, x) float64 15MB ...\n", + " cloud_mask (y, x) float32 7MB ...\n", + "Data variables:\n", + " Rad (band, y, x, time) float32 119MB -19.53 -13.84 ... -9.927\n", + " DQF (band, y, x, time) float32 119MB -0.0002013 ... 0.0
<xarray.Dataset> Size: 400B\n", + "Dimensions: (time: 1, band_wavelength: 16, band: 16)\n", + "Coordinates:\n", + " * time (time) <U16 64B '2020-10-27 13:05'\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " Rad (band, time) float64 128B 7.591e+03 5.06e+03 ... 229.4\n", + " DQF (band, time) float64 128B 0.0004025 0.0 ... 0.0 0.0" + ], + "text/plain": [ + "
<xarray.Dataset> Size: 400B\n", + "Dimensions: (band: 16, time: 1, band_wavelength: 16)\n", + "Coordinates:\n", + " * time (time) <U16 64B '2020-10-27 13:05'\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " Rad (band, time) float64 128B 87.13 71.13 46.68 ... 20.72 15.15\n", + " DQF (band, time) float64 128B 0.02006 0.0 0.006225 ... 0.0 0.0" + ], + "text/plain": [ + "
<xarray.Dataset> Size: 2kB\n", + "Dimensions: (time: 10, band_wavelength: 16, band: 16)\n", + "Coordinates:\n", + " * time (time) <U16 640B '2020-10-07 17:05' ... '2020-10-30 15:05'\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " Rad (band, time) float32 640B 1.152e+04 2.916e+04 ... 535.9\n", + " DQF (band, time) float32 640B 0.04563 0.2669 ... 0.0 0.0
<xarray.Dataset> Size: 144B\n", + "Dimensions: (band_wavelength: 16, band: 16)\n", + "Coordinates:\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " mean (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>
<xarray.Dataset> Size: 480B\n", + "Dimensions: (band_wavelength: 16, band: 32)\n", + "Coordinates:\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 32B 1 2 3 4 5 6 7 8 ... 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " mean (band) float32 128B dask.array<chunksize=(8,), meta=np.ndarray>\n", + " std (band) float32 128B dask.array<chunksize=(24,), meta=np.ndarray>\n", + " DQF (band) float32 128B dask.array<chunksize=(24,), meta=np.ndarray>
<xarray.Dataset> Size: 272B\n", + "Dimensions: (band_wavelength: 16, band: 16)\n", + "Coordinates:\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " std (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>\n", + " DQF (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>\n", + " mean (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>
<xarray.Dataset> Size: 208B\n", + "Dimensions: (band_wavelength: 16, band: 16)\n", + "Coordinates:\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " std (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>\n", + " mean (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>
<xarray.Dataset> Size: 208B\n", + "Dimensions: (band_wavelength: 16, band: 16)\n", + "Coordinates:\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " std (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>\n", + " mean (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>
<xarray.Dataset> Size: 208B\n", + "Dimensions: (band_wavelength: 16, band: 16)\n", + "Coordinates:\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " Rad (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>\n", + " DQF (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>
<xarray.Dataset> Size: 208B\n", + "Dimensions: (band_wavelength: 16, band: 16)\n", + "Coordinates:\n", + " * band_wavelength (band_wavelength) float32 64B 0.47 0.64 ... 12.27 13.27\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16\n", + "Data variables:\n", + " Rad (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>\n", + " DQF (band) float32 64B dask.array<chunksize=(8,), meta=np.ndarray>
<xarray.DataArray 'Rad' (band: 16)> Size: 64B\n", + "dask.array<mean_agg-aggregate, shape=(16,), dtype=float32, chunksize=(8,), chunktype=numpy.ndarray>\n", + "Coordinates:\n", + " * band (band) int8 16B 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16