diff --git a/.github/workflows/build_on_pull_request.yaml b/.github/workflows/build_on_pull_request.yaml index 7689fd8..92f1dca 100644 --- a/.github/workflows/build_on_pull_request.yaml +++ b/.github/workflows/build_on_pull_request.yaml @@ -20,7 +20,7 @@ jobs: build --user - name: Build a binary wheel and a source for drivers - run: python3 -m build ./drivers + run: python3 -m build ./drivers - name: Set Docker image tag name run: echo "TAG=$(date +'%Y.%m.%d.%H.%M')" >> $GITHUB_ENV - name: TAG ECHO @@ -32,7 +32,7 @@ jobs: password: ${{ secrets.DOCKER_PASSWORD }} registry: ${{ vars.DOCKER_REGISTRY }} - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v2 - name: Build and push drivers uses: docker/build-push-action@v5 with: @@ -45,7 +45,7 @@ jobs: REGISTRY=${{ vars.GEOKUBE_REGISTRY }} tags: | ${{ vars.DOCKER_REGISTRY }}/geolake-drivers:${{ env.TAG }} - ${{ vars.DOCKER_REGISTRY }}/geolake-drivers:latest + ${{ vars.DOCKER_REGISTRY }}/geolake-drivers:latest - name: Build and push datastore component uses: docker/build-push-action@v5 with: @@ -58,7 +58,7 @@ jobs: cache-to: type=gha,mode=max tags: | ${{ vars.DOCKER_REGISTRY }}/geolake-datastore:${{ env.TAG }} - ${{ vars.DOCKER_REGISTRY }}/geolake-datastore:latest + ${{ vars.DOCKER_REGISTRY }}/geolake-datastore:latest - name: Build and push api component uses: docker/build-push-action@v5 with: @@ -71,7 +71,7 @@ jobs: cache-to: type=gha,mode=max tags: | ${{ vars.DOCKER_REGISTRY }}/geolake-api:${{ env.TAG }} - ${{ vars.DOCKER_REGISTRY }}/geolake-api:latest + ${{ vars.DOCKER_REGISTRY }}/geolake-api:latest - name: Build and push executor component uses: docker/build-push-action@v5 with: @@ -84,4 +84,4 @@ jobs: cache-to: type=gha,mode=max tags: | ${{ vars.DOCKER_REGISTRY }}/geolake-executor:${{ env.TAG }} - ${{ vars.DOCKER_REGISTRY }}/geolake-executor:latest \ No newline at end of file + ${{ vars.DOCKER_REGISTRY }}/geolake-executor:latest \ No newline at end of file diff --git a/.github/workflows/build_on_release.yaml b/.github/workflows/build_on_release.yaml index 4a8da66..4c7d8be 100644 --- a/.github/workflows/build_on_release.yaml +++ b/.github/workflows/build_on_release.yaml @@ -32,17 +32,20 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 - name: Build and push drivers - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: ./drivers file: ./drivers/Dockerfile push: true build-args: | REGISTRY=${{ vars.GEOKUBE_REGISTRY }} + TAG=v0.2a6 + cache-from: type=gha + cache-to: type=gha,mode=max tags: | ${{ vars.GEOLAKE_REGISTRY }}/geolake-drivers:${{ env.RELEASE_TAG }} - name: Build and push datastore component - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: ./datastore file: ./datastore/Dockerfile @@ -50,10 +53,12 @@ jobs: build-args: | REGISTRY=${{ vars.GEOLAKE_REGISTRY }} TAG=${{ env.RELEASE_TAG }} + cache-from: type=gha + cache-to: type=gha,mode=max tags: | ${{ vars.GEOLAKE_REGISTRY }}/geolake-datastore:${{ env.RELEASE_TAG }} - name: Build and push api component - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: ./api file: ./api/Dockerfile @@ -61,10 +66,12 @@ jobs: build-args: | REGISTRY=${{ vars.GEOLAKE_REGISTRY }} TAG=${{ env.RELEASE_TAG }} + cache-from: type=gha + cache-to: type=gha,mode=max tags: | ${{ vars.GEOLAKE_REGISTRY }}/geolake-api:${{ env.RELEASE_TAG }} - name: Build and push executor component - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: ./executor file: ./executor/Dockerfile @@ -72,5 +79,7 @@ jobs: build-args: | REGISTRY=${{ vars.GEOLAKE_REGISTRY }} TAG=${{ env.RELEASE_TAG }} + cache-from: type=gha + cache-to: type=gha,mode=max tags: | ${{ vars.GEOLAKE_REGISTRY }}/geolake-executor:${{ env.RELEASE_TAG }} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 44f8be3..2c15e0a 100644 --- a/.gitignore +++ b/.gitignore @@ -112,5 +112,3 @@ venv.bak/ _catalogs/ _old/ -# Netcdf files -*.nc diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..02b3311 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,51 @@ +# This CITATION.cff file was generated with cffinit. +# Visit https://bit.ly/cffinit to generate yours today! + +cff-version: 1.2.0 +title: geolake +message: >- + If you use this software, please cite it using the + metadata from this file. +type: software +authors: + - given-names: Marco + family-names: Mancini + orcid: 'https://orcid.org/0000-0002-9150-943X' + - given-names: Jakub + family-names: Walczak + orcid: 'https://orcid.org/0000-0002-5632-9484' + - given-names: Mirko + family-names: Stojiljković + - given-names: Valentina + family-names: Scardigno + orcid: 'https://orcid.org/0000-0002-0123-5368' +identifiers: + - type: doi + value: 10.5281/zenodo.10598417 +repository-code: 'https://github.com/CMCC-Foundation/geolake' +abstract: >+ + geolake is an open source framework for management, + storage, and analytics of Earth Science data. geolake + implements the concept of a data lake as a central + location that holds a large amount of data in its native + and raw format. geolake does not impose any schema when + ingesting the data, however it provides a unified Data + Model and API for geoscientific datasets. The data is kept + in the original format and storage, and the in-memory data + structure is built on-the-fly for the processing analysis. + + The system has been designed using a cloud-native + architecture, based on containerized microservices, that + facilitates the development, deployment and maintenance of + the system itself. It has been implemented by integrating + different open source frameworks, tools and libraries and + can be easily deployed using the Kubernetes platform and + related tools such as kubectl. + +keywords: + - python framework + - earth science + - data analytics +license: Apache-2.0 +version: 0.1.0 +date-released: '2024-01-29' diff --git a/README.md b/README.md index e1490aa..4d775a5 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10598417.svg)](https://doi.org/10.5281/zenodo.10598417) + # geolake ## Description diff --git a/api/app/main.py b/api/app/main.py index 09271f3..1c9bd4f 100644 --- a/api/app/main.py +++ b/api/app/main.py @@ -1,5 +1,5 @@ -"""Main module with dekube-dds API endpoints defined""" -__version__ = "2.0" +"""Main module with geolake API endpoints defined""" +__version__ = "0.1.0" import os from typing import Optional, Dict from datetime import datetime @@ -66,12 +66,12 @@ def map_to_geoquery( extend_json_encoders() app = FastAPI( - title="geokube-dds API", - description="REST API for geokube-dds", + title="geolake API", + description="REST API for geolake", version=__version__, contact={ - "name": "geokube Contributors", - "email": "geokube@googlegroups.com", + "name": "geolake Contributors", + "email": "geolake@googlegroups.com", }, license_info={ "name": "Apache 2.0", @@ -118,9 +118,9 @@ def map_to_geoquery( # ======== Endpoints definitions ========= # @app.get("/", tags=[tags.BASIC]) -async def dds_info(): - """Return current version of the DDS API""" - return f"DDS API {__version__}" +async def geolake_info(): + """Return current version of the geolake API""" + return f"geolake API {__version__}" @app.get("/datasets", tags=[tags.DATASET]) diff --git a/catalog/README.md b/catalog/README.md new file mode 100644 index 0000000..72f90f3 --- /dev/null +++ b/catalog/README.md @@ -0,0 +1,2 @@ +# geolake-sample-catalog +geolake Catalog Sample diff --git a/catalog/RS_indices.yaml b/catalog/RS_indices.yaml new file mode 100644 index 0000000..e85ecbe --- /dev/null +++ b/catalog/RS_indices.yaml @@ -0,0 +1,40 @@ +metadata: + description: >- + Remote Sensing Indices derived from SENTINEL S2A data + contact: + name: Data Deliver System Support Team + email: dds-support@cmcc.it + webpage: https://www.cmcc.it/research-organization/research-divisions/advanced-scientific-computing-division#1553329820238-2055494b-9aa6 + label: Remote Sensing Indices from Sentinel S2A + image: null + doi: null + update_frequency: null + license: null + publication_date: 2023-11-22 + related_data: null + +sources: + 10m: + description: Remote Sensing Indices at 10m + metadata: + role: public + filters: + - name: pasture + user_defined: T + label: Pasture + driver: geokube_netcdf + args: + path: '{{ CATALOG_DIR }}/datasets/RS_indices/*/10m/*.nc' + pattern: '{{ CATALOG_DIR }}/datasets/RS_indices/{pasture}/10m/{}.nc' + field_id: '{standard_name}' + mapping: + NDVI: {'name': 'NDVI', 'description': 'Normalized Difference Vegetation Index'} + NDWI: {'name': 'NDWI', 'description': 'Normalized Difference Water Index'} + GLI: {'name': 'GLI', 'description': 'Green Leaf Index'} + GCI: {'name': 'GCI', 'description': 'Green Chlorophyll Index'} + RGR: {'name': 'RGR', 'description': 'Red-Green Ratio'} + metadata_caching: false + metadata_cache_path: '{{ CACHE_DIR }}/s2-indices-10m.cache' + xarray_kwargs: + parallel: true + decode_coords: 'all' \ No newline at end of file diff --git a/catalog/cache.py b/catalog/cache.py new file mode 100644 index 0000000..15bca03 --- /dev/null +++ b/catalog/cache.py @@ -0,0 +1,22 @@ +import argparse +import intake + +parser = argparse.ArgumentParser( + prog="Cache generator", + description="The script generating cache for the catalog", +) +parser.add_argument( + "--cachedir", + type=str, + help="Directory where the cache should be saved. Default: .cache", + default=".cache", +) + +if __name__ == "__main__": + args = parser.parse_args() + catalog = intake.open_catalog("catalog.yaml") + for ds in list(catalog): + for p in list(catalog[ds]): + print(f"dataset: {ds} product: {p}:") + catalog = catalog(CACHE_DIR=args.cachedir) + kube = catalog[ds][p].read() \ No newline at end of file diff --git a/catalog/catalog.yaml b/catalog/catalog.yaml new file mode 100644 index 0000000..19e5913 --- /dev/null +++ b/catalog/catalog.yaml @@ -0,0 +1,23 @@ +metadata: + version: 0.1 + parameters: + CACHE_DIR: + type: str + description: folder to store metadata cache files + default: .cache + +sources: + era5-downscaled: + driver: yaml_file_cat + args: + path: '{{ CATALOG_DIR }}/era5_downscaled.yaml' + + thi: + driver: yaml_file_cat + args: + path: '{{ CATALOG_DIR }}/thi.yaml' + + rs-indices: + driver: yaml_file_cat + args: + path: '{{ CATALOG_DIR }}/RS_indices.yaml' diff --git a/catalog/datasets/RS_indices/Donnola/10m/regular.nc b/catalog/datasets/RS_indices/Donnola/10m/regular.nc new file mode 100644 index 0000000..18e011a Binary files /dev/null and b/catalog/datasets/RS_indices/Donnola/10m/regular.nc differ diff --git a/catalog/datasets/THI/20240101.nc b/catalog/datasets/THI/20240101.nc new file mode 100644 index 0000000..f764727 Binary files /dev/null and b/catalog/datasets/THI/20240101.nc differ diff --git a/catalog/datasets/era5_downscaled.nc b/catalog/datasets/era5_downscaled.nc new file mode 100644 index 0000000..4d569d9 Binary files /dev/null and b/catalog/datasets/era5_downscaled.nc differ diff --git a/catalog/era5_downscaled.yaml b/catalog/era5_downscaled.yaml new file mode 100644 index 0000000..ed7fe9f --- /dev/null +++ b/catalog/era5_downscaled.yaml @@ -0,0 +1,12 @@ +metadata: + description: >- + This dataset is related to ERA5 downscaled over Italy at 2km. + +sources: + hourly: + description: ERA5 downscaled at 2km over italy hourly. + driver: geokube_netcdf + args: + path: '{{ CATALOG_DIR }}/datasets/era5_downscaled.nc' + metadata_caching: true + metadata_cache_path: '{{ CACHE_DIR }}/era5_downscaled.cache' diff --git a/catalog/thi.yaml b/catalog/thi.yaml new file mode 100644 index 0000000..6bc9c97 --- /dev/null +++ b/catalog/thi.yaml @@ -0,0 +1,37 @@ +metadata: + description: >- + Thermohygrometric Indices derived from MISTRAL COSMO-2I data + contact: + name: Data Deliver System Support Team + email: dds-support@cmcc.it + webpage: https://www.cmcc.it/research-organization/research-divisions/advanced-scientific-computing-division#1553329820238-2055494b-9aa6 + label: Thermohygrometric Indices over Italy + image: null + doi: null + update_frequency: null + license: null + publication_date: 2023-06-19 + related_data: null + +sources: + hourly: + description: Hourly Thermohygrometric Indices + metadata: + role: public + filters: + - name: date + user_defined: T + label: Date + driver: geokube_netcdf + args: + path: '{{ CATALOG_DIR }}/datasets/THI/*.nc' + pattern: '{{ CATALOG_DIR }}/datasets/THI/{date}.nc' + field_id: '{standard_name}' + mapping: + THI_ext: {'name': 'external_thermohygrometric_index', 'description': 'External Thermohygrometric Index'} + THI_int: {'name': 'internal_thermohygrometric_index', 'description': 'Internal Thermohygrometric Index'} + metadata_caching: false + metadata_cache_path: '{{ CACHE_DIR }}/thi-hourly.cache' + xarray_kwargs: + parallel: true + decode_coords: 'all' \ No newline at end of file diff --git a/datastore/Dockerfile b/datastore/Dockerfile index 3269834..7e051cc 100644 --- a/datastore/Dockerfile +++ b/datastore/Dockerfile @@ -3,7 +3,7 @@ ARG TAG=latest FROM $REGISTRY/geolake-drivers:$TAG COPY requirements.txt /app/requirements.txt -RUN pip install --no-cache-dir -r /app/requirements.txt +RUN pip install --no-cache-dir -r /app/requirements.txt COPY ./datastore /app/datastore COPY ./workflow /app/workflow COPY ./dbmanager /app/dbmanager diff --git a/drivers/Dockerfile b/drivers/Dockerfile index 24c0861..b3184db 100644 --- a/drivers/Dockerfile +++ b/drivers/Dockerfile @@ -4,5 +4,6 @@ ARG TAG=latest FROM $REGISTRY/geokube:$TAG COPY dist/intake_geokube-0.1a0-py3-none-any.whl / + RUN pip3.10 install /intake_geokube-0.1a0-py3-none-any.whl RUN rm /intake_geokube-0.1a0-py3-none-any.whl