From a2609d68440f4eb0667bef43a220c230498c493f Mon Sep 17 00:00:00 2001
From: raphaelrpl
Date: Mon, 26 Apr 2021 09:37:58 -0300
Subject: [PATCH 1/2] Add support to publish HDF files (MODIS) (close #221)

---
 bdc_collection_builder/celery/publish.py    |  17 +++-
 bdc_collection_builder/collections/hdf.py   | 109 ++++++++++++++++++++
 bdc_collection_builder/collections/utils.py |   4 +
 3 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 bdc_collection_builder/collections/hdf.py

diff --git a/bdc_collection_builder/celery/publish.py b/bdc_collection_builder/celery/publish.py
index facc5de..0d80786 100644
--- a/bdc_collection_builder/celery/publish.py
+++ b/bdc_collection_builder/celery/publish.py
@@ -243,7 +243,22 @@ def publish_collection(scene_id: str, data: BaseCollection, collection: Collecti
     else:
         destination.mkdir(parents=True, exist_ok=True)
 
-    files = data.get_files(collection, path=file)
+    if file.endswith('.hdf'):
+        from ..collections.hdf import to_geotiff
+
+        destination.mkdir(parents=True, exist_ok=True)
+        item_result = to_geotiff(file, temporary_dir.name)
+        files = dict()
+
+        for _band, _geotiff in item_result.files.items():
+            destination_path = destination / Path(_geotiff).name
+            shutil.move(str(_geotiff), str(destination_path))
+            files[_band] = destination_path
+
+        file = destination
+        cloud_cover = item_result.cloud_cover
+    else:
+        files = data.get_files(collection, path=file)
 
     extra_assets = data.get_assets(collection, path=file)
 
diff --git a/bdc_collection_builder/collections/hdf.py b/bdc_collection_builder/collections/hdf.py
new file mode 100644
index 0000000..e0c0707
--- /dev/null
+++ b/bdc_collection_builder/collections/hdf.py
@@ -0,0 +1,109 @@
+#
+# This file is part of Brazil Data Cube Collection Builder.
+# Copyright (C) 2019-2020 INPE.
+#
+# Brazil Data Cube Collection Builder is free software; you can redistribute it and/or modify it
+# under the terms of the MIT License; see LICENSE file for more details.
+#
+
+"""Module to deal with the Hierarchical Data Format (HDF4/HDF5)."""
+
+from pathlib import Path
+from typing import NamedTuple
+
+from osgeo import gdal
+
+# Map of NumPy dtype names to the matching GDAL data types.
+DTYPES = dict(
+    uint8=gdal.GDT_Byte,
+    int16=gdal.GDT_Int16,
+    uint16=gdal.GDT_UInt16,
+    int32=gdal.GDT_Int32,
+    uint32=gdal.GDT_UInt32,
+)
+
+ItemResult = NamedTuple('ItemResult', [('files', dict), ('cloud_cover', float)])
+"""Type to represent the scenes extracted from a Hierarchical Data Format (HDF4/HDF5) file."""
+
+
+def to_geotiff(hdf_path: str, destination: str) -> ItemResult:
+    """Convert a Hierarchical Data Format (HDF4/HDF5) file to a set of GeoTIFF files.
+
+    Args:
+        hdf_path (str): Path to the HDF file to be extracted.
+        destination (str): The destination folder.
+
+    Note:
+        The output GeoTIFF files are not Cloud Optimized GeoTIFF (COG).
+
+    Tip:
+        You may use the utility :meth:`bdc_collection_builder.collections.utils.generate_cogs` to generate Cloud Optimized GeoTIFF files.
+
+    Raises:
+        IOError: When the input file is not a valid data set.
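+
+    Example:
+        A minimal usage sketch, assuming a downloaded MODIS scene (the paths below are hypothetical)::
+
+            result = to_geotiff('/data/MOD13Q1.A2021001.h13v10.006.hdf', '/tmp/mod13q1')
+            print(result.cloud_cover, sorted(result.files))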
+
+    Returns:
+        ItemResult: A struct containing the extracted files and the scene cloud cover.
+    """
+    data_set = gdal.Open(hdf_path)
+
+    if data_set is None:
+        raise IOError(f'Could not open {str(hdf_path)}')
+
+    base_name = Path(hdf_path).stem
+    metadata = data_set.GetMetadata()
+    cloud_cover = float(metadata.get('QAPERCENTCLOUDCOVER.1') or 0)
+    output_path = Path(destination)
+
+    geotiff_driver = gdal.GetDriverByName('GTiff')
+    files = dict()
+    # Band iterator index to retrieve the metadata value of each sub dataset
+    band_idx = 1
+    for data_set_name, _ in data_set.GetSubDatasets():
+        formal_name = metadata[f'PARAMETERNAME.{band_idx}']
+        # Keep only the variable name (e.g. '250m 16 days NDVI' -> 'NDVI').
+        band_name = '_'.join(formal_name.split(' ')[3:])
+
+        # Open the sub dataset in its own variable to avoid shadowing the HDF handle.
+        sub_data_set = gdal.Open(data_set_name)
+        band = sub_data_set.GetRasterBand(1)
+        array = band.ReadAsArray()
+
+        tiff_file = output_path / f'{base_name}_{band_name}.tif'
+
+        output_data_set = geotiff_driver.Create(
+            str(tiff_file),
+            sub_data_set.RasterXSize,
+            sub_data_set.RasterYSize,
+            1,
+            DTYPES[array.dtype.name]
+        )
+        output_data_set_band = output_data_set.GetRasterBand(1)
+        output_data_set.SetGeoTransform(sub_data_set.GetGeoTransform())
+        output_data_set.SetProjection(sub_data_set.GetProjection())
+        output_data_set.SetMetadata(metadata)
+        output_data_set_band.WriteArray(array)
+        output_data_set_band.SetNoDataValue(0)
+
+        files[band_name] = str(tiff_file)
+
+        # Release the GDAL handles to flush the raster to disk.
+        output_data_set_band = None
+        output_data_set = None
+
+        band_idx += 1
+
+    return ItemResult(files=files, cloud_cover=cloud_cover)
+
+
+def is_valid(file_path: str) -> bool:
+    """Check the HDF file integrity with the GDAL library."""
+    ds = gdal.Open(file_path)
+
+    return ds is not None
diff --git a/bdc_collection_builder/collections/utils.py b/bdc_collection_builder/collections/utils.py
index df2845b..6e42115 100644
--- a/bdc_collection_builder/collections/utils.py
+++ b/bdc_collection_builder/collections/utils.py
@@ -449,6 +449,10 @@ def is_valid_compressed_file(file_path: str) -> bool:
         return is_valid_tar(file_path)
     if file_path.endswith('.tar.gz'):
         return is_valid_tar_gz(file_path)
+    if file_path.endswith('.hdf'):
+        from .hdf import is_valid
+        return is_valid(file_path)
+
     return True
 
 def is_valid_tar(file_path: str) -> bool:

From c4c096eb3be8e31bde14d4801c311337baf84a48 Mon Sep 17 00:00:00 2001
From: raphaelrpl
Date: Mon, 26 Apr 2021 09:47:37 -0300
Subject: [PATCH 2/2] Improve setup.py to support GDAL as an optional extra

---
 setup.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index bc4dd86..066c6ad 100644
--- a/setup.py
+++ b/setup.py
@@ -39,7 +39,11 @@
 extras_require = {
     'docs': docs_require,
     'tests': tests_require,
-    'harmonization': harmonization_require
+    'harmonization': harmonization_require,
+    'gdal': [
+        'GDAL>=2.3',
+        'bdc-collectors @ git+git://github.com/brazil-data-cube/bdc-collectors.git@v0.2.1#egg=bdc-collectors[modis]',
+    ]
 }
 
 extras_require['all'] = [req for exts, reqs in extras_require.items() for req in reqs]
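
Usage sketch for the new module, assuming the gdal extra is installed
(e.g. pip install -e .[gdal]); the input path below is hypothetical:

    import os

    from bdc_collection_builder.collections.hdf import is_valid, to_geotiff

    hdf_file = '/data/MOD13Q1.A2021001.h13v10.006.hdf'  # hypothetical MODIS scene
    output_dir = '/tmp/mod13q1'
    os.makedirs(output_dir, exist_ok=True)  # GDAL does not create the folder itself

    # Validate the file before extracting one GeoTIFF per sub dataset (band).
    if is_valid(hdf_file):
        result = to_geotiff(hdf_file, output_dir)
        print(result.cloud_cover, sorted(result.files))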