From a2609d68440f4eb0667bef43a220c230498c493f Mon Sep 17 00:00:00 2001
From: raphaelrpl
Date: Mon, 26 Apr 2021 09:37:58 -0300
Subject: [PATCH 1/2] Add support to publish HDF files (MODIS) (close #221)

---
 bdc_collection_builder/celery/publish.py    |  17 +++-
 bdc_collection_builder/collections/hdf.py   | 109 ++++++++++++++++++++
 bdc_collection_builder/collections/utils.py |   4 +
 3 files changed, 129 insertions(+), 1 deletion(-)
 create mode 100644 bdc_collection_builder/collections/hdf.py

diff --git a/bdc_collection_builder/celery/publish.py b/bdc_collection_builder/celery/publish.py
index facc5de..0d80786 100644
--- a/bdc_collection_builder/celery/publish.py
+++ b/bdc_collection_builder/celery/publish.py
@@ -243,7 +243,22 @@ def publish_collection(scene_id: str, data: BaseCollection, collection: Collecti
     else:
         destination.mkdir(parents=True, exist_ok=True)
 
-    files = data.get_files(collection, path=file)
+    if file.endswith('.hdf'):
+        from ..collections.hdf import to_geotiff
+
+        destination.mkdir(parents=True, exist_ok=True)
+        item_result = to_geotiff(file, temporary_dir.name)
+        files = dict()
+
+        for _band, _geotiff in item_result.files.items():
+            destination_path = destination / Path(_geotiff).name
+            shutil.move(str(_geotiff), str(destination_path))
+            files[_band] = destination_path
+
+        file = destination
+        cloud_cover = item_result.cloud_cover
+    else:
+        files = data.get_files(collection, path=file)
 
     extra_assets = data.get_assets(collection, path=file)
 
diff --git a/bdc_collection_builder/collections/hdf.py b/bdc_collection_builder/collections/hdf.py
new file mode 100644
index 0000000..e0c0707
--- /dev/null
+++ b/bdc_collection_builder/collections/hdf.py
@@ -0,0 +1,109 @@
+#
+# This file is part of Brazil Data Cube Collection Builder.
+# Copyright (C) 2019-2020 INPE.
+#
+# Brazil Data Cube Collection Builder is free software; you can redistribute it and/or modify it
+# under the terms of the MIT License; see LICENSE file for more details.
+#
+
+"""Module to deal with the Hierarchical Data Format (HDF4/HDF5)."""
+
+from pathlib import Path
+from typing import NamedTuple
+
+from osgeo import gdal
+
+# Map of NumPy dtype names to the matching GDAL data types.
+DTYPES = dict(
+    uint8=gdal.GDT_Byte,
+    int16=gdal.GDT_Int16,
+    uint16=gdal.GDT_UInt16,
+    int32=gdal.GDT_Int32,
+    uint32=gdal.GDT_UInt32,
+)
+
+ItemResult = NamedTuple('ItemResult', [('files', dict), ('cloud_cover', float)])
+"""Type to represent the scenes extracted from a Hierarchical Data Format (HDF4/HDF5) file."""
+
+
+def to_geotiff(hdf_path: str, destination: str) -> ItemResult:
+    """Convert a Hierarchical Data Format (HDF4/HDF5) file to a set of GeoTIFF files.
+
+    Args:
+        hdf_path (str): Path to the HDF file to be extracted.
+        destination (str): The destination folder.
+
+    Note:
+        The output GeoTIFF files are not Cloud Optimized GeoTIFF (COG).
+
+    Tip:
+        You may use the utility :meth:`bdc_collection_builder.collections.utils.generate_cogs` to generate Cloud Optimized GeoTIFF files.
+
+    Raises:
+        IOError: When the input file is not a valid data set.
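+
+    Example:
+        A minimal usage sketch, assuming a downloaded MODIS scene (the paths below are hypothetical)::
+
+            result = to_geotiff('/data/MOD13Q1.A2021001.h13v10.006.hdf', '/tmp/mod13q1')
+            print(result.cloud_cover, sorted(result.files))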
+
+    Returns:
+        ItemResult: A struct containing the extracted files and the scene cloud cover.
+    """
+    data_set = gdal.Open(hdf_path)
+
+    if data_set is None:
+        raise IOError(f'Could not open {str(hdf_path)}')
+
+    base_name = Path(hdf_path).stem
+    metadata = data_set.GetMetadata()
+    cloud_cover = float(metadata.get('QAPERCENTCLOUDCOVER.1') or 0)
+    output_path = Path(destination)
+
+    geotiff_driver = gdal.GetDriverByName('GTiff')
+    files = dict()
+    # Band iterator index to retrieve the metadata value of each sub dataset
+    band_idx = 1
+    for data_set_name, _ in data_set.GetSubDatasets():
+        formal_name = metadata[f'PARAMETERNAME.{band_idx}']
+        # Keep only the variable name (e.g. '250m 16 days NDVI' -> 'NDVI').
+        band_name = '_'.join(formal_name.split(' ')[3:])
+
+        # Open the sub dataset in its own variable to avoid shadowing the HDF handle.
+        sub_data_set = gdal.Open(data_set_name)
+        band = sub_data_set.GetRasterBand(1)
+        array = band.ReadAsArray()
+
+        tiff_file = output_path / f'{base_name}_{band_name}.tif'
+
+        output_data_set = geotiff_driver.Create(
+            str(tiff_file),
+            sub_data_set.RasterXSize,
+            sub_data_set.RasterYSize,
+            1,
+            DTYPES[array.dtype.name]
+        )
+        output_data_set_band = output_data_set.GetRasterBand(1)
+        output_data_set.SetGeoTransform(sub_data_set.GetGeoTransform())
+        output_data_set.SetProjection(sub_data_set.GetProjection())
+        output_data_set.SetMetadata(metadata)
+        output_data_set_band.WriteArray(array)
+        output_data_set_band.SetNoDataValue(0)
+
+        files[band_name] = str(tiff_file)
+
+        # Release the GDAL handles to flush the raster to disk.
+        output_data_set_band = None
+        output_data_set = None
+
+        band_idx += 1
+
+    return ItemResult(files=files, cloud_cover=cloud_cover)
+
+
+def is_valid(file_path: str) -> bool:
+    """Check the HDF file integrity with the GDAL library."""
+    ds = gdal.Open(file_path)
+
+    return ds is not None
diff --git a/bdc_collection_builder/collections/utils.py b/bdc_collection_builder/collections/utils.py
index df2845b..6e42115 100644
--- a/bdc_collection_builder/collections/utils.py
+++ b/bdc_collection_builder/collections/utils.py
@@ -449,6 +449,10 @@ def is_valid_compressed_file(file_path: str) -> bool:
         return is_valid_tar(file_path)
     if file_path.endswith('.tar.gz'):
         return is_valid_tar_gz(file_path)
+    if file_path.endswith('.hdf'):
+        from .hdf import is_valid
+        return is_valid(file_path)
+
     return True
 
 def is_valid_tar(file_path: str) -> bool:

From c4c096eb3be8e31bde14d4801c311337baf84a48 Mon Sep 17 00:00:00 2001
From: raphaelrpl
Date: Mon, 26 Apr 2021 09:47:37 -0300
Subject: [PATCH 2/2] Improve setup.py to support GDAL as an optional extra

---
 setup.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index bc4dd86..066c6ad 100644
--- a/setup.py
+++ b/setup.py
@@ -39,7 +39,11 @@
 extras_require = {
     'docs': docs_require,
     'tests': tests_require,
-    'harmonization': harmonization_require
+    'harmonization': harmonization_require,
+    'gdal': [
+        'GDAL>=2.3',
+        'bdc-collectors @ git+git://github.com/brazil-data-cube/bdc-collectors.git@v0.2.1#egg=bdc-collectors[modis]',
+    ]
 }
 
 extras_require['all'] = [req for exts, reqs in extras_require.items() for req in reqs]
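
Usage sketch for the new module, assuming the gdal extra is installed
(e.g. pip install -e .[gdal]); the input path below is hypothetical:

    import os

    from bdc_collection_builder.collections.hdf import is_valid, to_geotiff

    hdf_file = '/data/MOD13Q1.A2021001.h13v10.006.hdf'  # hypothetical MODIS scene
    output_dir = '/tmp/mod13q1'
    os.makedirs(output_dir, exist_ok=True)  # GDAL does not create the folder itself

    # Validate the file before extracting one GeoTIFF per sub dataset (band).
    if is_valid(hdf_file):
        result = to_geotiff(hdf_file, output_dir)
        print(result.cloud_cover, sorted(result.files))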