From cf2386441149baf276f45162e7cb19325f1d3a3d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 9 May 2024 17:23:12 +0100 Subject: [PATCH 01/35] Add non-pickle azure_io_manager, revise cloud_assets to write to azure --- python/popgetter/__init__.py | 16 + python/popgetter/azure/__init__.py | 0 python/popgetter/azure/azure_io_manager.py | 326 +++++++++++++++++++++ python/popgetter/cloud_outputs.py | 135 ++++----- 4 files changed, 391 insertions(+), 86 deletions(-) create mode 100644 python/popgetter/azure/__init__.py create mode 100644 python/popgetter/azure/azure_io_manager.py diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index eccfaba..a49f36c 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -10,6 +10,8 @@ __all__ = ["__version__"] +import os + from dagster import ( AssetsDefinition, AssetSelection, @@ -26,8 +28,10 @@ from dagster._core.definitions.unresolved_asset_job_definition import ( UnresolvedAssetJobDefinition, ) +from dagster_azure.adls2 import adls2_resource from popgetter import assets, cloud_outputs +from popgetter.azure.azure_io_manager import adls2_io_manager all_assets: Sequence[AssetsDefinition | SourceAsset | CacheableAssetsDefinition] = [ *load_assets_from_package_module(assets.us, group_name="us"), @@ -65,6 +69,18 @@ "staging_res": StagingDirResource( staging_dir=str(Path(__file__).parent.joinpath("staging_dir").resolve()) ), + "azure_io_manager": adls2_io_manager.configured( + { + "adls2_file_system": os.getenv("AZURE_CONTAINER"), + "adls2_prefix": os.getenv("AZURE_DIRECTORY"), + } + ), + "adls2": adls2_resource.configured( + { + "storage_account": os.getenv("AZURE_STORAGE_ACCOUNT"), + "credential": {"sas": os.getenv("SAS_TOKEN")}, + } + ), }, jobs=[job_be, job_us, job_uk], ) diff --git a/python/popgetter/azure/__init__.py b/python/popgetter/azure/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/popgetter/azure/azure_io_manager.py b/python/popgetter/azure/azure_io_manager.py new file mode 100644 index 0000000..b04561e --- /dev/null +++ b/python/popgetter/azure/azure_io_manager.py @@ -0,0 +1,326 @@ +""" +ADLS2IOManager without pickling based on: +https://docs.dagster.io/_modules/dagster_azure/adls2/io_manager#ADLS2PickleIOManager + +""" +from __future__ import annotations + +from collections.abc import Iterator +from contextlib import contextmanager +from typing import Any + +from dagster import ( + InputContext, + OutputContext, + ResourceDependency, + io_manager, +) +from dagster import ( + _check as check, +) +from dagster._annotations import deprecated +from dagster._config.pythonic_config import ConfigurableIOManager +from dagster._core.storage.io_manager import dagster_maintained_io_manager +from dagster._core.storage.upath_io_manager import UPathIOManager +from dagster._utils.cached_method import cached_method +from dagster_azure.adls2.resources import ADLS2Resource +from dagster_azure.adls2.utils import ResourceNotFoundError +from pydantic import Field +from upath import UPath + +# Note: this might need to be longer for some large files, but there is an issue with header's not matching +_LEASE_DURATION = 60 # One minute + +# Set connection timeout to be larger than default: +# https://github.com/Azure/azure-sdk-for-python/issues/26993#issuecomment-1289799860 +_CONNECTION_TIMEOUT = 6000 + + +class ADLS2InnerIOManager(UPathIOManager): + def __init__( + self, + file_system: Any, + adls2_client: Any, + blob_client: Any, + lease_client_constructor: Any, + prefix: str = 
"dagster", + ): + self.adls2_client = adls2_client + self.file_system_client = self.adls2_client.get_file_system_client(file_system) + # We also need a blob client to handle copying as ADLS doesn't have a copy API yet + self.blob_client = blob_client + self.blob_container_client = self.blob_client.get_container_client(file_system) + self.prefix = check.str_param(prefix, "prefix") + + self.lease_client_constructor = lease_client_constructor + self.lease_duration = _LEASE_DURATION + self.file_system_client.get_file_system_properties() + super().__init__(base_path=UPath(self.prefix)) + + def get_op_output_relative_path( + self, context: InputContext | OutputContext + ) -> UPath: + parts = context.get_identifier() + run_id = parts[0] + output_parts = parts[1:] + return UPath("storage", run_id, "files", *output_parts) + + def get_loading_input_log_message(self, path: UPath) -> str: + return f"Loading ADLS2 object from: {self._uri_for_path(path)}" + + def get_writing_output_log_message(self, path: UPath) -> str: + return f"Writing ADLS2 object at: {self._uri_for_path(path)}" + + def unlink(self, path: UPath) -> None: + file_client = self.file_system_client.get_file_client(path.as_posix()) + with self._acquire_lease(file_client, is_rm=True) as lease: + file_client.delete_file(lease=lease, recursive=True) + + def make_directory(self, path: UPath) -> None: # noqa: ARG002 + # It is not necessary to create directories in ADLS2 + return None + + def path_exists(self, path: UPath) -> bool: + try: + self.file_system_client.get_file_client( + path.as_posix() + ).get_file_properties() + except ResourceNotFoundError: + return False + return True + + def _uri_for_path(self, path: UPath, protocol: str = "abfss://") -> str: + return "{protocol}{filesystem}@{account}.dfs.core.windows.net/{key}".format( + protocol=protocol, + filesystem=self.file_system_client.file_system_name, + account=self.file_system_client.account_name, + key=path.as_posix(), + ) + + @contextmanager + def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]: + lease_client = self.lease_client_constructor(client=client) + try: + lease_client.acquire(lease_duration=self.lease_duration) + yield lease_client.id + finally: + # cannot release a lease on a file that no longer exists, so need to check + if not is_rm: + lease_client.release() + + def load_from_path(self, context: InputContext, path: UPath) -> bytes | None: + if context.dagster_type.typing_type == type(None): + return None + file = self.file_system_client.get_file_client(path.as_posix()) + stream = file.download_file() + # return pickle.loads(stream.readall()) + return stream.readall() + + def dump_to_path(self, context: OutputContext, obj: bytes, path: UPath) -> None: + if self.path_exists(path): + context.log.warning(f"Removing existing ADLS2 key: {path}") # noqa: G004 + self.unlink(path) + + # pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL) + pickled_obj = obj + file = self.file_system_client.create_file(path.as_posix()) + with self._acquire_lease(file) as lease: + # Note: chunk_size can also be specified, see API for Azure SDK for Python, DataLakeFileClient: + # https://learn.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.datalakefileclient + file.upload_data( + pickled_obj, + lease=lease, + overwrite=True, + connection_timeout=_CONNECTION_TIMEOUT, + ) + + +class ADLS2IOManager(ConfigurableIOManager): + """Persistent IO manager using Azure Data Lake Storage Gen2 for storage. + + Serializes objects via pickling. 
Suitable for objects storage for distributed executors, so long + as each execution node has network connectivity and credentials for ADLS and the backing + container. + + Assigns each op output to a unique filepath containing run ID, step key, and output name. + Assigns each asset to a single filesystem path, at "/". If the asset key + has multiple components, the final component is used as the name of the file, and the preceding + components as parent directories under the base_dir. + + Subsequent materializations of an asset will overwrite previous materializations of that asset. + With a base directory of "/my/base/path", an asset with key + `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory + with path "/my/base/path/one/two/". + + Example usage: + + 1. Attach this IO manager to a set of assets. + + .. code-block:: python + + from dagster import Definitions, asset + from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource + + + @asset + def asset1(): + # create df ... + return df + + + @asset + def asset2(asset1): + return df[:5] + + + defs = Definitions( + assets=[asset1, asset2], + resources={ + "io_manager": ADLS2PickleIOManager( + adls2_file_system="my-cool-fs", adls2_prefix="my-cool-prefix" + ), + "adls2": adls2_resource, + }, + ) + + + 2. Attach this IO manager to your job to make it available to your ops. + + .. code-block:: python + + from dagster import job + from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource + + + @job( + resource_defs={ + "io_manager": ADLS2PickleIOManager( + adls2_file_system="my-cool-fs", adls2_prefix="my-cool-prefix" + ), + "adls2": adls2_resource, + }, + ) + def my_job(): + ... + """ + + adls2: ResourceDependency[ADLS2Resource] + adls2_file_system: str = Field(description="ADLS Gen2 file system name.") + adls2_prefix: str = Field( + default="dagster", description="ADLS Gen2 file system prefix to write to." + ) + + @classmethod + def _is_dagster_maintained(cls) -> bool: + return True + + @property + @cached_method + def _internal_io_manager(self) -> ADLS2InnerIOManager: + return ADLS2InnerIOManager( + self.adls2_file_system, + self.adls2.adls2_client, + self.adls2.blob_client, + self.adls2.lease_client_constructor, + self.adls2_prefix, + ) + + def load_input(self, context: InputContext) -> Any: + return self._internal_io_manager.load_input(context) + + def handle_output(self, context: OutputContext, obj: Any) -> None: + self._internal_io_manager.handle_output(context, obj) + + +@deprecated( + breaking_version="2.0", + additional_warn_text="Please use GCSPickleIOManager instead.", +) +class ConfigurableADLS2IOManager(ADLS2IOManager): + """Renamed to ADLS2PickleIOManager. See ADLS2PickleIOManager for documentation.""" + + +@dagster_maintained_io_manager +@io_manager( + config_schema=ADLS2IOManager.to_config_schema(), + required_resource_keys={"adls2"}, +) +def adls2_io_manager(init_context): + """Persistent IO manager using Azure Data Lake Storage Gen2 for storage. + + Serializes objects via pickling. Suitable for objects storage for distributed executors, so long + as each execution node has network connectivity and credentials for ADLS and the backing + container. + + Assigns each op output to a unique filepath containing run ID, step key, and output name. + Assigns each asset to a single filesystem path, at "/". If the asset key + has multiple components, the final component is used as the name of the file, and the preceding + components as parent directories under the base_dir. 
+ + Subsequent materializations of an asset will overwrite previous materializations of that asset. + With a base directory of "/my/base/path", an asset with key + `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory + with path "/my/base/path/one/two/". + + Example usage: + + 1. Attach this IO manager to a set of assets. + + .. code-block:: python + + from dagster import Definitions, asset + from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource + + + @asset + def asset1(): + # create df ... + return df + + + @asset + def asset2(asset1): + return df[:5] + + + defs = Definitions( + assets=[asset1, asset2], + resources={ + "io_manager": adls2_pickle_io_manager.configured( + {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"} + ), + "adls2": adls2_resource, + }, + ) + + + 2. Attach this IO manager to your job to make it available to your ops. + + .. code-block:: python + + from dagster import job + from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource + + + @job( + resource_defs={ + "io_manager": adls2_pickle_io_manager.configured( + {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"} + ), + "adls2": adls2_resource, + }, + ) + def my_job(): + ... + """ + adls_resource = init_context.resources.adls2 + adls2_client = adls_resource.adls2_client + blob_client = adls_resource.blob_client + lease_client = adls_resource.lease_client_constructor + return ADLS2InnerIOManager( + init_context.resource_config["adls2_file_system"], + adls2_client, + blob_client, + lease_client, + init_context.resource_config.get("adls2_prefix"), + ) diff --git a/python/popgetter/cloud_outputs.py b/python/popgetter/cloud_outputs.py index e923c0b..d1095dd 100644 --- a/python/popgetter/cloud_outputs.py +++ b/python/popgetter/cloud_outputs.py @@ -1,11 +1,14 @@ from __future__ import annotations +import os +import tempfile +import uuid from pathlib import Path import geopandas as gpd +import pandas as pd from dagster import ( AssetKey, - AssetOut, AssetSelection, Config, DefaultSensorStatus, @@ -20,22 +23,21 @@ define_asset_job, load_assets_from_current_module, local_file_manager, - multi_asset, multi_asset_sensor, ) from dagster_azure.adls2 import adls2_file_manager from icecream import ic from slugify import slugify +# See https://docs.dagster.io/_apidocs/libraries/dagster-azure#dagster_azure.adls2.adls2_file_manager resources = { "DEV": {"publishing_file_manager": local_file_manager}, "PRODUCTION": { - "publishing_file_manager": adls2_file_manager - # See https://docs.dagster.io/_apidocs/libraries/dagster-azure#dagster_azure.adls2.adls2_file_manager - # storage_account="tbc", # The storage account name. - # credential={}, # The credential used to authenticate the connection. 
- # adls2_file_system="tbc", - # adls2_prefix="tbc", + "publishing_file_manager": adls2_file_manager, + "storage_account": "popgetter", + "credential": os.getenv("SAS_TOKEN"), + "adls2_file_system": "tbc", + "adls2_prefix": "tbc", }, } @@ -80,7 +82,10 @@ def __init__(self, **kwargs): # Belgium Geography + tables | ( AssetSelection.groups("be") - & AssetSelection.keys("be/source_table").downstream(include_self=False) + # Note: include_self=True as the geodataframe is currently not output for + # derived tables but only for source table: + # be-source-table-https-statbel-fgov-be-node-4726 + & AssetSelection.keys("be/source_table").downstream(include_self=True) ) ) @@ -226,84 +231,44 @@ def upstream_df(context): raise ValueError(err_msg) -@multi_asset( - outs={ - "parquet_path": AssetOut(is_required=False), - "flatgeobuff_path": AssetOut(is_required=False), - "geojson_seq_path": AssetOut(is_required=False), - }, - can_subset=True, - required_resource_keys={"staging_res"}, - partitions_def=publishing_partition, -) -def cartography_in_cloud_formats(context, upstream_df): - """ " - Returns dict of parquet, FlatGeobuf and GeoJSONSeq paths - """ - staging_res = context.resources.staging_res - # ic(staging_res) - # ic(staging_res.staging_dir) - staging_dir_str = staging_res.staging_dir - - staging_dir = Path(staging_dir_str) - - # Extract the selected keys from the context - selected_keys = [ - key.to_user_string() for key in context.op_execution_context.selected_asset_keys - ] - - def _parquet_helper(output_path): - upstream_df.to_parquet(output_path) - - def _flatgeobuff_helper(output_path): - if output_path.exists(): - if output_path.is_dir(): - # Assuming that the directory is only one level deep - for file in output_path.iterdir(): - file.unlink() - output_path.rmdir() - else: - output_path.unlink() - upstream_df.to_file(output_path, driver="FlatGeobuf") - - def _geojson_seq_helper(output_path): - upstream_df.to_file(output_path, driver="GeoJSONSeq") - - # helper functions - format_helpers = { - "parquet_path": ("parquet", _parquet_helper), - "flatgeobuff_path": ("flatgeobuff", _flatgeobuff_helper), - "geojson_seq_path": ("geojsonseq", _geojson_seq_helper), - } - - for output_type in selected_keys: - # output_type = output_asset_key.to_user_string() - context.log.debug(ic(f"yielding {output_type}")) - extension, helper_function = format_helpers[output_type] - output_file_base = context.partition_key - output_path = staging_dir / f"{output_file_base}.{extension}" - output_path.parent.mkdir(exist_ok=True) - output_path.touch(exist_ok=True) - helper_function(output_path) - - yield Output( - value=output_path, - output_name=output_type, - metadata={ - "file_type": output_type, - "file_path": output_path, - }, - ) +@asset(io_manager_key="azure_io_manager") +def test_azure(): + return pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}).to_parquet(None) + + +@asset(io_manager_key="azure_io_manager") +def test_azure_large(): + return b"0" * (45 * 1024 * 1024 + 100) - ic("end of cartography_in_cloud_formats") +def df_to_bytes(df: gpd.GeoDataFrame, output_type: str) -> bytes: + tmp = tempfile.NamedTemporaryFile(prefix=str(uuid.uuid4())) + if output_type.lower() == "parquet": + df.to_parquet(tmp.name + ".parquet") + elif output_type.lower() == "flatgeobuf": + df.to_file(tmp.name + ".flatgeobuf", driver="FlatGeobuf") + elif output_type.lower() == "geojsonseq": + df.to_file(tmp.name + ".geojsonseq", driver="GeoJSONSeq") + else: + value_error: str = f"'{output_type}' is not currently supported." 
+ raise ValueError(value_error) + with Path(tmp.name).open(mode="rb") as f: + return f.read() -# def upload_cartography_to_cloud(context, cartography_in_cloud_formats): -# """ -# Uploads the cartography files to the cloud. -# """ -# log_msg = f"Uploading cartography to the cloud - {cartography_in_cloud_formats}" -# context.log.info(log_msg) + +@asset(io_manager_key="azure_io_manager", partitions_def=publishing_partition) +def parquet(context, upstream_df): # noqa: ARG001 + return df_to_bytes(upstream_df, "parquet") + + +@asset(io_manager_key="azure_io_manager", partitions_def=publishing_partition) +def flatgeobuf(context, upstream_df): # noqa: ARG001 + return df_to_bytes(upstream_df, "flatgeobuf") + + +@asset(io_manager_key="azure_io_manager", partitions_def=publishing_partition) +def geojsonseq(context, upstream_df): # noqa: ARG001 + return df_to_bytes(upstream_df, "geojsonseq") # Not working yet - need to figure out questions about how we run docker @@ -312,7 +277,6 @@ def _geojson_seq_helper(output_path): # def generate_pmtiles(context, geojson_seq_path): # client = docker.from_env() # mount_folder = Path(geojson_seq_path).parent.resolve() - # container = client.containers.run( # "stuartlynn/tippecanoe:latest", # "tippecanoe -o tracts.pmtiles tracts.geojsonseq", @@ -320,7 +284,6 @@ def _geojson_seq_helper(output_path): # detach=True, # remove=True, # ) - # output = container.attach(stdout=True, stream=True, logs=True) # for line in output: # context.log.info(line) From 274a785af82b2a58a8c4c5e0141e3015246aec5d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 9 May 2024 17:44:02 +0100 Subject: [PATCH 02/35] Add publishing_io_manager resources by ENV --- python/popgetter/__init__.py | 32 ++++++++++++++++++++----------- python/popgetter/cloud_outputs.py | 28 +++++---------------------- 2 files changed, 26 insertions(+), 34 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index a49f36c..12c8d7c 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -16,6 +16,7 @@ AssetsDefinition, AssetSelection, Definitions, + FilesystemIOManager, PipesSubprocessClient, SourceAsset, define_asset_job, @@ -59,17 +60,9 @@ description="Downloads UK data.", ) - -defs: Definitions = Definitions( - assets=all_assets, - schedules=[], - sensors=[cloud_outputs.country_outputs_sensor], - resources={ - "pipes_subprocess_client": PipesSubprocessClient(), - "staging_res": StagingDirResource( - staging_dir=str(Path(__file__).parent.joinpath("staging_dir").resolve()) - ), - "azure_io_manager": adls2_io_manager.configured( +resources_by_env = { + "prod": { + "publishing_io_manager": adls2_io_manager.configured( { "adls2_file_system": os.getenv("AZURE_CONTAINER"), "adls2_prefix": os.getenv("AZURE_DIRECTORY"), @@ -82,5 +75,22 @@ } ), }, + "dev": {"publishing_io_manager": FilesystemIOManager()}, +} + +resources = { + "pipes_subprocess_client": PipesSubprocessClient(), + "staging_res": StagingDirResource( + staging_dir=str(Path(__file__).parent.joinpath("staging_dir").resolve()) + ), +} + +resources.update(resources_by_env[os.getenv("ENV", "dev")]) + +defs: Definitions = Definitions( + assets=all_assets, + schedules=[], + sensors=[cloud_outputs.country_outputs_sensor], + resources=resources, jobs=[job_be, job_us, job_uk], ) diff --git a/python/popgetter/cloud_outputs.py b/python/popgetter/cloud_outputs.py index d1095dd..92f8980 100644 --- a/python/popgetter/cloud_outputs.py +++ b/python/popgetter/cloud_outputs.py @@ -1,6 +1,5 @@ from __future__ import 
annotations -import os import tempfile import uuid from pathlib import Path @@ -13,7 +12,6 @@ Config, DefaultSensorStatus, DynamicPartitionsDefinition, - EnvVar, Noneable, Output, RunConfig, @@ -22,27 +20,11 @@ asset, define_asset_job, load_assets_from_current_module, - local_file_manager, multi_asset_sensor, ) -from dagster_azure.adls2 import adls2_file_manager from icecream import ic from slugify import slugify -# See https://docs.dagster.io/_apidocs/libraries/dagster-azure#dagster_azure.adls2.adls2_file_manager -resources = { - "DEV": {"publishing_file_manager": local_file_manager}, - "PRODUCTION": { - "publishing_file_manager": adls2_file_manager, - "storage_account": "popgetter", - "credential": os.getenv("SAS_TOKEN"), - "adls2_file_system": "tbc", - "adls2_prefix": "tbc", - }, -} - -current_resource = resources[EnvVar("DEV")] - cloud_assets = load_assets_from_current_module(group_name="cloud_assets") @@ -231,12 +213,12 @@ def upstream_df(context): raise ValueError(err_msg) -@asset(io_manager_key="azure_io_manager") +@asset(io_manager_key="publishing_io_manager") def test_azure(): return pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}).to_parquet(None) -@asset(io_manager_key="azure_io_manager") +@asset(io_manager_key="publishing_io_manager") def test_azure_large(): return b"0" * (45 * 1024 * 1024 + 100) @@ -256,17 +238,17 @@ def df_to_bytes(df: gpd.GeoDataFrame, output_type: str) -> bytes: return f.read() -@asset(io_manager_key="azure_io_manager", partitions_def=publishing_partition) +@asset(io_manager_key="publishing_io_manager", partitions_def=publishing_partition) def parquet(context, upstream_df): # noqa: ARG001 return df_to_bytes(upstream_df, "parquet") -@asset(io_manager_key="azure_io_manager", partitions_def=publishing_partition) +@asset(io_manager_key="publishing_io_manager", partitions_def=publishing_partition) def flatgeobuf(context, upstream_df): # noqa: ARG001 return df_to_bytes(upstream_df, "flatgeobuf") -@asset(io_manager_key="azure_io_manager", partitions_def=publishing_partition) +@asset(io_manager_key="publishing_io_manager", partitions_def=publishing_partition) def geojsonseq(context, upstream_df): # noqa: ARG001 return df_to_bytes(upstream_df, "geojsonseq") From ee6f37a91a0d7a1bd917709a9fe2761f2ade91d1 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Thu, 9 May 2024 17:49:49 +0100 Subject: [PATCH 03/35] Remove references to pickling from popgetter ADLS IO manager --- python/popgetter/azure/azure_io_manager.py | 34 +++++++++------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/python/popgetter/azure/azure_io_manager.py b/python/popgetter/azure/azure_io_manager.py index b04561e..a42121c 100644 --- a/python/popgetter/azure/azure_io_manager.py +++ b/python/popgetter/azure/azure_io_manager.py @@ -18,7 +18,6 @@ from dagster import ( _check as check, ) -from dagster._annotations import deprecated from dagster._config.pythonic_config import ConfigurableIOManager from dagster._core.storage.io_manager import dagster_maintained_io_manager from dagster._core.storage.upath_io_manager import UPathIOManager @@ -113,7 +112,6 @@ def load_from_path(self, context: InputContext, path: UPath) -> bytes | None: return None file = self.file_system_client.get_file_client(path.as_posix()) stream = file.download_file() - # return pickle.loads(stream.readall()) return stream.readall() def dump_to_path(self, context: OutputContext, obj: bytes, path: UPath) -> None: @@ -121,14 +119,12 @@ def dump_to_path(self, context: OutputContext, obj: bytes, path: UPath) -> None: 
context.log.warning(f"Removing existing ADLS2 key: {path}") # noqa: G004 self.unlink(path) - # pickled_obj = pickle.dumps(obj, PICKLE_PROTOCOL) - pickled_obj = obj file = self.file_system_client.create_file(path.as_posix()) with self._acquire_lease(file) as lease: # Note: chunk_size can also be specified, see API for Azure SDK for Python, DataLakeFileClient: # https://learn.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.datalakefileclient file.upload_data( - pickled_obj, + obj, lease=lease, overwrite=True, connection_timeout=_CONNECTION_TIMEOUT, @@ -159,7 +155,8 @@ class ADLS2IOManager(ConfigurableIOManager): .. code-block:: python from dagster import Definitions, asset - from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource + from dagster_azure.adls2 import adls2_resource + from popgetter.azure import ADLS2IOManager @asset @@ -176,7 +173,7 @@ def asset2(asset1): defs = Definitions( assets=[asset1, asset2], resources={ - "io_manager": ADLS2PickleIOManager( + "io_manager": ADLS2IOManager( adls2_file_system="my-cool-fs", adls2_prefix="my-cool-prefix" ), "adls2": adls2_resource, @@ -189,12 +186,13 @@ def asset2(asset1): .. code-block:: python from dagster import job - from dagster_azure.adls2 import ADLS2PickleIOManager, adls2_resource + from dagster_azure.adls2 import adls2_resource + from popgetter.azure import ADLS2IOManager @job( resource_defs={ - "io_manager": ADLS2PickleIOManager( + "io_manager": ADLS2IOManager( adls2_file_system="my-cool-fs", adls2_prefix="my-cool-prefix" ), "adls2": adls2_resource, @@ -232,14 +230,6 @@ def handle_output(self, context: OutputContext, obj: Any) -> None: self._internal_io_manager.handle_output(context, obj) -@deprecated( - breaking_version="2.0", - additional_warn_text="Please use GCSPickleIOManager instead.", -) -class ConfigurableADLS2IOManager(ADLS2IOManager): - """Renamed to ADLS2PickleIOManager. See ADLS2PickleIOManager for documentation.""" - - @dagster_maintained_io_manager @io_manager( config_schema=ADLS2IOManager.to_config_schema(), @@ -269,7 +259,8 @@ def adls2_io_manager(init_context): .. code-block:: python from dagster import Definitions, asset - from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource + from dagster_azure.adls2 import adls2_resource + from popgetter.azure import adls2_io_manager @asset @@ -286,7 +277,7 @@ def asset2(asset1): defs = Definitions( assets=[asset1, asset2], resources={ - "io_manager": adls2_pickle_io_manager.configured( + "io_manager": adls2_io_manager.configured( {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"} ), "adls2": adls2_resource, @@ -299,12 +290,13 @@ def asset2(asset1): .. 
code-block:: python from dagster import job - from dagster_azure.adls2 import adls2_pickle_io_manager, adls2_resource + from dagster_azure.adls2 import adls2_resource + from popgetter.azure import adls2_io_manager @job( resource_defs={ - "io_manager": adls2_pickle_io_manager.configured( + "io_manager": adls2_io_manager.configured( {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"} ), "adls2": adls2_resource, From f01ec92d7c40702b2e3096df6092c650d97d3150 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 10 May 2024 16:17:09 +0100 Subject: [PATCH 04/35] Add IO managers for local and Azure Co-authored-by: Jonathan Yong --- python/popgetter/__init__.py | 20 ++++++- python/popgetter/assets/be/census_tables.py | 6 +- python/popgetter/azure/__init__.py | 3 + python/popgetter/io_managers/__init__.py | 36 +++++++++++ python/popgetter/io_managers/azure.py | 66 +++++++++++++++++++++ python/popgetter/io_managers/local.py | 26 ++++++++ 6 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 python/popgetter/io_managers/__init__.py create mode 100644 python/popgetter/io_managers/azure.py create mode 100644 python/popgetter/io_managers/local.py diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 12c8d7c..3be695f 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -3,6 +3,7 @@ from collections.abc import Sequence from pathlib import Path +from popgetter.io_managers.local import LocalTopLevelMetadataIOManager from popgetter.utils import StagingDirResource __version__ = "0.1.0" @@ -16,7 +17,6 @@ AssetsDefinition, AssetSelection, Definitions, - FilesystemIOManager, PipesSubprocessClient, SourceAsset, define_asset_job, @@ -75,7 +75,7 @@ } ), }, - "dev": {"publishing_io_manager": FilesystemIOManager()}, + "dev": {"publishing_io_manager": LocalTopLevelMetadataIOManager()}, } resources = { @@ -94,3 +94,19 @@ resources=resources, jobs=[job_be, job_us, job_uk], ) + + +# class AzureTopLevelMetadataIOManagerWithPartion(ADLS2IOManager): +# +# class AzureTopLevelMetadataIOManager(ADLS2IOManager): +# extension = "parquet" + + +# class FlatGeobufIOManager(ADLS2IOManager): +# extension = "parquet" + +# def _get_path(self, context: InputContext | OutputContext) -> "UPath": +# ... + +# def _get_path_without_extension(self, context: InputContext | OutputContext) -> "UPath": +# ... diff --git a/python/popgetter/assets/be/census_tables.py b/python/popgetter/assets/be/census_tables.py index 2212e3a..5aa546b 100644 --- a/python/popgetter/assets/be/census_tables.py +++ b/python/popgetter/assets/be/census_tables.py @@ -57,12 +57,12 @@ dataset_node_partition = DynamicPartitionsDefinition(name="dataset_nodes") -@asset(key_prefix=asset_prefix) -def get_publisher_metadata(): +@asset(key_prefix=asset_prefix, io_manager_key="publishing_io_manager") +def data_publisher(): """ Returns a DataPublisher of metadata about the publisher. 
""" - return publisher + return publisher.to_dataframe() @asset(key_prefix=asset_prefix) diff --git a/python/popgetter/azure/__init__.py b/python/popgetter/azure/__init__.py index e69de29..1871aba 100644 --- a/python/popgetter/azure/__init__.py +++ b/python/popgetter/azure/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from .azure_io_manager import * diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py new file mode 100644 index 0000000..5c52ddd --- /dev/null +++ b/python/popgetter/io_managers/__init__.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +import pandas as pd +from dagster import ConfigurableIOManager, InputContext, OutputContext +from icecream import ic +from upath import UPath + + +class TopLevelMetadataIOManager(ConfigurableIOManager): + # Mapping of asset keys to output filenames + output_filenames = { + "country_metadata": "country_metadata.parquet", + "data_publisher": "data_publishers.parquet", + "source_data_release": "source_data_releases.parquet", + # New metadata struct, not yet defined + # "geography_release": "geography_releases.parquet", + # Figure out how to use this when the IOManager class is defined + # "geometries": "geometries/{}", # this.format(filename) + # "metrics": "metrics/{}" # this.format(filename) + } + + def get_relative_path(self, context: OutputContext) -> UPath: + try: + ic(context.asset_key.path) + path_components = list(context.asset_key.path) + path_components[-1] = self.output_filenames[path_components[-1]] + return UPath("/".join(path_components)) + except KeyError: + err_msg = f"Only the asset keys {','.join(self.output_filenames.keys())} are compatible with this" + raise ValueError(err_msg) + + def to_binary(self, obj: pd.DataFrame) -> bytes: + return obj.to_parquet(None) + + def load_input(self, context: InputContext) -> pd.DataFrame: + raise RuntimeError("This IOManager is only for writing outputs") diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py new file mode 100644 index 0000000..6c8e623 --- /dev/null +++ b/python/popgetter/io_managers/azure.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import pandas as pd +from dagster import InputContext, MultiPartitionKey, OutputContext +from upath import UPath + +from ..azure import ADLS2InnerIOManager +from . 
import TopLevelMetadataIOManager + + +class AzureTopLevelMetadataIOManager(TopLevelMetadataIOManager, ADLS2InnerIOManager): + extension: str = ".parquet" + + def handle_output(self, context: OutputContext, obj: pd.DataFrame) -> None: + rel_path = self.get_relative_path(context) + full_path = self.get_base_path() / rel_path + full_path.parent.mkdir(parents=True, exist_ok=True) + context.add_output_metadata(metadata={"parquet_path": str(full_path)}) + self.dump_to_path(context, self.to_binary(obj), full_path) + + def _get_path_without_extension( + self, context: InputContext | OutputContext + ) -> UPath: + if context.has_asset_key: + context_path = self.get_asset_relative_path(context) + else: + # we are dealing with an op output + context_path = self.get_op_output_relative_path(context) + + return self._base_path.joinpath(context_path) + + def _get_paths_for_partitions( + self, context: InputContext | OutputContext + ) -> dict[str, UPath]: + """Returns a dict of partition_keys into I/O paths for a given context.""" + if not context.has_asset_partitions: + raise TypeError( + f"Detected {context.dagster_type.typing_type} input type " + "but the asset is not partitioned" + ) + + def _formatted_multipartitioned_path(partition_key: MultiPartitionKey) -> str: + ordered_dimension_keys = [ + key[1] + for key in sorted( + partition_key.keys_by_dimension.items(), key=lambda x: x[0] + ) + ] + return "/".join(ordered_dimension_keys) + + formatted_partition_keys = { + partition_key: ( + _formatted_multipartitioned_path(partition_key) + if isinstance(partition_key, MultiPartitionKey) + else partition_key + ) + for partition_key in context.asset_partition_keys + } + + asset_path = self._get_path_without_extension(context) + return { + partition_key: self._with_extension( + self.get_path_for_partition(context, asset_path, partition) + ) + for partition_key, partition in formatted_partition_keys.items() + } diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py new file mode 100644 index 0000000..b18ee57 --- /dev/null +++ b/python/popgetter/io_managers/local.py @@ -0,0 +1,26 @@ +from __future__ import annotations + +import os + +import pandas as pd +from dagster import OutputContext +from upath import UPath + +from . 
import TopLevelMetadataIOManager + + +class LocalTopLevelMetadataIOManager(TopLevelMetadataIOManager): + dagster_home: str | None = os.getenv("DAGSTER_HOME") + + def get_base_path_local(self) -> UPath: + if not self.dagster_home: + raise ValueError("The DAGSTER_HOME environment variable must be set.") + return UPath(self.dagster_home) / "cloud_outputs" + + def handle_output(self, context: OutputContext, obj: pd.DataFrame) -> None: + rel_path = self.get_relative_path(context) + full_path = self.get_base_path() / rel_path + full_path.parent.mkdir(parents=True, exist_ok=True) + context.add_output_metadata(metadata={"parquet_path": str(full_path)}) + with full_path.open("wb") as file: + file.write(self.to_binary(obj)) From 03380bee6370b69c2c62b4644e69f9f455ad1986 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 10 May 2024 16:39:06 +0100 Subject: [PATCH 05/35] Fix for updated metadata structure --- pyproject.toml | 2 ++ python/popgetter/assets/be/census_tables.py | 6 ++++-- python/popgetter/io_managers/__init__.py | 2 +- python/popgetter/io_managers/local.py | 2 +- python/popgetter/metadata.py | 13 +++++++++++++ 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5ff16c2..00743ca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,8 @@ dependencies = [ "jcs >=0.2.1", # For generating IDs from class attributes ] + + [project.optional-dependencies] test = [ "pytest >=6", diff --git a/python/popgetter/assets/be/census_tables.py b/python/popgetter/assets/be/census_tables.py index 34fe6c0..5f455c0 100644 --- a/python/popgetter/assets/be/census_tables.py +++ b/python/popgetter/assets/be/census_tables.py @@ -24,6 +24,7 @@ from popgetter.metadata import ( DataPublisher, SourceDataRelease, + metadata_to_dataframe, ) from popgetter.utils import extract_main_file_from_zip, markdown_from_plot @@ -47,11 +48,12 @@ collection_period_end=date(2015, 10, 22), expect_next_update=date(2022, 1, 1), url="https://statbel.fgov.be/en/open-data", - data_publisher_id=publisher.id, description="TBC", geography_file="TBC", geography_level="Municipality", + data_publisher_id=publisher.id, ) +source.update_forward_refs() dataset_node_partition = DynamicPartitionsDefinition(name="dataset_nodes") @@ -61,7 +63,7 @@ def data_publisher(): """ Returns a DataPublisher of metadata about the publisher. 
""" - return publisher.to_dataframe() + return metadata_to_dataframe([publisher]) @asset(key_prefix=asset_prefix) diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 5c52ddd..1c3ad68 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -8,7 +8,7 @@ class TopLevelMetadataIOManager(ConfigurableIOManager): # Mapping of asset keys to output filenames - output_filenames = { + output_filenames: dict[str, str] = { "country_metadata": "country_metadata.parquet", "data_publisher": "data_publishers.parquet", "source_data_release": "source_data_releases.parquet", diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py index b18ee57..ce25342 100644 --- a/python/popgetter/io_managers/local.py +++ b/python/popgetter/io_managers/local.py @@ -19,7 +19,7 @@ def get_base_path_local(self) -> UPath: def handle_output(self, context: OutputContext, obj: pd.DataFrame) -> None: rel_path = self.get_relative_path(context) - full_path = self.get_base_path() / rel_path + full_path = self.get_base_path_local() / rel_path full_path.parent.mkdir(parents=True, exist_ok=True) context.add_output_metadata(metadata={"parquet_path": str(full_path)}) with full_path.open("wb") as file: diff --git a/python/popgetter/metadata.py b/python/popgetter/metadata.py index 13aa197..964e35f 100644 --- a/python/popgetter/metadata.py +++ b/python/popgetter/metadata.py @@ -5,6 +5,7 @@ from typing import Self import jcs +import pandas as pd from pydantic import BaseModel, Field, computed_field, model_validator @@ -24,6 +25,18 @@ def hash_class_vars(class_instance): return sha256(jcs.canonicalize(variables)).hexdigest() +def metadata_to_dataframe( + metadata_instances: list[ + CountryMetadata | DataPublisher | SourceDataRelease | MetricMetadata + ], +): + """ + Convert a list of metadata instances to a pandas DataFrame. Any of the four + metadata classes defined in this module can be used here. 
+ """ + return pd.DataFrame([vars(md) | {"id": md.id} for md in metadata_instances]) + + class CountryMetadata(BaseModel): @computed_field @property From c8e497725cab7b6e7a37cd3b17b5866ab3ee2111 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 10 May 2024 18:08:48 +0100 Subject: [PATCH 06/35] Revise azure_io_manager _get_path() method --- python/popgetter/azure/azure_io_manager.py | 40 ++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/python/popgetter/azure/azure_io_manager.py b/python/popgetter/azure/azure_io_manager.py index a42121c..2631ff1 100644 --- a/python/popgetter/azure/azure_io_manager.py +++ b/python/popgetter/azure/azure_io_manager.py @@ -9,6 +9,7 @@ from contextlib import contextmanager from typing import Any +import pandas as pd from dagster import ( InputContext, OutputContext, @@ -130,6 +131,36 @@ def dump_to_path(self, context: OutputContext, obj: bytes, path: UPath) -> None: connection_timeout=_CONNECTION_TIMEOUT, ) + # TODO: copied for now from TopLevelMetadataIONManager + output_filenames: dict[str, str] = { + "country_metadata": "country_metadata.parquet", + "data_publisher": "data_publishers.parquet", + "source_data_release": "source_data_releases.parquet", + # New metadata struct, not yet defined + # "geography_release": "geography_releases.parquet", + # Figure out how to use this when the IOManager class is defined + # "geometries": "geometries/{}", # this.format(filename) + # "metrics": "metrics/{}" # this.format(filename) + } + + def to_binary(self, obj: pd.DataFrame) -> bytes: + return obj.to_parquet(None) + + def _get_path(self, context: InputContext | OutputContext) -> UPath: + try: + path_components = list(context.asset_key.path) + path_components[-1] = self.output_filenames[path_components[-1]] + return UPath("/".join([self.prefix, *path_components])) + except KeyError: + err_msg = f"Only the asset keys {','.join(self.output_filenames.keys())} are compatible with this" + raise ValueError(err_msg) + + def _get_paths_for_partitions( + self, context: InputContext | OutputContext + ) -> dict[str, UPath]: + # TODO: need to override this method with correct extension + return super()._get_paths_for_partitions(context) + class ADLS2IOManager(ConfigurableIOManager): """Persistent IO manager using Azure Data Lake Storage Gen2 for storage. 
@@ -226,8 +257,13 @@ def _internal_io_manager(self) -> ADLS2InnerIOManager: def load_input(self, context: InputContext) -> Any: return self._internal_io_manager.load_input(context) - def handle_output(self, context: OutputContext, obj: Any) -> None: - self._internal_io_manager.handle_output(context, obj) + def handle_output(self, context: OutputContext, obj: pd.DataFrame) -> None: + # TODO: convert to bytes + # TODO: consider handling the conversion from an obj dependent on output metadata: + # - specify geography output type + # - specify whether DataFrame or GeoDataFrame + # self._internal_io_manager.handle_output(context, obj) + self._internal_io_manager.handle_output(context, obj.to_parquet(None)) @dagster_maintained_io_manager From fc9f05fa36c52984d989f5b28c9cfba7031b2119 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Fri, 10 May 2024 18:13:13 +0100 Subject: [PATCH 07/35] Update metadata & output spec to include GeometryRelease --- output_spec.md | 16 ++- python/popgetter/assets/be/census_geometry.py | 123 +++++++++++++----- python/popgetter/assets/be/census_tables.py | 6 +- python/popgetter/metadata.py | 53 ++++++-- 4 files changed, 146 insertions(+), 52 deletions(-) diff --git a/output_spec.md b/output_spec.md index 8ad02cd..e6b19e5 100644 --- a/output_spec.md +++ b/output_spec.md @@ -30,6 +30,7 @@ The top level should contain: ├── metric_metadata.parquet ├── source_data_releases.parquet ├── data_publishers.parquet + ├── geometry_metadata.parquet ├── country_metadata.parquet ├── metrics/ │   ├── {metric_filename_a}.parquet @@ -51,6 +52,8 @@ with tabulated metadata: `SourceDataRelease` - A `data_publishers.parquet` file containing a serialised list of `DataPublisher` +- A `geometry_metadata.parquet` file containing a serialised list of + `GeometryMetadata` - A `country_metadata.parquet` file containing a serialised list of `CountryMetadata` (most countries should only have one entry, but there may be more than one because of entities like the UK) @@ -62,18 +65,21 @@ Polars does not have the concept of an index column.) These dataframes are then serialised as parquet files, and can be given any filename, as the `MetricMetadata` struct should contain the filepath to them. -Likewise, geometries should be placed in the `geometries` subdirectory. Each set -of geometries should consist of two files, with the same filename stem and -different extensions: +Likewise, geometries should be placed in the `geometries` subdirectory. Each +set of geometries should consist of four files, with the same filename stem and +different extensions. The filename stem is specified inside the +`GeometryMetadata` struct. -- `{filename}.fgb` - a FlatGeobuf file with the geoIDs stored in the `GEO_ID` +- `{filename}.flatgeobuf` - a FlatGeobuf file with the geoIDs stored in the `GEO_ID` column +- `{filename}.geojsonseq` - corresponding GeoJSONSeq file +- `{filename}.pmtiles` - corresponding PMTiles file - `{filename}.parquet` - a serialised dataframe storing the names of the corresponding areas. This dataframe must have: - a `GEO_ID` column which corresponds exactly to those in the FlatGeobuf file. - one or more other columns, whose names are - [lowercase ISO 639-3 chdes](https://iso639-3.sil.org/code_tables/639/data), + [lowercase ISO 639-3 codes](https://iso639-3.sil.org/code_tables/639/data), and contain the names of each region in those specified languages. 
For example, the parquet file corresponding to the Belgian regions (with diff --git a/python/popgetter/assets/be/census_geometry.py b/python/popgetter/assets/be/census_geometry.py index 8dc52c3..1753a62 100644 --- a/python/popgetter/assets/be/census_geometry.py +++ b/python/popgetter/assets/be/census_geometry.py @@ -2,56 +2,113 @@ import geopandas as gpd import matplotlib.pyplot as plt +import pandas as pd from dagster import ( + AssetIn, + AssetOut, MetadataValue, + SpecificPartitionsPartitionMapping, + multi_asset, ) from icecream import ic from popgetter.utils import markdown_from_plot +from popgetter.metadata import GeometryMetadata, metadata_to_dataframe +from datetime import date -# TODO: Need to re-implement aggregate_sectors_to_municipalities to work with the sectors coming from the partitioned asset. +from .belgium import asset_prefix + +geometry_metadata: GeometryMetadata = GeometryMetadata( + validity_period_start=date(2023, 1, 1), + validity_period_end=date(2023, 12, 31), + level="municipality", + # country -> province -> region -> arrondisement -> municipality + hxl_tag="adm4", +) -# @asset( -# key_prefix=asset_prefix, -# ins={ -# "sector_geometries": AssetIn(key_prefix=asset_prefix), -# }, -# ) -def aggregate_sectors_to_municipalities(context, sector_geometries) -> gpd.GeoDataFrame: +@multi_asset( + ins={ + "sector_geometries": AssetIn( + key=[asset_prefix, "individual_census_table"], + partition_mapping=SpecificPartitionsPartitionMapping( + ["https://statbel.fgov.be/node/4726"] + ), + ), + }, + outs={ + "geometry_metadata": AssetOut(key_prefix=asset_prefix), + "geometry": AssetOut(key_prefix=asset_prefix), + "geometry_names": AssetOut(key_prefix=asset_prefix), + }, +) +def municipalities( + context, sector_geometries +) -> tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame]: + """ + Produces the full set of data / metadata associated with Belgian + municipalities. The outputs, in order, are: + + 1. A DataFrame containing a serialised GeometryMetadata object. + 2. A GeoDataFrame containing the geometries of the municipalities. + 3. A DataFrame containing the names of the municipalities (in this case, + they are in Dutch, French, and German). """ - Aggregates a GeoDataFrame of the Statistical Sectors to Municipalities. + municipality_geometries = ( + sector_geometries.dissolve(by="cd_munty_refnis") + .reset_index() + .rename(columns={"cd_munty_refnis": "GEO_ID"}) + .loc[:, ["geometry", "GEO_ID"]] + ) + ic(municipality_geometries.head()) - The `sectors_gdf` is assumed to be produced by `get_geometries()`. + municipality_names = ( + sector_geometries.rename( + columns={ + "cd_munty_refnis": "GEO_ID", + "tx_munty_descr_nl": "nld", + "tx_munty_descr_fr": "fra", + "tx_munty_descr_de": "deu", + } + ) + .loc[:, ["GEO_ID", "nld", "fra", "deu"]] + .drop_duplicates() + ) + ic(municipality_names.head()) - Also saves the result to a GeoPackage file in the output_dir - returns a GeoDataFrame of the Municipalities. 
- """ - # output_dir = WORKING_DIR / "statistical_sectors" - munty_gdf = sector_geometries.dissolve(by="cd_munty_refnis") - # munty_gdf.to_file(output_dir / "municipalities.gpkg", driver="GPKG") - munty_gdf.index = munty_gdf.index.astype(str) - ic(munty_gdf.head()) - ic(len(munty_gdf)) - - # Plot and convert the image to Markdown to preview it within Dagster - # Yes we do pass the `plt` object to the markdown_from_plot function and not the `ax` object - # ax = munty_gdf.plot(color="green") - ax = munty_gdf.plot(column="tx_sector_descr_nl", legend=False) + # Generate a plot and convert the image to Markdown to preview it within + # Dagster + joined_gdf = municipality_geometries.merge(municipality_names, on="GEO_ID") + ax = joined_gdf.plot(column="nld", legend=False) ax.set_title("Municipalities in Belgium") md_plot = markdown_from_plot(plt) + geometry_metadata_df = metadata_to_dataframe([geometry_metadata]) + context.add_output_metadata( + output_name="geometry_metadata", metadata={ - "num_records": len(munty_gdf), # Metadata can be any key-value pair - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in munty_gdf.columns.to_list()]) - ), - "preview": MetadataValue.md( - munty_gdf.loc[:, munty_gdf.columns != "geometry"].head().to_markdown() - ), + "preview": MetadataValue.md(geometry_metadata_df.head().to_markdown()), + }, + ) + context.add_output_metadata( + output_name="geometry", + metadata={ + "num_records": len(municipality_geometries), "plot": MetadataValue.md(md_plot), - } + }, + ) + context.add_output_metadata( + output_name="geometry_names", + metadata={ + "num_records": len(municipality_names), + "name_columns": MetadataValue.md( + "\n".join( + [f"- '`{col}`'" for col in municipality_names.columns.to_list()] + ) + ), + "preview": MetadataValue.md(municipality_names.head().to_markdown()), + }, ) - return munty_gdf + return geometry_metadata_df, municipality_geometries, municipality_names diff --git a/python/popgetter/assets/be/census_tables.py b/python/popgetter/assets/be/census_tables.py index 5f455c0..9b8ee41 100644 --- a/python/popgetter/assets/be/census_tables.py +++ b/python/popgetter/assets/be/census_tables.py @@ -24,11 +24,13 @@ from popgetter.metadata import ( DataPublisher, SourceDataRelease, + GeometryMetadata, metadata_to_dataframe, ) from popgetter.utils import extract_main_file_from_zip, markdown_from_plot from .belgium import asset_prefix, country +from .census_geometry import geometry_metadata publisher: DataPublisher = DataPublisher( name="Statbel", @@ -49,11 +51,9 @@ expect_next_update=date(2022, 1, 1), url="https://statbel.fgov.be/en/open-data", description="TBC", - geography_file="TBC", - geography_level="Municipality", data_publisher_id=publisher.id, + geometry_metadata_id=geometry_metadata.id, ) -source.update_forward_refs() dataset_node_partition = DynamicPartitionsDefinition(name="dataset_nodes") diff --git a/python/popgetter/metadata.py b/python/popgetter/metadata.py index 964e35f..7b1d288 100644 --- a/python/popgetter/metadata.py +++ b/python/popgetter/metadata.py @@ -17,7 +17,10 @@ def hash_class_vars(class_instance): Note that `vars()` does not include properties, so the IDs themselves are not part of the hash, which avoids self-reference issues. """ - variables = vars(class_instance) + # Must copy the dict to avoid overriding the actual instance attributes! 
+ # Because we're only modifying dates -> strings, we don't need to perform a + # deepcopy + variables = dict(**vars(class_instance)) # Python doesn't serialise dates to JSON, have to convert to ISO 8601 first for key, val in variables.items(): if isinstance(val, date): @@ -26,15 +29,13 @@ def hash_class_vars(class_instance): def metadata_to_dataframe( - metadata_instances: list[ - CountryMetadata | DataPublisher | SourceDataRelease | MetricMetadata - ], + metadata_instances: list[BaseModel], ): """ Convert a list of metadata instances to a pandas DataFrame. Any of the four metadata classes defined in this module can be used here. """ - return pd.DataFrame([vars(md) | {"id": md.id} for md in metadata_instances]) + return pd.DataFrame([md.model_dump() for md in metadata_instances]) class CountryMetadata(BaseModel): @@ -78,6 +79,33 @@ def id(self) -> str: ) +class GeometryMetadata(BaseModel): + @computed_field + @property + def id(self) -> str: + return hash_class_vars(self) + + @computed_field + @property + def filename_stem(self) -> str: + level = "_".join(self.level.lower().split()) + year = self.validity_period_start.year + return f"{level}_{year}" + + validity_period_start: date = Field( + description="The start of the range of time for which the regions are valid (inclusive)" + ) + validity_period_end: date = Field( + description="The end of the range of time for which the regions are valid (inclusive). If the data is a single-day snapshot, this should be the same as `validity_period_start`." + ) + level: str = Field( + description="The geography level contained in the file (e.g. output area, LSOA, MSOA, etc)" + ) + hxl_tag: str = Field( + description="Humanitarian eXchange Language (HXL) description for the geography level" + ) + + class SourceDataRelease(BaseModel): @computed_field @property @@ -108,11 +136,8 @@ def id(self) -> str: description="The ID of the publisher of the data release" ) description: str = Field(description="A description of the data release") - geography_file: str = Field( - description="The path of the geography FlatGeobuf file, relative to the top level of the data release" - ) - geography_level: str = Field( - description="The geography level contained in the file (e.g. output area, LSOA, MSOA, etc)" + geometry_metadata_id: str = Field( + description="The ID of the geometry metadata associated with this data release" ) @model_validator(mode="after") @@ -177,7 +202,13 @@ def id(self) -> str: ) -EXPORTED_MODELS = [CountryMetadata, DataPublisher, SourceDataRelease, MetricMetadata] +EXPORTED_MODELS = [ + CountryMetadata, + DataPublisher, + SourceDataRelease, + MetricMetadata, + GeometryMetadata, +] def export_schema(): From 6e03d4c2f3a3cb83ba5636fc860f55ab938daef4 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Fri, 10 May 2024 18:30:11 +0100 Subject: [PATCH 08/35] Add assets for CountryMetadata and SourceDataRelease --- python/popgetter/assets/be/__init__.py | 8 -------- python/popgetter/assets/be/census_tables.py | 20 ++++++++++++++++++-- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/python/popgetter/assets/be/__init__.py b/python/popgetter/assets/be/__init__.py index 60f9d46..9a961ea 100755 --- a/python/popgetter/assets/be/__init__.py +++ b/python/popgetter/assets/be/__init__.py @@ -17,14 +17,6 @@ from .belgium import asset_prefix, country -@asset(key_prefix=asset_prefix) -def get_country_metadata() -> CountryMetadata: - """ - Returns a CountryMetadata of metadata about the country. 
- """ - return country - - # @asset(key_prefix=asset_prefix) # def get_population_details_per_municipality(context): # """ diff --git a/python/popgetter/assets/be/census_tables.py b/python/popgetter/assets/be/census_tables.py index 9b8ee41..cef796b 100644 --- a/python/popgetter/assets/be/census_tables.py +++ b/python/popgetter/assets/be/census_tables.py @@ -59,13 +59,29 @@ @asset(key_prefix=asset_prefix, io_manager_key="publishing_io_manager") -def data_publisher(): +def country_metadata() -> pd.DataFrame: """ - Returns a DataPublisher of metadata about the publisher. + Returns a dataframe containing the CountryMetadata for this country. + """ + return metadata_to_dataframe([country]) + + +@asset(key_prefix=asset_prefix, io_manager_key="publishing_io_manager") +def data_publisher() -> pd.DataFrame: + """ + Returns a dataframe containing the DataPublisher for this country. """ return metadata_to_dataframe([publisher]) +@asset(key_prefix=asset_prefix, io_manager_key="publishing_io_manager") +def source_data_release() -> pd.DataFrame: + """ + Returns a dataframe containing the SourceDataRelease for this country. + """ + return metadata_to_dataframe([source]) + + @asset(key_prefix=asset_prefix) def opendata_dataset_list(context) -> Graph: """ From 201adb3b7122c2f6e5fde60897129ed3e214b272 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Fri, 10 May 2024 19:48:34 +0100 Subject: [PATCH 09/35] (Failed) attempt to generate geometries for multiple levels --- python/popgetter/assets/be/census_geometry.py | 120 +++++++++++++++--- 1 file changed, 99 insertions(+), 21 deletions(-) diff --git a/python/popgetter/assets/be/census_geometry.py b/python/popgetter/assets/be/census_geometry.py index 1753a62..3ece84d 100644 --- a/python/popgetter/assets/be/census_geometry.py +++ b/python/popgetter/assets/be/census_geometry.py @@ -7,6 +7,7 @@ AssetIn, AssetOut, MetadataValue, + StaticPartitionsDefinition, SpecificPartitionsPartitionMapping, multi_asset, ) @@ -17,6 +18,7 @@ from datetime import date from .belgium import asset_prefix +from dataclasses import dataclass geometry_metadata: GeometryMetadata = GeometryMetadata( validity_period_start=date(2023, 1, 1), @@ -27,6 +29,68 @@ ) +@dataclass +class BelgiumGeometryLevel: + level: str + hxl_tag: str + geo_id_column: str + name_columns: dict[str, str] # keys = language codes, values = column names + + +BELGIUM_GEOMETRY_LEVELS = { + "province": BelgiumGeometryLevel( + level="province", + hxl_tag="adm1", + geo_id_column="cd_prov_refnis", + name_columns={ + "nld": "tx_prov_descr_nl", + "fra": "tx_prov_descr_fr", + "deu": "tx_prov_descr_de", + }, + ), + "region": BelgiumGeometryLevel( + level="region", + hxl_tag="adm2", + geo_id_column="cd_rgn_refnis", + name_columns={ + "nld": "tx_rgn_descr_nl", + "fra": "tx_rgn_descr_fr", + "deu": "tx_rgn_descr_de", + }, + ), + "arrondisement": BelgiumGeometryLevel( + level="arrondisement", + hxl_tag="adm3", + geo_id_column="cd_dstr_refnis", + name_columns={ + "nld": "tx_adm_dstr_descr_nl", + "fra": "tx_adm_dstr_descr_fr", + "deu": "tx_adm_dstr_descr_de", + }, + ), + "municipality": BelgiumGeometryLevel( + level="municipality", + hxl_tag="adm4", + geo_id_column="cd_munty_refnis", + name_columns={ + "nld": "tx_munty_descr_nl", + "fra": "tx_munty_descr_fr", + "deu": "tx_munty_descr_de", + }, + ), + "statistical_sector": BelgiumGeometryLevel( + level="statistical_sector", + hxl_tag="adm5", + geo_id_column="cd_sector", + name_columns={ + "nld": "tx_sector_descr_nl", + "fra": "tx_sector_descr_fr", + "deu": "tx_sector_descr_de", + }, + 
), +} + + @multi_asset( ins={ "sector_geometries": AssetIn( @@ -41,46 +105,62 @@ "geometry": AssetOut(key_prefix=asset_prefix), "geometry_names": AssetOut(key_prefix=asset_prefix), }, + # NOTE: This doesn't work. You can't define assets with different partition + # schemes within the same job. Because the upstream asset already uses + # dynamic partitioning, this asset can't use a different static + # partitioning. MEH. + # partitions_def=StaticPartitionsDefinition( + # list(BELGIUM_GEOMETRY_LEVELS.keys()) + # ), ) -def municipalities( +def create_geometries( context, sector_geometries ) -> tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame]: """ Produces the full set of data / metadata associated with Belgian municipalities. The outputs, in order, are: - + 1. A DataFrame containing a serialised GeometryMetadata object. 2. A GeoDataFrame containing the geometries of the municipalities. 3. A DataFrame containing the names of the municipalities (in this case, they are in Dutch, French, and German). """ - municipality_geometries = ( - sector_geometries.dissolve(by="cd_munty_refnis") + level_details = BELGIUM_GEOMETRY_LEVELS["municipality"] + + geometry_metadata = GeometryMetadata( + validity_period_start=date(2023, 1, 1), + validity_period_end=date(2023, 12, 31), + level=level_details.level, + hxl_tag=level_details.hxl_tag, + ) + + region_geometries = ( + sector_geometries.dissolve(by=level_details.geo_id_column) .reset_index() - .rename(columns={"cd_munty_refnis": "GEO_ID"}) + .rename(columns={level_details.geo_id_column: "GEO_ID"}) .loc[:, ["geometry", "GEO_ID"]] ) - ic(municipality_geometries.head()) + ic(region_geometries.head()) - municipality_names = ( + region_names = ( sector_geometries.rename( columns={ - "cd_munty_refnis": "GEO_ID", - "tx_munty_descr_nl": "nld", - "tx_munty_descr_fr": "fra", - "tx_munty_descr_de": "deu", + level_details.geo_id_column: "GEO_ID", + level_details.name_columns["nld"]: "nld", + level_details.name_columns["fra"]: "fra", + level_details.name_columns["deu"]: "deu", } ) .loc[:, ["GEO_ID", "nld", "fra", "deu"]] .drop_duplicates() ) - ic(municipality_names.head()) + ic(region_names.head()) # Generate a plot and convert the image to Markdown to preview it within # Dagster - joined_gdf = municipality_geometries.merge(municipality_names, on="GEO_ID") + joined_gdf = region_geometries.merge(region_names, on="GEO_ID") ax = joined_gdf.plot(column="nld", legend=False) - ax.set_title("Municipalities in Belgium") + ax.set_title(f"Belgium 2023 {level_details.level}") md_plot = markdown_from_plot(plt) geometry_metadata_df = metadata_to_dataframe([geometry_metadata]) @@ -94,21 +174,19 @@ def municipalities( context.add_output_metadata( output_name="geometry", metadata={ - "num_records": len(municipality_geometries), + "num_records": len(region_geometries), "plot": MetadataValue.md(md_plot), }, ) context.add_output_metadata( output_name="geometry_names", metadata={ - "num_records": len(municipality_names), + "num_records": len(region_names), "name_columns": MetadataValue.md( - "\n".join( - [f"- '`{col}`'" for col in municipality_names.columns.to_list()] - ) + "\n".join([f"- '`{col}`'" for col in region_names.columns.to_list()]) ), - "preview": MetadataValue.md(municipality_names.head().to_markdown()), + "preview": MetadataValue.md(region_names.head().to_markdown()), }, ) - return geometry_metadata_df, municipality_geometries, municipality_names + return geometry_metadata_df, region_geometries, region_names From 5db26abe3c4f54f74a016ae1319022a2d20815a8 Mon Sep 17 00:00:00 
2001 From: Jonathan Yong Date: Fri, 10 May 2024 21:40:03 +0100 Subject: [PATCH 10/35] Generate a single geometry asset instead of multiasset --- python/popgetter/assets/be/census_geometry.py | 43 ++++--------------- 1 file changed, 8 insertions(+), 35 deletions(-) diff --git a/python/popgetter/assets/be/census_geometry.py b/python/popgetter/assets/be/census_geometry.py index 3ece84d..54bc869 100644 --- a/python/popgetter/assets/be/census_geometry.py +++ b/python/popgetter/assets/be/census_geometry.py @@ -4,12 +4,10 @@ import matplotlib.pyplot as plt import pandas as pd from dagster import ( - AssetIn, - AssetOut, MetadataValue, - StaticPartitionsDefinition, SpecificPartitionsPartitionMapping, - multi_asset, + AssetIn, + asset, ) from icecream import ic @@ -91,7 +89,7 @@ class BelgiumGeometryLevel: } -@multi_asset( +@asset( ins={ "sector_geometries": AssetIn( key=[asset_prefix, "individual_census_table"], @@ -100,20 +98,8 @@ class BelgiumGeometryLevel: ), ), }, - outs={ - "geometry_metadata": AssetOut(key_prefix=asset_prefix), - "geometry": AssetOut(key_prefix=asset_prefix), - "geometry_names": AssetOut(key_prefix=asset_prefix), - }, - # NOTE: This doesn't work. You can't define assets with different partition - # schemes within the same job. Because the upstream asset already uses - # dynamic partitioning, this asset can't use a different static - # partitioning. MEH. - # partitions_def=StaticPartitionsDefinition( - # list(BELGIUM_GEOMETRY_LEVELS.keys()) - # ), ) -def create_geometries( +def geometry( context, sector_geometries ) -> tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame]: """ @@ -166,26 +152,13 @@ def create_geometries( geometry_metadata_df = metadata_to_dataframe([geometry_metadata]) context.add_output_metadata( - output_name="geometry_metadata", - metadata={ - "preview": MetadataValue.md(geometry_metadata_df.head().to_markdown()), - }, - ) - context.add_output_metadata( - output_name="geometry", metadata={ "num_records": len(region_geometries), - "plot": MetadataValue.md(md_plot), - }, - ) - context.add_output_metadata( - output_name="geometry_names", - metadata={ - "num_records": len(region_names), - "name_columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in region_names.columns.to_list()]) + "geometry_plot": MetadataValue.md(md_plot), + "names_preview": MetadataValue.md(region_names.head().to_markdown()), + "metadata_preview": MetadataValue.md( + geometry_metadata_df.head().to_markdown() ), - "preview": MetadataValue.md(region_names.head().to_markdown()), }, ) From 138b83623acd2f42efa8037958eb509981908238 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Fri, 10 May 2024 22:44:02 +0100 Subject: [PATCH 11/35] Add a local IO manager for geometry --- python/popgetter/__init__.py | 23 +++--------- python/popgetter/assets/be/census_geometry.py | 2 ++ python/popgetter/io_managers/__init__.py | 34 ++++++++++++++---- python/popgetter/io_managers/local.py | 35 +++++++++++++++++-- 4 files changed, 68 insertions(+), 26 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 3be695f..e910c9f 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -3,7 +3,7 @@ from collections.abc import Sequence from pathlib import Path -from popgetter.io_managers.local import LocalTopLevelMetadataIOManager +from popgetter.io_managers.local import LocalTopLevelMetadataIOManager, LocalGeometryIOManager from popgetter.utils import StagingDirResource __version__ = "0.1.0" @@ -75,7 +75,10 @@ } ), }, - "dev": 
{"publishing_io_manager": LocalTopLevelMetadataIOManager()}, + "dev": { + "publishing_io_manager": LocalTopLevelMetadataIOManager(), + "geometry_io_manager": LocalGeometryIOManager(), + }, } resources = { @@ -94,19 +97,3 @@ resources=resources, jobs=[job_be, job_us, job_uk], ) - - -# class AzureTopLevelMetadataIOManagerWithPartion(ADLS2IOManager): -# -# class AzureTopLevelMetadataIOManager(ADLS2IOManager): -# extension = "parquet" - - -# class FlatGeobufIOManager(ADLS2IOManager): -# extension = "parquet" - -# def _get_path(self, context: InputContext | OutputContext) -> "UPath": -# ... - -# def _get_path_without_extension(self, context: InputContext | OutputContext) -> "UPath": -# ... diff --git a/python/popgetter/assets/be/census_geometry.py b/python/popgetter/assets/be/census_geometry.py index 54bc869..1538033 100644 --- a/python/popgetter/assets/be/census_geometry.py +++ b/python/popgetter/assets/be/census_geometry.py @@ -98,6 +98,8 @@ class BelgiumGeometryLevel: ), ), }, + key_prefix=asset_prefix, + io_manager_key="geometry_io_manager", ) def geometry( context, sector_geometries diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 1c3ad68..0c4c90f 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -7,16 +7,10 @@ class TopLevelMetadataIOManager(ConfigurableIOManager): - # Mapping of asset keys to output filenames output_filenames: dict[str, str] = { "country_metadata": "country_metadata.parquet", "data_publisher": "data_publishers.parquet", "source_data_release": "source_data_releases.parquet", - # New metadata struct, not yet defined - # "geography_release": "geography_releases.parquet", - # Figure out how to use this when the IOManager class is defined - # "geometries": "geometries/{}", # this.format(filename) - # "metrics": "metrics/{}" # this.format(filename) } def get_relative_path(self, context: OutputContext) -> UPath: @@ -34,3 +28,31 @@ def to_binary(self, obj: pd.DataFrame) -> bytes: def load_input(self, context: InputContext) -> pd.DataFrame: raise RuntimeError("This IOManager is only for writing outputs") + + +class TopLevelGeometryIOManager(ConfigurableIOManager): + def get_relative_paths( + self, + context: OutputContext, + obj: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], + ) -> list[UPath]: + filename_stem = obj[0].iloc[0]["filename_stem"] + asset_prefix = context.asset_key.path[:-1] # e.g. ['be'] + return { + "metadata": "/".join(asset_prefix + ["geometry_metadata.parquet"]), + "flatgeobuf": "/".join( + asset_prefix + ["geometries", f"{filename_stem}.fgb"] + ), + "pmtiles": "/".join( + asset_prefix + ["geometries", f"{filename_stem}.pmtiles"] + ), + "geojsonseq": "/".join( + asset_prefix + ["geometries", f"{filename_stem}.geojsonseq"] + ), + "names": "/".join( + asset_prefix + ["geometries", f"{filename_stem}.parquet"] + ), + } + + def load_input(self, context: InputContext) -> pd.DataFrame: + raise RuntimeError("This IOManager is only for writing outputs") diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py index ce25342..f454d53 100644 --- a/python/popgetter/io_managers/local.py +++ b/python/popgetter/io_managers/local.py @@ -6,10 +6,10 @@ from dagster import OutputContext from upath import UPath -from . import TopLevelMetadataIOManager +from . 
import TopLevelMetadataIOManager, TopLevelGeometryIOManager -class LocalTopLevelMetadataIOManager(TopLevelMetadataIOManager): +class DagsterHomeMixin: dagster_home: str | None = os.getenv("DAGSTER_HOME") def get_base_path_local(self) -> UPath: @@ -17,6 +17,8 @@ def get_base_path_local(self) -> UPath: raise ValueError("The DAGSTER_HOME environment variable must be set.") return UPath(self.dagster_home) / "cloud_outputs" + +class LocalTopLevelMetadataIOManager(TopLevelMetadataIOManager, DagsterHomeMixin): def handle_output(self, context: OutputContext, obj: pd.DataFrame) -> None: rel_path = self.get_relative_path(context) full_path = self.get_base_path_local() / rel_path @@ -24,3 +26,32 @@ def handle_output(self, context: OutputContext, obj: pd.DataFrame) -> None: context.add_output_metadata(metadata={"parquet_path": str(full_path)}) with full_path.open("wb") as file: file.write(self.to_binary(obj)) + + +class LocalGeometryIOManager(TopLevelGeometryIOManager, DagsterHomeMixin): + def handle_output( + self, + context: OutputContext, + obj: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], + ) -> None: + rel_paths = self.get_relative_paths(context, obj) + base_path = self.get_base_path_local() + full_paths = {key: base_path / rel_path for key, rel_path in rel_paths.items()} + for path in full_paths.values(): + path.parent.mkdir(parents=True, exist_ok=True) + context.add_output_metadata( + metadata={ + "geometry_metadata_path": str(full_paths["metadata"]), + "flatgeobuf_path": str(full_paths["flatgeobuf"]), + "pmtiles_path": str(full_paths["pmtiles"]), + "geojsonseq_path": str(full_paths["geojsonseq"]), + "names_path": str(full_paths["names"]), + } + ) + + metadata_df, gdf, names_df = obj + metadata_df.to_parquet(full_paths["metadata"]) + gdf.to_file(full_paths["flatgeobuf"], driver="FlatGeobuf") + gdf.to_file(full_paths["geojsonseq"], driver="GeoJSONSeq") + # TODO: generate pmtiles + names_df.to_parquet(full_paths["names"]) From 0f034e7da7972ca0d5a9a2ed1fecf98fef94ffad Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 10 May 2024 21:38:01 +0100 Subject: [PATCH 12/35] Handle outputs implementation for ADLS2IOManager --- python/popgetter/azure/azure_io_manager.py | 83 +++++++++++++++------- 1 file changed, 57 insertions(+), 26 deletions(-) diff --git a/python/popgetter/azure/azure_io_manager.py b/python/popgetter/azure/azure_io_manager.py index 2631ff1..ba3583d 100644 --- a/python/popgetter/azure/azure_io_manager.py +++ b/python/popgetter/azure/azure_io_manager.py @@ -5,10 +5,14 @@ """ from __future__ import annotations +import tempfile +import uuid from collections.abc import Iterator from contextlib import contextmanager +from pathlib import Path from typing import Any +import geopandas as gpd import pandas as pd from dagster import ( InputContext, @@ -44,6 +48,8 @@ def __init__( blob_client: Any, lease_client_constructor: Any, prefix: str = "dagster", + extension: str | None = None, + output_type: str = "metadata", ): self.adls2_client = adls2_client self.file_system_client = self.adls2_client.get_file_system_client(file_system) @@ -55,6 +61,9 @@ def __init__( self.lease_client_constructor = lease_client_constructor self.lease_duration = _LEASE_DURATION self.file_system_client.get_file_system_properties() + self.extension = extension + self.output_type = output_type + super().__init__(base_path=UPath(self.prefix)) def get_op_output_relative_path( @@ -131,29 +140,18 @@ def dump_to_path(self, context: OutputContext, obj: bytes, path: UPath) -> None: 
connection_timeout=_CONNECTION_TIMEOUT, ) - # TODO: copied for now from TopLevelMetadataIONManager - output_filenames: dict[str, str] = { - "country_metadata": "country_metadata.parquet", - "data_publisher": "data_publishers.parquet", - "source_data_release": "source_data_releases.parquet", - # New metadata struct, not yet defined - # "geography_release": "geography_releases.parquet", - # Figure out how to use this when the IOManager class is defined - # "geometries": "geometries/{}", # this.format(filename) - # "metrics": "metrics/{}" # this.format(filename) - } - def to_binary(self, obj: pd.DataFrame) -> bytes: return obj.to_parquet(None) def _get_path(self, context: InputContext | OutputContext) -> UPath: - try: - path_components = list(context.asset_key.path) - path_components[-1] = self.output_filenames[path_components[-1]] - return UPath("/".join([self.prefix, *path_components])) - except KeyError: - err_msg = f"Only the asset keys {','.join(self.output_filenames.keys())} are compatible with this" - raise ValueError(err_msg) + # try: + # path_components = list(context.asset_key.path) + # path_components[-1] = self.output_filenames[path_components[-1]] + # return UPath("/".join([self.prefix, *path_components])) + # except KeyError: + # err_msg = f"Only the asset keys {','.join(self.output_filenames.keys())} are compatible with this" + # raise ValueError(err_msg) + return UPath("/".join(context.asset_key.path)) def _get_paths_for_partitions( self, context: InputContext | OutputContext @@ -161,6 +159,25 @@ def _get_paths_for_partitions( # TODO: need to override this method with correct extension return super()._get_paths_for_partitions(context) + def handle_output(self, context: OutputContext, obj: Any): + # TODO: convert to bytes + # TODO: consider handling the conversion from an obj dependent on output metadata: + # - specify geography output type + # - specify whether DataFrame or GeoDataFrame + # self._internal_io_manager.handle_output(context, obj) + if self.output_type == "metadata": + obj_bytes = obj.to_parquet(None) + elif self.output_type.lower() == "parquet": + obj_bytes = df_to_bytes(obj, "parquet") + elif self.output_type.lower() == "flatgeobuf": + obj_bytes = df_to_bytes(obj, "flatgeobuf") + elif self.output_type.lower() == "geojsonseq": + obj_bytes = df_to_bytes(obj, "geojsonseq") + else: + value_error: str = f"'{self.output_type}' is not currently supported." + raise ValueError(value_error) + return super().handle_output(context, obj_bytes) + class ADLS2IOManager(ConfigurableIOManager): """Persistent IO manager using Azure Data Lake Storage Gen2 for storage. @@ -238,6 +255,7 @@ def my_job(): adls2_prefix: str = Field( default="dagster", description="ADLS Gen2 file system prefix to write to." 
) + output_type: str = Field(description="Output type: metadata, geometry, metric") @classmethod def _is_dagster_maintained(cls) -> bool: @@ -257,13 +275,23 @@ def _internal_io_manager(self) -> ADLS2InnerIOManager: def load_input(self, context: InputContext) -> Any: return self._internal_io_manager.load_input(context) - def handle_output(self, context: OutputContext, obj: pd.DataFrame) -> None: - # TODO: convert to bytes - # TODO: consider handling the conversion from an obj dependent on output metadata: - # - specify geography output type - # - specify whether DataFrame or GeoDataFrame - # self._internal_io_manager.handle_output(context, obj) - self._internal_io_manager.handle_output(context, obj.to_parquet(None)) + def handle_output(self, context: OutputContext, obj: Any) -> None: + self._internal_io_manager.handle_output(context, obj) + + +def df_to_bytes(df: gpd.GeoDataFrame, output_type: str) -> bytes: + tmp = tempfile.NamedTemporaryFile(prefix=str(uuid.uuid4())) + if output_type.lower() == "parquet": + df.to_parquet(tmp.name + ".parquet") + elif output_type.lower() == "flatgeobuf": + df.to_file(tmp.name + ".flatgeobuf", driver="FlatGeobuf") + elif output_type.lower() == "geojsonseq": + df.to_file(tmp.name + ".geojsonseq", driver="GeoJSONSeq") + else: + value_error: str = f"'{output_type}' is not currently supported." + raise ValueError(value_error) + with Path(tmp.name).open(mode="rb") as f: + return f.read() @dagster_maintained_io_manager @@ -345,10 +373,13 @@ def my_job(): adls2_client = adls_resource.adls2_client blob_client = adls_resource.blob_client lease_client = adls_resource.lease_client_constructor + return ADLS2InnerIOManager( init_context.resource_config["adls2_file_system"], adls2_client, blob_client, lease_client, init_context.resource_config.get("adls2_prefix"), + init_context.resource_config.get("extension", None), + init_context.resource_config.get("output_type", "metadata"), ) From 99120bc4e88c1042a1fceeb881b01e47ffd4407f Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Tue, 14 May 2024 18:32:50 +0100 Subject: [PATCH 13/35] Initial Azure geo IO manager --- python/popgetter/__init__.py | 7 +- python/popgetter/io_managers/__init__.py | 10 +- python/popgetter/io_managers/azure.py | 189 ++++++++++++++++++++++- python/popgetter/io_managers/local.py | 3 +- 4 files changed, 201 insertions(+), 8 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index e910c9f..48e1591 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -3,7 +3,11 @@ from collections.abc import Sequence from pathlib import Path -from popgetter.io_managers.local import LocalTopLevelMetadataIOManager, LocalGeometryIOManager +from popgetter.io_managers.azure import AzureGeoIOManager +from popgetter.io_managers.local import ( + LocalGeometryIOManager, + LocalTopLevelMetadataIOManager, +) from popgetter.utils import StagingDirResource __version__ = "0.1.0" @@ -74,6 +78,7 @@ "credential": {"sas": os.getenv("SAS_TOKEN")}, } ), + "geometry_io_manager": AzureGeoIOManager(), }, "dev": { "publishing_io_manager": LocalTopLevelMetadataIOManager(), diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 0c4c90f..3212418 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -1,7 +1,8 @@ from __future__ import annotations +import geopandas as gpd import pandas as pd -from dagster import ConfigurableIOManager, InputContext, OutputContext +from dagster import 
ConfigurableIOManager, InputContext, IOManager, OutputContext from icecream import ic from upath import UPath @@ -30,14 +31,15 @@ def load_input(self, context: InputContext) -> pd.DataFrame: raise RuntimeError("This IOManager is only for writing outputs") -class TopLevelGeometryIOManager(ConfigurableIOManager): +# class TopLevelGeometryIOManager(ConfigurableIOManager): +class TopLevelGeometryIOManager(IOManager): def get_relative_paths( self, context: OutputContext, obj: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], - ) -> list[UPath]: + ) -> dict[str, str]: filename_stem = obj[0].iloc[0]["filename_stem"] - asset_prefix = context.asset_key.path[:-1] # e.g. ['be'] + asset_prefix = list(context.asset_key.path[:-1]) # e.g. ['be'] return { "metadata": "/".join(asset_prefix + ["geometry_metadata.parquet"]), "flatgeobuf": "/".join( diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py index 6c8e623..ba71367 100644 --- a/python/popgetter/io_managers/azure.py +++ b/python/popgetter/io_managers/azure.py @@ -1,11 +1,41 @@ from __future__ import annotations +import os +import tempfile +from collections.abc import Iterator +from contextlib import contextmanager +from pathlib import Path + +import geopandas as gpd import pandas as pd -from dagster import InputContext, MultiPartitionKey, OutputContext +from azure.core.credentials import ( + AzureSasCredential, +) +from azure.storage.filedatalake import ( + DataLakeLeaseClient, + DataLakeServiceClient, + FileSystemClient, +) +from dagster import ( + Any, + InputContext, + MultiPartitionKey, + OutputContext, +) +from dagster_azure.adls2.utils import ResourceNotFoundError +from icecream import ic from upath import UPath from ..azure import ADLS2InnerIOManager from . import TopLevelMetadataIOManager +from .local import DagsterHomeMixin, TopLevelGeometryIOManager + +# Note: this might need to be longer for some large files, but there is an issue with header's not matching +_LEASE_DURATION = 60 # One minute + +# Set connection timeout to be larger than default: +# https://github.com/Azure/azure-sdk-for-python/issues/26993#issuecomment-1289799860 +_CONNECTION_TIMEOUT = 6000 class AzureTopLevelMetadataIOManager(TopLevelMetadataIOManager, ADLS2InnerIOManager): @@ -34,10 +64,11 @@ def _get_paths_for_partitions( ) -> dict[str, UPath]: """Returns a dict of partition_keys into I/O paths for a given context.""" if not context.has_asset_partitions: - raise TypeError( + error = ( f"Detected {context.dagster_type.typing_type} input type " "but the asset is not partitioned" ) + raise TypeError(error) def _formatted_multipartitioned_path(partition_key: MultiPartitionKey) -> str: ordered_dimension_keys = [ @@ -64,3 +95,157 @@ def _formatted_multipartitioned_path(partition_key: MultiPartitionKey) -> str: ) for partition_key, partition in formatted_partition_keys.items() } + + +class AzureGeoIOManager(TopLevelGeometryIOManager, DagsterHomeMixin): + storage: str = os.getenv("AZURE_STORAGE_ACCOUNT", "") + container: str = os.getenv("AZURE_CONTAINER", "") + prefix: str = os.getenv("AZURE_DIRECTORY", "") + sas_token: str = os.getenv("SAS_TOKEN", "") + adls2_client: DataLakeServiceClient + file_system_client: FileSystemClient + + def __init__(self): + def _create_url(storage_account, subdomain): + return f"https://{storage_account}.{subdomain}.core.windows.net/" + + def create_adls2_client( + storage_account: str, credential + ) -> DataLakeServiceClient: + """Create an ADLS2 client.""" + account_url = _create_url(storage_account, "dfs") 
+ return DataLakeServiceClient(account_url, credential) + + self.adls2_client = create_adls2_client( + self.storage, AzureSasCredential(self.sas_token) + ) + self.file_system_client = self.adls2_client.get_file_system_client( + self.container + ) + + # # We also need a blob client to handle copying as ADLS doesn't have a copy API yet + # self.blob_client = blob_client + # self.blob_container_client = self.blob_client.get_container_client(file_system) + + self.lease_duration = _LEASE_DURATION + self.file_system_client.get_file_system_properties() + + def get_base_path_local(self) -> UPath: + return UPath(self.prefix) + + @property + def lease_client_constructor(self) -> Any: + return DataLakeLeaseClient + + @staticmethod + def geo_df_to_bytes(context, geo_df: gpd.GeoDataFrame, output_type: str) -> bytes: + tmp = tempfile.NamedTemporaryFile() + if output_type.lower() == "parquet": + fname = tmp.name + ".parquet" + geo_df.to_parquet(fname) + elif output_type.lower() == "flatgeobuf": + fname = tmp.name + ".fgb" + geo_df.to_file(fname, driver="FlatGeobuf") + elif output_type.lower() == "geojsonseq": + fname = tmp.name + ".geojsonseq" + geo_df.to_file(fname, driver="GeoJSONSeq") + elif output_type.lower() == "pmtiles": + error = "pmtiles not currently implemented" + raise ValueError(error) + else: + value_error: str = f"'{output_type}' is not currently supported." + raise ValueError(value_error) + with Path(fname).open(mode="rb") as f: + b = f.read() + context.log.debug(ic(f"Size: {len(b) / (1_024 * 1_024):.3f}MB")) + return b + + def _uri_for_path(self, path: UPath, protocol: str = "abfss://") -> str: + return "{protocol}{filesystem}@{account}.dfs.core.windows.net/{key}".format( + protocol=protocol, + filesystem=self.file_system_client.file_system_name, + account=self.file_system_client.account_name, + key=path.as_posix(), + ) + + def get_loading_input_log_message(self, path: UPath) -> str: + return f"Loading ADLS2 object from: {self._uri_for_path(path)}" + + def get_writing_output_log_message(self, path: UPath) -> str: + return f"Writing ADLS2 object at: {self._uri_for_path(path)}" + + def unlink(self, path: UPath) -> None: + file_client = self.file_system_client.get_file_client(path.as_posix()) + with self._acquire_lease(file_client, is_rm=True) as lease: + file_client.delete_file(lease=lease, recursive=True) + + def dump_to_path(self, context: OutputContext, obj: bytes, path: UPath) -> None: + if self.path_exists(path): + context.log.warning(f"Removing existing ADLS2 key: {path}") # noqa: G004 + self.unlink(path) + + file = self.file_system_client.create_file(path.as_posix()) + with self._acquire_lease(file) as lease: + # Note: chunk_size can also be specified, see API for Azure SDK for Python, DataLakeFileClient: + # https://learn.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.datalakefileclient + file.upload_data( + obj, + lease=lease, + overwrite=True, + connection_timeout=_CONNECTION_TIMEOUT, + ) + + def path_exists(self, path: UPath) -> bool: + try: + self.file_system_client.get_file_client( + path.as_posix() + ).get_file_properties() + except ResourceNotFoundError: + return False + return True + + @contextmanager + def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]: + lease_client = self.lease_client_constructor(client=client) + try: + lease_client.acquire(lease_duration=self.lease_duration) + yield lease_client.id + finally: + # cannot release a lease on a file that no longer exists, so need to check + if not is_rm: + 
lease_client.release() + + def handle_output( + self, + context: OutputContext, + obj: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], + ) -> None: + rel_paths = self.get_relative_paths(context, obj) + base_path = self.get_base_path_local() + full_paths = {key: base_path / rel_path for key, rel_path in rel_paths.items()} + for path in full_paths.values(): + path.parent.mkdir(parents=True, exist_ok=True) + context.add_output_metadata( + metadata={ + "geometry_metadata_path": str(full_paths["metadata"]), + "flatgeobuf_path": str(full_paths["flatgeobuf"]), + "pmtiles_path": str(full_paths["pmtiles"]), + "geojsonseq_path": str(full_paths["geojsonseq"]), + "names_path": str(full_paths["names"]), + } + ) + + metadata_df, gdf, names_df = obj + self.dump_to_path(context, metadata_df.to_parquet(None), full_paths["metadata"]) + self.dump_to_path( + context, + self.geo_df_to_bytes(context, gdf, "flatgeobuf"), + full_paths["flatgeobuf"], + ) + self.dump_to_path( + context, + self.geo_df_to_bytes(context, gdf, "geojsonseq"), + full_paths["geojsonseq"], + ) + # TODO: generate pmtiles + self.dump_to_path(context, names_df.to_parquet(None), full_paths["names"]) diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py index f454d53..812feff 100644 --- a/python/popgetter/io_managers/local.py +++ b/python/popgetter/io_managers/local.py @@ -2,11 +2,12 @@ import os +import geopandas as gpd import pandas as pd from dagster import OutputContext from upath import UPath -from . import TopLevelMetadataIOManager, TopLevelGeometryIOManager +from . import TopLevelGeometryIOManager, TopLevelMetadataIOManager class DagsterHomeMixin: From 7dc103d9b52a5f9c920df198d995ddefde98d39b Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 May 2024 07:16:38 +0100 Subject: [PATCH 14/35] Azure IO manager, top level Azure metadata IO manager --- python/popgetter/__init__.py | 20 +--- python/popgetter/io_managers/__init__.py | 33 +++--- python/popgetter/io_managers/azure.py | 145 ++++++++--------------- 3 files changed, 73 insertions(+), 125 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 48e1591..43547a8 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -3,7 +3,10 @@ from collections.abc import Sequence from pathlib import Path -from popgetter.io_managers.azure import AzureGeoIOManager +from popgetter.io_managers.azure import ( + AzureGeoIOManager, + AzureTopLevelMetadataIOManager, +) from popgetter.io_managers.local import ( LocalGeometryIOManager, LocalTopLevelMetadataIOManager, @@ -33,10 +36,8 @@ from dagster._core.definitions.unresolved_asset_job_definition import ( UnresolvedAssetJobDefinition, ) -from dagster_azure.adls2 import adls2_resource from popgetter import assets, cloud_outputs -from popgetter.azure.azure_io_manager import adls2_io_manager all_assets: Sequence[AssetsDefinition | SourceAsset | CacheableAssetsDefinition] = [ *load_assets_from_package_module(assets.us, group_name="us"), @@ -66,18 +67,7 @@ resources_by_env = { "prod": { - "publishing_io_manager": adls2_io_manager.configured( - { - "adls2_file_system": os.getenv("AZURE_CONTAINER"), - "adls2_prefix": os.getenv("AZURE_DIRECTORY"), - } - ), - "adls2": adls2_resource.configured( - { - "storage_account": os.getenv("AZURE_STORAGE_ACCOUNT"), - "credential": {"sas": os.getenv("SAS_TOKEN")}, - } - ), + "publishing_io_manager": AzureTopLevelMetadataIOManager(), "geometry_io_manager": AzureGeoIOManager(), }, "dev": { diff --git 
a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 3212418..b0e4f4f 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -1,14 +1,16 @@ from __future__ import annotations +from typing import ClassVar + import geopandas as gpd import pandas as pd -from dagster import ConfigurableIOManager, InputContext, IOManager, OutputContext +from dagster import InputContext, IOManager, OutputContext from icecream import ic from upath import UPath -class TopLevelMetadataIOManager(ConfigurableIOManager): - output_filenames: dict[str, str] = { +class TopLevelMetadataIOManager(IOManager): + output_filenames: ClassVar[dict[str, str]] = { "country_metadata": "country_metadata.parquet", "data_publisher": "data_publishers.parquet", "source_data_release": "source_data_releases.parquet", @@ -20,18 +22,18 @@ def get_relative_path(self, context: OutputContext) -> UPath: path_components = list(context.asset_key.path) path_components[-1] = self.output_filenames[path_components[-1]] return UPath("/".join(path_components)) - except KeyError: + except KeyError as err: err_msg = f"Only the asset keys {','.join(self.output_filenames.keys())} are compatible with this" - raise ValueError(err_msg) + raise ValueError(err_msg) from err def to_binary(self, obj: pd.DataFrame) -> bytes: return obj.to_parquet(None) - def load_input(self, context: InputContext) -> pd.DataFrame: - raise RuntimeError("This IOManager is only for writing outputs") + def load_input(self, _context: InputContext) -> pd.DataFrame: + err_msg = "This IOManager is only for writing outputs" + raise RuntimeError(err_msg) -# class TopLevelGeometryIOManager(ConfigurableIOManager): class TopLevelGeometryIOManager(IOManager): def get_relative_paths( self, @@ -41,20 +43,21 @@ def get_relative_paths( filename_stem = obj[0].iloc[0]["filename_stem"] asset_prefix = list(context.asset_key.path[:-1]) # e.g. ['be'] return { - "metadata": "/".join(asset_prefix + ["geometry_metadata.parquet"]), + "metadata": "/".join([*asset_prefix, "geometry_metadata.parquet"]), "flatgeobuf": "/".join( - asset_prefix + ["geometries", f"{filename_stem}.fgb"] + [*asset_prefix, "geometries", f"{filename_stem}.fgb"] ), "pmtiles": "/".join( - asset_prefix + ["geometries", f"{filename_stem}.pmtiles"] + [*asset_prefix, "geometries", f"{filename_stem}.pmtiles"] ), "geojsonseq": "/".join( - asset_prefix + ["geometries", f"{filename_stem}.geojsonseq"] + [*asset_prefix, "geometries", f"{filename_stem}.geojsonseq"] ), "names": "/".join( - asset_prefix + ["geometries", f"{filename_stem}.parquet"] + [*asset_prefix, "geometries", f"{filename_stem}.parquet"] ), } - def load_input(self, context: InputContext) -> pd.DataFrame: - raise RuntimeError("This IOManager is only for writing outputs") + def load_input(self, _context: InputContext) -> pd.DataFrame: + err_msg = "This IOManager is only for writing outputs" + raise RuntimeError(err_msg) diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py index ba71367..37f1579 100644 --- a/python/popgetter/io_managers/azure.py +++ b/python/popgetter/io_managers/azure.py @@ -18,17 +18,14 @@ ) from dagster import ( Any, - InputContext, - MultiPartitionKey, + IOManager, OutputContext, ) from dagster_azure.adls2.utils import ResourceNotFoundError from icecream import ic from upath import UPath -from ..azure import ADLS2InnerIOManager -from . import TopLevelMetadataIOManager -from .local import DagsterHomeMixin, TopLevelGeometryIOManager +from . 
import TopLevelGeometryIOManager, TopLevelMetadataIOManager # Note: this might need to be longer for some large files, but there is an issue with header's not matching _LEASE_DURATION = 60 # One minute @@ -38,74 +35,22 @@ _CONNECTION_TIMEOUT = 6000 -class AzureTopLevelMetadataIOManager(TopLevelMetadataIOManager, ADLS2InnerIOManager): - extension: str = ".parquet" - - def handle_output(self, context: OutputContext, obj: pd.DataFrame) -> None: - rel_path = self.get_relative_path(context) - full_path = self.get_base_path() / rel_path - full_path.parent.mkdir(parents=True, exist_ok=True) - context.add_output_metadata(metadata={"parquet_path": str(full_path)}) - self.dump_to_path(context, self.to_binary(obj), full_path) - - def _get_path_without_extension( - self, context: InputContext | OutputContext - ) -> UPath: - if context.has_asset_key: - context_path = self.get_asset_relative_path(context) - else: - # we are dealing with an op output - context_path = self.get_op_output_relative_path(context) - - return self._base_path.joinpath(context_path) - - def _get_paths_for_partitions( - self, context: InputContext | OutputContext - ) -> dict[str, UPath]: - """Returns a dict of partition_keys into I/O paths for a given context.""" - if not context.has_asset_partitions: - error = ( - f"Detected {context.dagster_type.typing_type} input type " - "but the asset is not partitioned" - ) - raise TypeError(error) - - def _formatted_multipartitioned_path(partition_key: MultiPartitionKey) -> str: - ordered_dimension_keys = [ - key[1] - for key in sorted( - partition_key.keys_by_dimension.items(), key=lambda x: x[0] - ) - ] - return "/".join(ordered_dimension_keys) - - formatted_partition_keys = { - partition_key: ( - _formatted_multipartitioned_path(partition_key) - if isinstance(partition_key, MultiPartitionKey) - else partition_key - ) - for partition_key in context.asset_partition_keys - } - - asset_path = self._get_path_without_extension(context) - return { - partition_key: self._with_extension( - self.get_path_for_partition(context, asset_path, partition) - ) - for partition_key, partition in formatted_partition_keys.items() - } - - -class AzureGeoIOManager(TopLevelGeometryIOManager, DagsterHomeMixin): - storage: str = os.getenv("AZURE_STORAGE_ACCOUNT", "") - container: str = os.getenv("AZURE_CONTAINER", "") - prefix: str = os.getenv("AZURE_DIRECTORY", "") - sas_token: str = os.getenv("SAS_TOKEN", "") +class AzureIOManager(IOManager): + storage_account: str | None = os.getenv("AZURE_STORAGE_ACCOUNT") + container: str | None = os.getenv("AZURE_CONTAINER") + prefix: str | None = os.getenv("AZURE_DIRECTORY") + sas_token: str | None = os.getenv("SAS_TOKEN") adls2_client: DataLakeServiceClient file_system_client: FileSystemClient def __init__(self): + if self.storage_account is None: + err_msg = "Storage account needs to be provided." + raise ValueError(err_msg) + if self.sas_token is None: + err_msg = "Credenital (SAS) needs to be provided." 
+ raise ValueError(err_msg) + def _create_url(storage_account, subdomain): return f"https://{storage_account}.{subdomain}.core.windows.net/" @@ -117,7 +62,7 @@ def create_adls2_client( return DataLakeServiceClient(account_url, credential) self.adls2_client = create_adls2_client( - self.storage, AzureSasCredential(self.sas_token) + self.storage_account, AzureSasCredential(self.sas_token) ) self.file_system_client = self.adls2_client.get_file_system_client( self.container @@ -130,36 +75,13 @@ def create_adls2_client( self.lease_duration = _LEASE_DURATION self.file_system_client.get_file_system_properties() - def get_base_path_local(self) -> UPath: + def get_base_path(self) -> UPath: return UPath(self.prefix) @property def lease_client_constructor(self) -> Any: return DataLakeLeaseClient - @staticmethod - def geo_df_to_bytes(context, geo_df: gpd.GeoDataFrame, output_type: str) -> bytes: - tmp = tempfile.NamedTemporaryFile() - if output_type.lower() == "parquet": - fname = tmp.name + ".parquet" - geo_df.to_parquet(fname) - elif output_type.lower() == "flatgeobuf": - fname = tmp.name + ".fgb" - geo_df.to_file(fname, driver="FlatGeobuf") - elif output_type.lower() == "geojsonseq": - fname = tmp.name + ".geojsonseq" - geo_df.to_file(fname, driver="GeoJSONSeq") - elif output_type.lower() == "pmtiles": - error = "pmtiles not currently implemented" - raise ValueError(error) - else: - value_error: str = f"'{output_type}' is not currently supported." - raise ValueError(value_error) - with Path(fname).open(mode="rb") as f: - b = f.read() - context.log.debug(ic(f"Size: {len(b) / (1_024 * 1_024):.3f}MB")) - return b - def _uri_for_path(self, path: UPath, protocol: str = "abfss://") -> str: return "{protocol}{filesystem}@{account}.dfs.core.windows.net/{key}".format( protocol=protocol, @@ -215,13 +137,46 @@ def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]: if not is_rm: lease_client.release() + +class AzureTopLevelMetadataIOManager(TopLevelMetadataIOManager, AzureIOManager): + def handle_output(self, context: OutputContext, df: pd.DataFrame) -> None: + rel_path = self.get_relative_path(context) + full_path = self.get_base_path() / rel_path + context.add_output_metadata(metadata={"parquet_path": str(full_path)}) + self.dump_to_path(context, df.to_parquet(None), full_path) + + +class AzureGeoIOManager(TopLevelGeometryIOManager, AzureIOManager): + @staticmethod + def geo_df_to_bytes(context, geo_df: gpd.GeoDataFrame, output_type: str) -> bytes: + tmp = tempfile.NamedTemporaryFile() + if output_type.lower() == "parquet": + fname = tmp.name + ".parquet" + geo_df.to_parquet(fname) + elif output_type.lower() == "flatgeobuf": + fname = tmp.name + ".fgb" + geo_df.to_file(fname, driver="FlatGeobuf") + elif output_type.lower() == "geojsonseq": + fname = tmp.name + ".geojsonseq" + geo_df.to_file(fname, driver="GeoJSONSeq") + elif output_type.lower() == "pmtiles": + err_msg = "pmtiles not currently implemented" + raise ValueError(err_msg) + else: + value_error: str = f"'{output_type}' is not currently supported." 
+ raise ValueError(value_error) + with Path(fname).open(mode="rb") as f: + b: bytes = f.read() + context.log.debug(ic(f"Size: {len(b) / (1_024 * 1_024):.3f}MB")) + return b + def handle_output( self, context: OutputContext, obj: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], ) -> None: rel_paths = self.get_relative_paths(context, obj) - base_path = self.get_base_path_local() + base_path = self.get_base_path() full_paths = {key: base_path / rel_path for key, rel_path in rel_paths.items()} for path in full_paths.values(): path.parent.mkdir(parents=True, exist_ok=True) From 993ed73c9d93ddcdafd890acd3b1925663c3b63d Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 May 2024 07:19:20 +0100 Subject: [PATCH 15/35] Remove obsolete Azure IO manager for partitioned assets --- python/popgetter/azure/__init__.py | 3 - python/popgetter/azure/azure_io_manager.py | 385 --------------------- 2 files changed, 388 deletions(-) delete mode 100644 python/popgetter/azure/__init__.py delete mode 100644 python/popgetter/azure/azure_io_manager.py diff --git a/python/popgetter/azure/__init__.py b/python/popgetter/azure/__init__.py deleted file mode 100644 index 1871aba..0000000 --- a/python/popgetter/azure/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from __future__ import annotations - -from .azure_io_manager import * diff --git a/python/popgetter/azure/azure_io_manager.py b/python/popgetter/azure/azure_io_manager.py deleted file mode 100644 index ba3583d..0000000 --- a/python/popgetter/azure/azure_io_manager.py +++ /dev/null @@ -1,385 +0,0 @@ -""" -ADLS2IOManager without pickling based on: -https://docs.dagster.io/_modules/dagster_azure/adls2/io_manager#ADLS2PickleIOManager - -""" -from __future__ import annotations - -import tempfile -import uuid -from collections.abc import Iterator -from contextlib import contextmanager -from pathlib import Path -from typing import Any - -import geopandas as gpd -import pandas as pd -from dagster import ( - InputContext, - OutputContext, - ResourceDependency, - io_manager, -) -from dagster import ( - _check as check, -) -from dagster._config.pythonic_config import ConfigurableIOManager -from dagster._core.storage.io_manager import dagster_maintained_io_manager -from dagster._core.storage.upath_io_manager import UPathIOManager -from dagster._utils.cached_method import cached_method -from dagster_azure.adls2.resources import ADLS2Resource -from dagster_azure.adls2.utils import ResourceNotFoundError -from pydantic import Field -from upath import UPath - -# Note: this might need to be longer for some large files, but there is an issue with header's not matching -_LEASE_DURATION = 60 # One minute - -# Set connection timeout to be larger than default: -# https://github.com/Azure/azure-sdk-for-python/issues/26993#issuecomment-1289799860 -_CONNECTION_TIMEOUT = 6000 - - -class ADLS2InnerIOManager(UPathIOManager): - def __init__( - self, - file_system: Any, - adls2_client: Any, - blob_client: Any, - lease_client_constructor: Any, - prefix: str = "dagster", - extension: str | None = None, - output_type: str = "metadata", - ): - self.adls2_client = adls2_client - self.file_system_client = self.adls2_client.get_file_system_client(file_system) - # We also need a blob client to handle copying as ADLS doesn't have a copy API yet - self.blob_client = blob_client - self.blob_container_client = self.blob_client.get_container_client(file_system) - self.prefix = check.str_param(prefix, "prefix") - - self.lease_client_constructor = lease_client_constructor - self.lease_duration = 
_LEASE_DURATION - self.file_system_client.get_file_system_properties() - self.extension = extension - self.output_type = output_type - - super().__init__(base_path=UPath(self.prefix)) - - def get_op_output_relative_path( - self, context: InputContext | OutputContext - ) -> UPath: - parts = context.get_identifier() - run_id = parts[0] - output_parts = parts[1:] - return UPath("storage", run_id, "files", *output_parts) - - def get_loading_input_log_message(self, path: UPath) -> str: - return f"Loading ADLS2 object from: {self._uri_for_path(path)}" - - def get_writing_output_log_message(self, path: UPath) -> str: - return f"Writing ADLS2 object at: {self._uri_for_path(path)}" - - def unlink(self, path: UPath) -> None: - file_client = self.file_system_client.get_file_client(path.as_posix()) - with self._acquire_lease(file_client, is_rm=True) as lease: - file_client.delete_file(lease=lease, recursive=True) - - def make_directory(self, path: UPath) -> None: # noqa: ARG002 - # It is not necessary to create directories in ADLS2 - return None - - def path_exists(self, path: UPath) -> bool: - try: - self.file_system_client.get_file_client( - path.as_posix() - ).get_file_properties() - except ResourceNotFoundError: - return False - return True - - def _uri_for_path(self, path: UPath, protocol: str = "abfss://") -> str: - return "{protocol}{filesystem}@{account}.dfs.core.windows.net/{key}".format( - protocol=protocol, - filesystem=self.file_system_client.file_system_name, - account=self.file_system_client.account_name, - key=path.as_posix(), - ) - - @contextmanager - def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]: - lease_client = self.lease_client_constructor(client=client) - try: - lease_client.acquire(lease_duration=self.lease_duration) - yield lease_client.id - finally: - # cannot release a lease on a file that no longer exists, so need to check - if not is_rm: - lease_client.release() - - def load_from_path(self, context: InputContext, path: UPath) -> bytes | None: - if context.dagster_type.typing_type == type(None): - return None - file = self.file_system_client.get_file_client(path.as_posix()) - stream = file.download_file() - return stream.readall() - - def dump_to_path(self, context: OutputContext, obj: bytes, path: UPath) -> None: - if self.path_exists(path): - context.log.warning(f"Removing existing ADLS2 key: {path}") # noqa: G004 - self.unlink(path) - - file = self.file_system_client.create_file(path.as_posix()) - with self._acquire_lease(file) as lease: - # Note: chunk_size can also be specified, see API for Azure SDK for Python, DataLakeFileClient: - # https://learn.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.datalakefileclient - file.upload_data( - obj, - lease=lease, - overwrite=True, - connection_timeout=_CONNECTION_TIMEOUT, - ) - - def to_binary(self, obj: pd.DataFrame) -> bytes: - return obj.to_parquet(None) - - def _get_path(self, context: InputContext | OutputContext) -> UPath: - # try: - # path_components = list(context.asset_key.path) - # path_components[-1] = self.output_filenames[path_components[-1]] - # return UPath("/".join([self.prefix, *path_components])) - # except KeyError: - # err_msg = f"Only the asset keys {','.join(self.output_filenames.keys())} are compatible with this" - # raise ValueError(err_msg) - return UPath("/".join(context.asset_key.path)) - - def _get_paths_for_partitions( - self, context: InputContext | OutputContext - ) -> dict[str, UPath]: - # TODO: need to override this method with 
correct extension - return super()._get_paths_for_partitions(context) - - def handle_output(self, context: OutputContext, obj: Any): - # TODO: convert to bytes - # TODO: consider handling the conversion from an obj dependent on output metadata: - # - specify geography output type - # - specify whether DataFrame or GeoDataFrame - # self._internal_io_manager.handle_output(context, obj) - if self.output_type == "metadata": - obj_bytes = obj.to_parquet(None) - elif self.output_type.lower() == "parquet": - obj_bytes = df_to_bytes(obj, "parquet") - elif self.output_type.lower() == "flatgeobuf": - obj_bytes = df_to_bytes(obj, "flatgeobuf") - elif self.output_type.lower() == "geojsonseq": - obj_bytes = df_to_bytes(obj, "geojsonseq") - else: - value_error: str = f"'{self.output_type}' is not currently supported." - raise ValueError(value_error) - return super().handle_output(context, obj_bytes) - - -class ADLS2IOManager(ConfigurableIOManager): - """Persistent IO manager using Azure Data Lake Storage Gen2 for storage. - - Serializes objects via pickling. Suitable for objects storage for distributed executors, so long - as each execution node has network connectivity and credentials for ADLS and the backing - container. - - Assigns each op output to a unique filepath containing run ID, step key, and output name. - Assigns each asset to a single filesystem path, at "/". If the asset key - has multiple components, the final component is used as the name of the file, and the preceding - components as parent directories under the base_dir. - - Subsequent materializations of an asset will overwrite previous materializations of that asset. - With a base directory of "/my/base/path", an asset with key - `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory - with path "/my/base/path/one/two/". - - Example usage: - - 1. Attach this IO manager to a set of assets. - - .. code-block:: python - - from dagster import Definitions, asset - from dagster_azure.adls2 import adls2_resource - from popgetter.azure import ADLS2IOManager - - - @asset - def asset1(): - # create df ... - return df - - - @asset - def asset2(asset1): - return df[:5] - - - defs = Definitions( - assets=[asset1, asset2], - resources={ - "io_manager": ADLS2IOManager( - adls2_file_system="my-cool-fs", adls2_prefix="my-cool-prefix" - ), - "adls2": adls2_resource, - }, - ) - - - 2. Attach this IO manager to your job to make it available to your ops. - - .. code-block:: python - - from dagster import job - from dagster_azure.adls2 import adls2_resource - from popgetter.azure import ADLS2IOManager - - - @job( - resource_defs={ - "io_manager": ADLS2IOManager( - adls2_file_system="my-cool-fs", adls2_prefix="my-cool-prefix" - ), - "adls2": adls2_resource, - }, - ) - def my_job(): - ... - """ - - adls2: ResourceDependency[ADLS2Resource] - adls2_file_system: str = Field(description="ADLS Gen2 file system name.") - adls2_prefix: str = Field( - default="dagster", description="ADLS Gen2 file system prefix to write to." 
- ) - output_type: str = Field(description="Output type: metadata, geometry, metric") - - @classmethod - def _is_dagster_maintained(cls) -> bool: - return True - - @property - @cached_method - def _internal_io_manager(self) -> ADLS2InnerIOManager: - return ADLS2InnerIOManager( - self.adls2_file_system, - self.adls2.adls2_client, - self.adls2.blob_client, - self.adls2.lease_client_constructor, - self.adls2_prefix, - ) - - def load_input(self, context: InputContext) -> Any: - return self._internal_io_manager.load_input(context) - - def handle_output(self, context: OutputContext, obj: Any) -> None: - self._internal_io_manager.handle_output(context, obj) - - -def df_to_bytes(df: gpd.GeoDataFrame, output_type: str) -> bytes: - tmp = tempfile.NamedTemporaryFile(prefix=str(uuid.uuid4())) - if output_type.lower() == "parquet": - df.to_parquet(tmp.name + ".parquet") - elif output_type.lower() == "flatgeobuf": - df.to_file(tmp.name + ".flatgeobuf", driver="FlatGeobuf") - elif output_type.lower() == "geojsonseq": - df.to_file(tmp.name + ".geojsonseq", driver="GeoJSONSeq") - else: - value_error: str = f"'{output_type}' is not currently supported." - raise ValueError(value_error) - with Path(tmp.name).open(mode="rb") as f: - return f.read() - - -@dagster_maintained_io_manager -@io_manager( - config_schema=ADLS2IOManager.to_config_schema(), - required_resource_keys={"adls2"}, -) -def adls2_io_manager(init_context): - """Persistent IO manager using Azure Data Lake Storage Gen2 for storage. - - Serializes objects via pickling. Suitable for objects storage for distributed executors, so long - as each execution node has network connectivity and credentials for ADLS and the backing - container. - - Assigns each op output to a unique filepath containing run ID, step key, and output name. - Assigns each asset to a single filesystem path, at "/". If the asset key - has multiple components, the final component is used as the name of the file, and the preceding - components as parent directories under the base_dir. - - Subsequent materializations of an asset will overwrite previous materializations of that asset. - With a base directory of "/my/base/path", an asset with key - `AssetKey(["one", "two", "three"])` would be stored in a file called "three" in a directory - with path "/my/base/path/one/two/". - - Example usage: - - 1. Attach this IO manager to a set of assets. - - .. code-block:: python - - from dagster import Definitions, asset - from dagster_azure.adls2 import adls2_resource - from popgetter.azure import adls2_io_manager - - - @asset - def asset1(): - # create df ... - return df - - - @asset - def asset2(asset1): - return df[:5] - - - defs = Definitions( - assets=[asset1, asset2], - resources={ - "io_manager": adls2_io_manager.configured( - {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"} - ), - "adls2": adls2_resource, - }, - ) - - - 2. Attach this IO manager to your job to make it available to your ops. - - .. code-block:: python - - from dagster import job - from dagster_azure.adls2 import adls2_resource - from popgetter.azure import adls2_io_manager - - - @job( - resource_defs={ - "io_manager": adls2_io_manager.configured( - {"adls2_file_system": "my-cool-fs", "adls2_prefix": "my-cool-prefix"} - ), - "adls2": adls2_resource, - }, - ) - def my_job(): - ... 
- """ - adls_resource = init_context.resources.adls2 - adls2_client = adls_resource.adls2_client - blob_client = adls_resource.blob_client - lease_client = adls_resource.lease_client_constructor - - return ADLS2InnerIOManager( - init_context.resource_config["adls2_file_system"], - adls2_client, - blob_client, - lease_client, - init_context.resource_config.get("adls2_prefix"), - init_context.resource_config.get("extension", None), - init_context.resource_config.get("output_type", "metadata"), - ) From cf1ee547343e17f193deff4f3950eab3619a6620 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 May 2024 07:38:00 +0100 Subject: [PATCH 16/35] Add blob client, import adls constructor fn --- python/popgetter/io_managers/azure.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py index 37f1579..546f7b6 100644 --- a/python/popgetter/io_managers/azure.py +++ b/python/popgetter/io_managers/azure.py @@ -21,7 +21,8 @@ IOManager, OutputContext, ) -from dagster_azure.adls2.utils import ResourceNotFoundError +from dagster_azure.adls2.utils import ResourceNotFoundError, create_adls2_client +from dagster_azure.blob.utils import create_blob_client from icecream import ic from upath import UPath @@ -50,16 +51,9 @@ def __init__(self): if self.sas_token is None: err_msg = "Credenital (SAS) needs to be provided." raise ValueError(err_msg) - - def _create_url(storage_account, subdomain): - return f"https://{storage_account}.{subdomain}.core.windows.net/" - - def create_adls2_client( - storage_account: str, credential - ) -> DataLakeServiceClient: - """Create an ADLS2 client.""" - account_url = _create_url(storage_account, "dfs") - return DataLakeServiceClient(account_url, credential) + if self.container is None: + err_msg = "Container needs to be provided." 
+ raise ValueError(err_msg) self.adls2_client = create_adls2_client( self.storage_account, AzureSasCredential(self.sas_token) @@ -67,10 +61,13 @@ def create_adls2_client( self.file_system_client = self.adls2_client.get_file_system_client( self.container ) - - # # We also need a blob client to handle copying as ADLS doesn't have a copy API yet - # self.blob_client = blob_client - # self.blob_container_client = self.blob_client.get_container_client(file_system) + # Blob client needed to handle copying as ADLS doesn't have a copy API yet + self.blob_client = create_blob_client( + self.storage_account, AzureSasCredential(self.sas_token) + ) + self.blob_container_client = self.blob_client.get_container_client( + self.container + ) self.lease_duration = _LEASE_DURATION self.file_system_client.get_file_system_properties() From 1dda12207fce117dac849f89aa396d16e04b22d7 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Wed, 15 May 2024 09:21:59 +0100 Subject: [PATCH 17/35] Update lease duration, add extension for general IO manager --- python/popgetter/__init__.py | 2 ++ python/popgetter/cloud_outputs.py | 6 +++--- python/popgetter/io_managers/azure.py | 25 +++++++++++++++++++++++-- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 43547a8..5eeab89 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -4,6 +4,7 @@ from pathlib import Path from popgetter.io_managers.azure import ( + AzureGeneralIOManager, AzureGeoIOManager, AzureTopLevelMetadataIOManager, ) @@ -67,6 +68,7 @@ resources_by_env = { "prod": { + "general_io_manager": AzureGeneralIOManager(".bin"), "publishing_io_manager": AzureTopLevelMetadataIOManager(), "geometry_io_manager": AzureGeoIOManager(), }, diff --git a/python/popgetter/cloud_outputs.py b/python/popgetter/cloud_outputs.py index 92f8980..8d53ca4 100644 --- a/python/popgetter/cloud_outputs.py +++ b/python/popgetter/cloud_outputs.py @@ -213,14 +213,14 @@ def upstream_df(context): raise ValueError(err_msg) -@asset(io_manager_key="publishing_io_manager") +@asset(io_manager_key="general_io_manager") def test_azure(): return pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}).to_parquet(None) -@asset(io_manager_key="publishing_io_manager") +@asset(io_manager_key="general_io_manager") def test_azure_large(): - return b"0" * (45 * 1024 * 1024 + 100) + return b"0" * (450 * 1024 * 1024 + 100) def df_to_bytes(df: gpd.GeoDataFrame, output_type: str) -> bytes: diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py index 546f7b6..4deff35 100644 --- a/python/popgetter/io_managers/azure.py +++ b/python/popgetter/io_managers/azure.py @@ -18,6 +18,7 @@ ) from dagster import ( Any, + InputContext, IOManager, OutputContext, ) @@ -28,8 +29,8 @@ from . 
import TopLevelGeometryIOManager, TopLevelMetadataIOManager -# Note: this might need to be longer for some large files, but there is an issue with header's not matching -_LEASE_DURATION = 60 # One minute +# Set no time limit on lease duration to enable large files to be uploaded +_LEASE_DURATION = -1 # Set connection timeout to be larger than default: # https://github.com/Azure/azure-sdk-for-python/issues/26993#issuecomment-1289799860 @@ -135,6 +136,26 @@ def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]: lease_client.release() +class AzureGeneralIOManager(AzureIOManager): + extension: str + + def __init__(self, extension: str | None = None): + super().__init__() + if extension is not None and not extension.startswith("."): + err_msg = f"Provided extension ('{extension}') does not begin with '.'" + raise ValueError(err_msg) + self.extension = "" if extension is None else extension + + def handle_output(self, context: OutputContext, obj: bytes) -> None: + path = self.get_base_path() / ".".join( + [*context.asset_key.path, self.extension] + ) + self.dump_to_path(context, obj, path) + + def load_input(self, context: InputContext) -> Any: + return super().load_input(context) + + class AzureTopLevelMetadataIOManager(TopLevelMetadataIOManager, AzureIOManager): def handle_output(self, context: OutputContext, df: pd.DataFrame) -> None: rel_path = self.get_relative_path(context) From 88bcab1a4a3b47c1d9946b90606360bc388d9b17 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Wed, 15 May 2024 15:40:49 +0100 Subject: [PATCH 18/35] Enable general_io_manager for both prod & dev --- python/popgetter/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 5eeab89..286aec5 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -68,7 +68,6 @@ resources_by_env = { "prod": { - "general_io_manager": AzureGeneralIOManager(".bin"), "publishing_io_manager": AzureTopLevelMetadataIOManager(), "geometry_io_manager": AzureGeoIOManager(), }, @@ -83,6 +82,7 @@ "staging_res": StagingDirResource( staging_dir=str(Path(__file__).parent.joinpath("staging_dir").resolve()) ), + "general_io_manager": AzureGeneralIOManager(".bin"), } resources.update(resources_by_env[os.getenv("ENV", "dev")]) From 85474e1a13363cb3ed5a3371bf62dd3d5f51504e Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Wed, 15 May 2024 17:10:19 +0100 Subject: [PATCH 19/35] Rework local IO managers to use new input types as discussed In particular: - TopLevelMetadata assets will return the actual classes instead of dataframes as this avoids unnecessary conversions between classes/dataframes (a common operation is extracting a field, e.g. an ID, in a downstream asset and while that can be done with df["field_name"].iloc[0] it's clenaer to just use class.field_name) - Geometry assets will return a list of (GeometryMetadata, gdf, names_df) because they cannot be separately partitioned The new IO managers have been disabled in this commit because I have introduced dependencies between some of the outputs. For example, the SourceDataRelease requires the ID of the GeometryMetadata which is constructed at runtime by the geometry asset. Since our IO managers only allow for output and not input, it is impossible to produce any assets that depend on those using the new IO managers. 
The way around this is to use the default IO managers for these assets themselves, and then create NEW assets which depend on these assets and exist solely to handle the output to local / Azure. Indeed, these 'new' assets can just be cloud sensors (which leads us VERY nicely into that bit of work which we were always going to do!) NOTE: This commit will break the Azure IO managers. They will be fixed soon. --- python/popgetter/assets/be/census_derived.py | 30 ++-- python/popgetter/assets/be/census_geometry.py | 135 +++++++++++------- python/popgetter/assets/be/census_tables.py | 46 ++---- python/popgetter/io_managers/__init__.py | 86 +++++++---- python/popgetter/io_managers/local.py | 72 +++++++--- 5 files changed, 220 insertions(+), 149 deletions(-) diff --git a/python/popgetter/assets/be/census_derived.py b/python/popgetter/assets/be/census_derived.py index 94eaf9f..9264da1 100644 --- a/python/popgetter/assets/be/census_derived.py +++ b/python/popgetter/assets/be/census_derived.py @@ -12,10 +12,10 @@ ) from icecream import ic -from popgetter.metadata import MetricMetadata +from popgetter.metadata import MetricMetadata, SourceDataRelease, metadata_to_dataframe from .belgium import asset_prefix -from .census_tables import dataset_node_partition, source +from .census_tables import dataset_node_partition _needed_dataset = [ { @@ -123,13 +123,15 @@ def needed_datasets(context) -> pd.DataFrame: return needed_df -def census_table_metadata(catalog_row: dict) -> MetricMetadata: +def make_census_table_metadata( + catalog_row: dict, source_data_release: SourceDataRelease +) -> MetricMetadata: return MetricMetadata( human_readable_name=catalog_row["human_readable_name"], source_download_url=catalog_row["source_download_url"], source_archive_file_path=catalog_row["source_archive_file_path"], source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id=source.id, + source_data_release_id=source_data_release.id, # TODO - this is a placeholder parent_metric_id="unknown_at_this_stage", potential_denominator_ids=None, @@ -179,7 +181,6 @@ def filter_needed_catalog( "individual_census_table": AssetIn( key_prefix=asset_prefix, partition_mapping=needed_dataset_mapping ), - # "individual_census_table": AssetIn(key_prefix=asset_prefix), "filter_needed_catalog": AssetIn(key_prefix=asset_prefix), }, outs={ @@ -227,14 +228,13 @@ def get_enriched_tables( result_mmd = census_table_metadata(catalog_row) - # pivot_data(context, result_df, catalog_row) - return result_df, result_mmd @multi_asset( partitions_def=dataset_node_partition, ins={ + "source_data_release": AssetIn(key_prefix=asset_prefix), "source_table": AssetIn( key_prefix=asset_prefix, partition_mapping=needed_dataset_mapping ), @@ -249,6 +249,7 @@ def get_enriched_tables( ) def pivot_data( context, + source_data_release: SourceDataRelease, source_table: dict[str, pd.DataFrame], source_mmd: dict[str, MetricMetadata], ) -> tuple[pd.DataFrame, list[MetricMetadata]]: @@ -290,6 +291,7 @@ def pivot_data( ) new_mmd = parent_mmd.copy() + new_mmd.source_data_release_id = source_data_release.id new_mmd.parent_metric_id = parent_mmd.source_metric_id new_mmd.hxl_tag = col_name new_mmds.append(new_mmd) @@ -304,11 +306,17 @@ def pivot_data( context.add_output_metadata( output_name="derived_table", metadata={ - "num_records": len(new_table), # Metadata can be any key-value pair - "columns": MetadataValue.md( - "\n".join([f"- '`{col}`'" for col in new_table.columns.to_list()]) + "num_records": len(new_table), + "metrics_preview": 
MetadataValue.md(new_table.head().to_markdown()), + }, + ) + context.add_output_metadata( + output_name="derived_mmds", + metadata={ + "num_records": len(new_mmds), + "metadata_preview": MetadataValue.md( + metadata_to_dataframe(new_mmds).head().to_markdown() ), - "preview": MetadataValue.md(new_table.head().to_markdown()), }, ) diff --git a/python/popgetter/assets/be/census_geometry.py b/python/popgetter/assets/be/census_geometry.py index 1538033..1e0b384 100644 --- a/python/popgetter/assets/be/census_geometry.py +++ b/python/popgetter/assets/be/census_geometry.py @@ -12,19 +12,16 @@ from icecream import ic from popgetter.utils import markdown_from_plot -from popgetter.metadata import GeometryMetadata, metadata_to_dataframe +from popgetter.metadata import ( + GeometryMetadata, + metadata_to_dataframe, + SourceDataRelease, +) from datetime import date from .belgium import asset_prefix from dataclasses import dataclass - -geometry_metadata: GeometryMetadata = GeometryMetadata( - validity_period_start=date(2023, 1, 1), - validity_period_end=date(2023, 12, 31), - level="municipality", - # country -> province -> region -> arrondisement -> municipality - hxl_tag="adm4", -) +from .census_tables import publisher @dataclass @@ -99,11 +96,10 @@ class BelgiumGeometryLevel: ), }, key_prefix=asset_prefix, - io_manager_key="geometry_io_manager", ) def geometry( context, sector_geometries -) -> tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame]: +) -> list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]]: """ Produces the full set of data / metadata associated with Belgian municipalities. The outputs, in order, are: @@ -113,55 +109,88 @@ def geometry( 3. A DataFrame containing the names of the municipalities (in this case, they are in Dutch, French, and German). 
""" - level_details = BELGIUM_GEOMETRY_LEVELS["municipality"] + geometries_to_return = [] + + for level_details in BELGIUM_GEOMETRY_LEVELS.values(): + geometry_metadata = GeometryMetadata( + validity_period_start=date(2023, 1, 1), + validity_period_end=date(2023, 12, 31), + level=level_details.level, + hxl_tag=level_details.hxl_tag, + ) - geometry_metadata = GeometryMetadata( - validity_period_start=date(2023, 1, 1), - validity_period_end=date(2023, 12, 31), - level=level_details.level, - hxl_tag=level_details.hxl_tag, - ) + region_geometries = ( + sector_geometries.dissolve(by=level_details.geo_id_column) + .reset_index() + .rename(columns={level_details.geo_id_column: "GEO_ID"}) + .loc[:, ["geometry", "GEO_ID"]] + ) + ic(region_geometries.head()) + + region_names = ( + sector_geometries.rename( + columns={ + level_details.geo_id_column: "GEO_ID", + level_details.name_columns["nld"]: "nld", + level_details.name_columns["fra"]: "fra", + level_details.name_columns["deu"]: "deu", + } + ) + .loc[:, ["GEO_ID", "nld", "fra", "deu"]] + .drop_duplicates() + ) + ic(region_names.head()) - region_geometries = ( - sector_geometries.dissolve(by=level_details.geo_id_column) - .reset_index() - .rename(columns={level_details.geo_id_column: "GEO_ID"}) - .loc[:, ["geometry", "GEO_ID"]] - ) - ic(region_geometries.head()) - - region_names = ( - sector_geometries.rename( - columns={ - level_details.geo_id_column: "GEO_ID", - level_details.name_columns["nld"]: "nld", - level_details.name_columns["fra"]: "fra", - level_details.name_columns["deu"]: "deu", - } + geometries_to_return.append( + (geometry_metadata, region_geometries, region_names) ) - .loc[:, ["GEO_ID", "nld", "fra", "deu"]] - .drop_duplicates() - ) - ic(region_names.head()) - # Generate a plot and convert the image to Markdown to preview it within - # Dagster - joined_gdf = region_geometries.merge(region_names, on="GEO_ID") - ax = joined_gdf.plot(column="nld", legend=False) - ax.set_title(f"Belgium 2023 {level_details.level}") + # Add output metadata + first_metadata, first_gdf, first_names = geometries_to_return[0] + first_joined_gdf = first_gdf.merge(first_names, on="GEO_ID") + ax = first_joined_gdf.plot(column="nld", legend=False) + ax.set_title(f"Belgium 2023 {first_metadata.level}") md_plot = markdown_from_plot(plt) - - geometry_metadata_df = metadata_to_dataframe([geometry_metadata]) - context.add_output_metadata( metadata={ - "num_records": len(region_geometries), - "geometry_plot": MetadataValue.md(md_plot), - "names_preview": MetadataValue.md(region_names.head().to_markdown()), - "metadata_preview": MetadataValue.md( - geometry_metadata_df.head().to_markdown() + "all_geom_levels": MetadataValue.md( + ",".join([metadata.level for metadata, _, _ in geometries_to_return]) ), - }, + "first_geometry_plot": MetadataValue.md(md_plot), + "first_names_preview": MetadataValue.md(first_names.head().to_markdown()), + } + ) + + return geometries_to_return + + +@asset(key_prefix=asset_prefix) +def source_data_release( + geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] +) -> SourceDataRelease: + """ + Returns the SourceDataRelease corresponding to the municipality level. 
+ """ + # Pick out the municipality geometry and get its ID + for geo_metadata, gdf, names_df in geometry: + if geo_metadata.level == "municipality": + geo_metadata_id = geo_metadata.id + break + else: + raise ValueError("No municipality geometry found") + + source = SourceDataRelease( + name="StatBel Open Data", + date_published=date(2015, 10, 22), + reference_period_start=date(2015, 10, 22), + reference_period_end=date(2015, 10, 22), + collection_period_start=date(2015, 10, 22), + collection_period_end=date(2015, 10, 22), + expect_next_update=date(2022, 1, 1), + url="https://statbel.fgov.be/en/open-data", + description="TBC", + data_publisher_id=publisher.id, + geometry_metadata_id=geo_metadata_id, ) - return geometry_metadata_df, region_geometries, region_names + return source diff --git a/python/popgetter/assets/be/census_tables.py b/python/popgetter/assets/be/census_tables.py index cef796b..e74be22 100644 --- a/python/popgetter/assets/be/census_tables.py +++ b/python/popgetter/assets/be/census_tables.py @@ -22,6 +22,7 @@ from rdflib.namespace import DCAT, DCTERMS, SKOS from popgetter.metadata import ( + CountryMetadata, DataPublisher, SourceDataRelease, GeometryMetadata, @@ -30,7 +31,7 @@ from popgetter.utils import extract_main_file_from_zip, markdown_from_plot from .belgium import asset_prefix, country -from .census_geometry import geometry_metadata + publisher: DataPublisher = DataPublisher( name="Statbel", @@ -41,45 +42,23 @@ opendata_catalog_root = URIRef("http://data.gov.be/catalog/statbelopen") -source: SourceDataRelease = SourceDataRelease( - name="StatBel Open Data", - date_published=date(2015, 10, 22), - reference_period_start=date(2015, 10, 22), - reference_period_end=date(2015, 10, 22), - collection_period_start=date(2015, 10, 22), - collection_period_end=date(2015, 10, 22), - expect_next_update=date(2022, 1, 1), - url="https://statbel.fgov.be/en/open-data", - description="TBC", - data_publisher_id=publisher.id, - geometry_metadata_id=geometry_metadata.id, -) - dataset_node_partition = DynamicPartitionsDefinition(name="dataset_nodes") -@asset(key_prefix=asset_prefix, io_manager_key="publishing_io_manager") -def country_metadata() -> pd.DataFrame: - """ - Returns a dataframe containing the CountryMetadata for this country. - """ - return metadata_to_dataframe([country]) - - -@asset(key_prefix=asset_prefix, io_manager_key="publishing_io_manager") -def data_publisher() -> pd.DataFrame: +@asset(key_prefix=asset_prefix) +def country_metadata() -> CountryMetadata: """ - Returns a dataframe containing the DataPublisher for this country. + Returns the CountryMetadata for this country. """ - return metadata_to_dataframe([publisher]) + return country -@asset(key_prefix=asset_prefix, io_manager_key="publishing_io_manager") -def source_data_release() -> pd.DataFrame: +@asset(key_prefix=asset_prefix) +def data_publisher() -> DataPublisher: """ - Returns a dataframe containing the SourceDataRelease for this country. + Returns the DataPublisher for this country. 
""" - return metadata_to_dataframe([source]) + return publisher @asset(key_prefix=asset_prefix) @@ -150,14 +129,13 @@ def catalog_as_dataframe(context, opendata_dataset_list: Graph) -> pd.DataFrame: ) ) + # This is unknown at this stage catalog_summary["metric_parquet_file_url"].append(None) catalog_summary["parquet_margin_of_error_column"].append(None) catalog_summary["parquet_margin_of_error_file"].append(None) catalog_summary["potential_denominator_ids"].append(None) catalog_summary["parent_metric_id"].append(None) - catalog_summary["source_data_release_id"].append(source.id) - - # This is unknown at this stage + catalog_summary["source_data_release_id"].append(None) catalog_summary["parquet_column_name"].append(None) download_url, archive_file_path, format = get_distribution_url( diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index b0e4f4f..df596bd 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -7,27 +7,45 @@ from dagster import InputContext, IOManager, OutputContext from icecream import ic from upath import UPath +from popgetter.metadata import ( + CountryMetadata, + DataPublisher, + SourceDataRelease, + GeometryMetadata, + MetricMetadata, + metadata_to_dataframe, +) class TopLevelMetadataIOManager(IOManager): - output_filenames: ClassVar[dict[str, str]] = { - "country_metadata": "country_metadata.parquet", - "data_publisher": "data_publishers.parquet", - "source_data_release": "source_data_releases.parquet", - } - - def get_relative_path(self, context: OutputContext) -> UPath: - try: - ic(context.asset_key.path) - path_components = list(context.asset_key.path) - path_components[-1] = self.output_filenames[path_components[-1]] - return UPath("/".join(path_components)) - except KeyError as err: - err_msg = f"Only the asset keys {','.join(self.output_filenames.keys())} are compatible with this" - raise ValueError(err_msg) from err - - def to_binary(self, obj: pd.DataFrame) -> bytes: - return obj.to_parquet(None) + def get_output_filename( + self, obj: CountryMetadata | DataPublisher | SourceDataRelease + ) -> str: + if isinstance(obj, CountryMetadata): + return "country_metadata.parquet" + elif isinstance(obj, DataPublisher): + return "data_publishers.parquet" + elif isinstance(obj, SourceDataRelease): + return "source_data_releases.parquet" + else: + err_msg = "This IO manager only accepts CountryMetadata, DataPublisher, and SourceDataRelease" + raise ValueError(err_msg) + + def get_relative_path( + self, + context: OutputContext, + obj: CountryMetadata | DataPublisher | SourceDataRelease, + ) -> UPath: + ic(context.asset_key.path) + path_prefixes = list(context.asset_key.path[:-1]) + filename = self.get_output_filename(obj) + return UPath("/".join([*path_prefixes, filename])) + + def to_binary( + self, obj: CountryMetadata | DataPublisher | SourceDataRelease + ) -> bytes: + df = metadata_to_dataframe([obj]) + return df.to_parquet(None) def load_input(self, _context: InputContext) -> pd.DataFrame: err_msg = "This IOManager is only for writing outputs" @@ -38,26 +56,32 @@ class TopLevelGeometryIOManager(IOManager): def get_relative_paths( self, context: OutputContext, - obj: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], - ) -> dict[str, str]: - filename_stem = obj[0].iloc[0]["filename_stem"] + geo_metadata: GeometryMetadata, + ) -> dict[str, UPath]: + filename_stem = geo_metadata.filename_stem asset_prefix = list(context.asset_key.path[:-1]) # e.g. 
['be'] return { - "metadata": "/".join([*asset_prefix, "geometry_metadata.parquet"]), - "flatgeobuf": "/".join( + "flatgeobuf": UPath("/".join( [*asset_prefix, "geometries", f"{filename_stem}.fgb"] - ), - "pmtiles": "/".join( - [*asset_prefix, "geometries", f"{filename_stem}.pmtiles"] - ), - "geojsonseq": "/".join( + )), + "pmtiles": UPath("/".join( + [*asset_prefix, "geometries", f"TODO_{filename_stem}.pmtiles"] + )), + "geojsonseq": UPath("/".join( [*asset_prefix, "geometries", f"{filename_stem}.geojsonseq"] - ), - "names": "/".join( + )), + "names": UPath("/".join( [*asset_prefix, "geometries", f"{filename_stem}.parquet"] - ), + )), } + def get_relative_path_for_metadata( + self, + context: OutputContext, + ) -> UPath: + asset_prefix = list(context.asset_key.path[:-1]) + return UPath("/".join([*asset_prefix, "geometry_metadata.parquet"])) + def load_input(self, _context: InputContext) -> pd.DataFrame: err_msg = "This IOManager is only for writing outputs" raise RuntimeError(err_msg) diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py index 812feff..1922791 100644 --- a/python/popgetter/io_managers/local.py +++ b/python/popgetter/io_managers/local.py @@ -4,10 +4,18 @@ import geopandas as gpd import pandas as pd -from dagster import OutputContext +from dagster import OutputContext, MetadataValue from upath import UPath from . import TopLevelGeometryIOManager, TopLevelMetadataIOManager +from popgetter.metadata import ( + CountryMetadata, + DataPublisher, + SourceDataRelease, + GeometryMetadata, + MetricMetadata, + metadata_to_dataframe, +) class DagsterHomeMixin: @@ -20,8 +28,12 @@ def get_base_path_local(self) -> UPath: class LocalTopLevelMetadataIOManager(TopLevelMetadataIOManager, DagsterHomeMixin): - def handle_output(self, context: OutputContext, obj: pd.DataFrame) -> None: - rel_path = self.get_relative_path(context) + def handle_output( + self, + context: OutputContext, + obj: CountryMetadata | DataPublisher | SourceDataRelease, + ) -> None: + rel_path = self.get_relative_path(context, obj) full_path = self.get_base_path_local() / rel_path full_path.parent.mkdir(parents=True, exist_ok=True) context.add_output_metadata(metadata={"parquet_path": str(full_path)}) @@ -33,26 +45,46 @@ class LocalGeometryIOManager(TopLevelGeometryIOManager, DagsterHomeMixin): def handle_output( self, context: OutputContext, - obj: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], + obj: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]], ) -> None: - rel_paths = self.get_relative_paths(context, obj) + output_metadata = { + "flatgeobuf_paths": [], + "pmtiles_paths": [], + "geojsonseq_paths": [], + "names_paths": [], + } base_path = self.get_base_path_local() - full_paths = {key: base_path / rel_path for key, rel_path in rel_paths.items()} - for path in full_paths.values(): - path.parent.mkdir(parents=True, exist_ok=True) + + for obj_component in obj: + geo_metadata, gdf, names_df = obj_component + + rel_paths = self.get_relative_paths(context, geo_metadata) + full_paths = { + key: base_path / rel_path for key, rel_path in rel_paths.items() + } + for path in full_paths.values(): + path.parent.mkdir(parents=True, exist_ok=True) + + output_metadata["flatgeobuf_paths"].append(str(full_paths["flatgeobuf"])) + output_metadata["pmtiles_paths"].append(str(full_paths["pmtiles"])) + output_metadata["geojsonseq_paths"].append(str(full_paths["geojsonseq"])) + output_metadata["names_paths"].append(str(full_paths["names"])) + + gdf.to_file(full_paths["flatgeobuf"], 
driver="FlatGeobuf") + gdf.to_file(full_paths["geojsonseq"], driver="GeoJSONSeq") + # TODO: generate pmtiles + names_df.to_parquet(full_paths["names"]) + + # Metadata has to be serialised separately since they all get put in + # the same path. + metadata_df_filepath = base_path / self.get_relative_path_for_metadata(context) + metadata_df = metadata_to_dataframe([md for md, _, _ in obj]) + metadata_df.to_parquet(metadata_df_filepath) + context.add_output_metadata( metadata={ - "geometry_metadata_path": str(full_paths["metadata"]), - "flatgeobuf_path": str(full_paths["flatgeobuf"]), - "pmtiles_path": str(full_paths["pmtiles"]), - "geojsonseq_path": str(full_paths["geojsonseq"]), - "names_path": str(full_paths["names"]), + "metadata_path": MetadataValue.md(str(metadata_df_filepath)), + "metadata_preview": MetadataValue.md(metadata_df.head().to_markdown()), + **output_metadata, } ) - - metadata_df, gdf, names_df = obj - metadata_df.to_parquet(full_paths["metadata"]) - gdf.to_file(full_paths["flatgeobuf"], driver="FlatGeobuf") - gdf.to_file(full_paths["geojsonseq"], driver="GeoJSONSeq") - # TODO: generate pmtiles - names_df.to_parquet(full_paths["names"]) From cdd5c70592441c57d0a9b09a398ac9b8c5f117fb Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Wed, 15 May 2024 18:30:50 +0100 Subject: [PATCH 20/35] Implement cloud output sensors for TLM + geometries --- python/popgetter/__init__.py | 9 ++- python/popgetter/cloud_outputs/__init__.py | 1 + python/popgetter/cloud_outputs/geometry.py | 69 ++++++++++++++++++ python/popgetter/cloud_outputs/metadata.py | 71 +++++++++++++++++++ ...{cloud_outputs.py => cloud_outputs_old.py} | 4 +- python/popgetter/io_managers/__init__.py | 32 ++++----- 6 files changed, 166 insertions(+), 20 deletions(-) create mode 100644 python/popgetter/cloud_outputs/__init__.py create mode 100644 python/popgetter/cloud_outputs/geometry.py create mode 100644 python/popgetter/cloud_outputs/metadata.py rename python/popgetter/{cloud_outputs.py => cloud_outputs_old.py} (98%) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 286aec5..0eff9a4 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -44,7 +44,9 @@ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), - *load_assets_from_modules([cloud_outputs], group_name="cloud_assets"), + *load_assets_from_package_module(cloud_outputs, group_name="cloud_outputs"), + # [cloud_outputs.metadata], group_name="cloud_outputs_metadata" + # ), ] job_be: UnresolvedAssetJobDefinition = define_asset_job( @@ -90,7 +92,10 @@ defs: Definitions = Definitions( assets=all_assets, schedules=[], - sensors=[cloud_outputs.country_outputs_sensor], + sensors=[ + cloud_outputs.metadata.metadata_sensor, + cloud_outputs.geometry.geometry_sensor, + ], resources=resources, jobs=[job_be, job_us, job_uk], ) diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py new file mode 100644 index 0000000..3081ae4 --- /dev/null +++ b/python/popgetter/cloud_outputs/__init__.py @@ -0,0 +1 @@ +from . 
import metadata, geometry diff --git a/python/popgetter/cloud_outputs/geometry.py b/python/popgetter/cloud_outputs/geometry.py new file mode 100644 index 0000000..21c41cf --- /dev/null +++ b/python/popgetter/cloud_outputs/geometry.py @@ -0,0 +1,69 @@ +from dagster import ( + multi_asset_sensor, + asset, + AssetSelection, + AssetExecutionContext, + AssetKey, + StaticPartitionsDefinition, + DefaultSensorStatus, + RunRequest, + Output, + define_asset_job, + load_assets_from_current_module, +) +from icecream import ic +from functools import reduce + +cloud_assets_geometry = load_assets_from_current_module( + group_name="cloud_assets_geometry" +) + +# TODO: is there a better way to do this than to manually list them here? +geometry_asset_names = [ + "be/geometry", +] +geometry_partition = StaticPartitionsDefinition(geometry_asset_names) +geometry_assets_to_monitor = reduce( + lambda x, y: x | y, + [AssetSelection.keys(k) for k in geometry_asset_names], +) + + +@asset( + partitions_def=geometry_partition, + io_manager_key="geometry_io_manager", +) +def publish_geometry(context: AssetExecutionContext): + # Get the output of the asset + from popgetter import defs as popgetter_defs + + ic(context.partition_key) + + # load_asset_value expects a list of strings + output = popgetter_defs.load_asset_value(context.partition_key.split("/")) + ic(output) + + return Output(output) + + +geometry_job = define_asset_job( + name="geometry_job", + selection="publish_geometry", + partitions_def=geometry_partition, +) + + +@multi_asset_sensor( + monitored_assets=geometry_assets_to_monitor, + job=geometry_job, + minimum_interval_seconds=10, + default_status=DefaultSensorStatus.RUNNING, +) +def geometry_sensor(context): + asset_events = context.latest_materialization_records_by_key() + for asset_key, execution_value in asset_events.items(): + yield RunRequest( + run_key=None, + partition_key="/".join(asset_key.path), + ) + context.advance_cursor({asset_key: execution_value}) diff --git a/python/popgetter/cloud_outputs/metadata.py b/python/popgetter/cloud_outputs/metadata.py new file mode 100644 index 0000000..76e0ea0 --- /dev/null +++ b/python/popgetter/cloud_outputs/metadata.py @@ -0,0 +1,71 @@ +from dagster import ( + multi_asset_sensor, + asset, + AssetSelection, + AssetExecutionContext, + AssetKey, + StaticPartitionsDefinition, + DefaultSensorStatus, + RunRequest, + Output, + define_asset_job, + load_assets_from_current_module, +) +from icecream import ic +from functools import reduce + +cloud_assets_metadata = load_assets_from_current_module( + group_name="cloud_assets_metadata" +) + +# TODO: is there a better way to do this than to manually list them here? 
+toplevel_metadata_asset_names = [ + "be/country_metadata", + "be/data_publisher", + "be/source_data_release", +] +toplevel_metadata_partition = StaticPartitionsDefinition(toplevel_metadata_asset_names) +toplevel_metadata_assets_to_monitor = reduce( + lambda x, y: x | y, + [AssetSelection.keys(k) for k in toplevel_metadata_asset_names], +) + + +@asset( + partitions_def=toplevel_metadata_partition, + io_manager_key="publishing_io_manager", +) +def publish_toplevel_metadata(context: AssetExecutionContext): + # Get the output of the asset + from popgetter import defs as popgetter_defs + + ic(context.partition_key) + + # load_asset_value expects a list of strings + output = popgetter_defs.load_asset_value(context.partition_key.split("/")) + ic(output) + + return Output(output) + + +toplevel_metadata_job = define_asset_job( + name="toplevel_metadata_job", + selection="publish_toplevel_metadata", + partitions_def=toplevel_metadata_partition, +) + + +@multi_asset_sensor( + monitored_assets=toplevel_metadata_assets_to_monitor, + job=toplevel_metadata_job, + minimum_interval_seconds=10, + default_status=DefaultSensorStatus.RUNNING, +) +def metadata_sensor(context): + asset_events = context.latest_materialization_records_by_key() + for asset_key, execution_value in asset_events.items(): + yield RunRequest( + run_key=None, + partition_key="/".join(asset_key.path), + ) + context.advance_cursor({asset_key: execution_value}) diff --git a/python/popgetter/cloud_outputs.py b/python/popgetter/cloud_outputs_old.py similarity index 98% rename from python/popgetter/cloud_outputs.py rename to python/popgetter/cloud_outputs_old.py index 8d53ca4..16c1679 100644 --- a/python/popgetter/cloud_outputs.py +++ b/python/popgetter/cloud_outputs_old.py @@ -170,8 +170,8 @@ def country_outputs_sensor(context): @asset( - # TODO: This feels like a code smell. (mixing my metaphors) - # It feels that this structure is duplicated and it ought + # todo: this feels like a code smell. (mixing my metaphors) + # it feels that this structure is duplicated and it ought # to be possible to have some reusable structure. config_schema={ "asset_to_load": str, diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index df596bd..2ea4169 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -36,8 +36,8 @@ def get_relative_path( context: OutputContext, obj: CountryMetadata | DataPublisher | SourceDataRelease, ) -> UPath: - ic(context.asset_key.path) - path_prefixes = list(context.asset_key.path[:-1]) + ic(context.partition_key) + path_prefixes = list(context.partition_key.split("/"))[:-1] filename = self.get_output_filename(obj) return UPath("/".join([*path_prefixes, filename])) @@ -59,27 +59,27 @@ def get_relative_paths( geo_metadata: GeometryMetadata, ) -> dict[str, UPath]: filename_stem = geo_metadata.filename_stem - asset_prefix = list(context.asset_key.path[:-1]) # e.g. ['be'] + asset_prefix = list(context.partition_key.split("/"))[:-1] # e.g. 
['be'] return { - "flatgeobuf": UPath("/".join( - [*asset_prefix, "geometries", f"{filename_stem}.fgb"] - )), - "pmtiles": UPath("/".join( - [*asset_prefix, "geometries", f"TODO_{filename_stem}.pmtiles"] - )), - "geojsonseq": UPath("/".join( - [*asset_prefix, "geometries", f"{filename_stem}.geojsonseq"] - )), - "names": UPath("/".join( - [*asset_prefix, "geometries", f"{filename_stem}.parquet"] - )), + "flatgeobuf": UPath( + "/".join([*asset_prefix, "geometries", f"{filename_stem}.fgb"]) + ), + "pmtiles": UPath( + "/".join([*asset_prefix, "geometries", f"TODO_{filename_stem}.pmtiles"]) + ), + "geojsonseq": UPath( + "/".join([*asset_prefix, "geometries", f"{filename_stem}.geojsonseq"]) + ), + "names": UPath( + "/".join([*asset_prefix, "geometries", f"{filename_stem}.parquet"]) + ), } def get_relative_path_for_metadata( self, context: OutputContext, ) -> UPath: - asset_prefix = list(context.asset_key.path[:-1]) + asset_prefix = list(context.partition_key.split("/"))[:-1] return UPath("/".join([*asset_prefix, "geometry_metadata.parquet"])) def load_input(self, _context: InputContext) -> pd.DataFrame: From e9d167aab6e3f04ac887ffd7565f97c7f661e995 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Wed, 15 May 2024 18:48:41 +0100 Subject: [PATCH 21/35] Update Azure IO managers to work with cloud sensors (to a certain extent) I'm very much unsure what happens if two IO managers attempt to use the lease at the same time. To get the geometry sensor to work, I had to lengthen the sensor interval to 60 seconds and also disable the metadata sensor. --- python/popgetter/cloud_outputs/geometry.py | 2 +- python/popgetter/io_managers/azure.py | 63 ++++++++++++---------- python/popgetter/io_managers/local.py | 4 +- 3 files changed, 38 insertions(+), 31 deletions(-) diff --git a/python/popgetter/cloud_outputs/geometry.py b/python/popgetter/cloud_outputs/geometry.py index 21c41cf..2dedd5a 100644 --- a/python/popgetter/cloud_outputs/geometry.py +++ b/python/popgetter/cloud_outputs/geometry.py @@ -56,7 +56,7 @@ def publish_geometry(context: AssetExecutionContext): @multi_asset_sensor( monitored_assets=geometry_assets_to_monitor, job=geometry_job, - minimum_interval_seconds=10, + minimum_interval_seconds=60, default_status=DefaultSensorStatus.RUNNING, ) def geometry_sensor(context): diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py index 4deff35..d29d975 100644 --- a/python/popgetter/io_managers/azure.py +++ b/python/popgetter/io_managers/azure.py @@ -28,6 +28,12 @@ from upath import UPath from . 
import TopLevelGeometryIOManager, TopLevelMetadataIOManager +from popgetter.metadata import ( + CountryMetadata, + DataPublisher, + SourceDataRelease, + metadata_to_dataframe, +) # Set no time limit on lease duration to enable large files to be uploaded _LEASE_DURATION = -1 @@ -157,10 +163,15 @@ def load_input(self, context: InputContext) -> Any: class AzureTopLevelMetadataIOManager(TopLevelMetadataIOManager, AzureIOManager): - def handle_output(self, context: OutputContext, df: pd.DataFrame) -> None: - rel_path = self.get_relative_path(context) + def handle_output( + self, + context: OutputContext, + obj: CountryMetadata | DataPublisher | SourceDataRelease, + ) -> None: + rel_path = self.get_relative_path(context, obj) full_path = self.get_base_path() / rel_path context.add_output_metadata(metadata={"parquet_path": str(full_path)}) + df = metadata_to_dataframe([obj]) self.dump_to_path(context, df.to_parquet(None), full_path) @@ -191,34 +202,32 @@ def geo_df_to_bytes(context, geo_df: gpd.GeoDataFrame, output_type: str) -> byte def handle_output( self, context: OutputContext, - obj: tuple[pd.DataFrame, gpd.GeoDataFrame, pd.DataFrame], + obj: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]], ) -> None: - rel_paths = self.get_relative_paths(context, obj) base_path = self.get_base_path() - full_paths = {key: base_path / rel_path for key, rel_path in rel_paths.items()} - for path in full_paths.values(): - path.parent.mkdir(parents=True, exist_ok=True) - context.add_output_metadata( - metadata={ - "geometry_metadata_path": str(full_paths["metadata"]), - "flatgeobuf_path": str(full_paths["flatgeobuf"]), - "pmtiles_path": str(full_paths["pmtiles"]), - "geojsonseq_path": str(full_paths["geojsonseq"]), - "names_path": str(full_paths["names"]), + + for geo_metadata, gdf, names_df in obj: + rel_paths = self.get_relative_paths(context, geo_metadata) + full_paths = { + key: base_path / rel_path for key, rel_path in rel_paths.items() } - ) - metadata_df, gdf, names_df = obj - self.dump_to_path(context, metadata_df.to_parquet(None), full_paths["metadata"]) - self.dump_to_path( - context, - self.geo_df_to_bytes(context, gdf, "flatgeobuf"), - full_paths["flatgeobuf"], - ) + self.dump_to_path( + context, + self.geo_df_to_bytes(context, gdf, "flatgeobuf"), + full_paths["flatgeobuf"], + ) + self.dump_to_path( + context, + self.geo_df_to_bytes(context, gdf, "geojsonseq"), + full_paths["geojsonseq"], + ) + # TODO: generate pmtiles + self.dump_to_path(context, names_df.to_parquet(None), full_paths["names"]) + + # Handle metadata separately since they all get put into one dataframe + metadata_df_filepath = base_path / self.get_relative_path_for_metadata(context) + metadata_df = metadata_to_dataframe([md for md, _, _ in obj]) self.dump_to_path( - context, - self.geo_df_to_bytes(context, gdf, "geojsonseq"), - full_paths["geojsonseq"], + context, metadata_df.to_parquet(None), metadata_df_filepath ) - # TODO: generate pmtiles - self.dump_to_path(context, names_df.to_parquet(None), full_paths["names"]) diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py index 1922791..001f465 100644 --- a/python/popgetter/io_managers/local.py +++ b/python/popgetter/io_managers/local.py @@ -55,9 +55,7 @@ def handle_output( } base_path = self.get_base_path_local() - for obj_component in obj: - geo_metadata, gdf, names_df = obj_component - + for geo_metadata, gdf, names_df in obj: rel_paths = self.get_relative_paths(context, geo_metadata) full_paths = { key: base_path / rel_path for key, 
rel_path in rel_paths.items() From e657a824bfe7257f08fba5633ff4c1ef0e80a610 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Wed, 15 May 2024 20:14:04 +0100 Subject: [PATCH 22/35] Rename IOManagers to Mixins, re-add Azure test jobs --- python/popgetter/__init__.py | 4 +- python/popgetter/cloud_outputs/__init__.py | 2 +- python/popgetter/cloud_outputs/azure_test.py | 12 +++++ python/popgetter/cloud_outputs_old.py | 10 ---- python/popgetter/io_managers/__init__.py | 18 +++---- python/popgetter/io_managers/azure.py | 57 ++++++++++---------- python/popgetter/io_managers/local.py | 16 +++--- 7 files changed, 61 insertions(+), 58 deletions(-) create mode 100644 python/popgetter/cloud_outputs/azure_test.py diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 0eff9a4..8dd0344 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -45,8 +45,6 @@ *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), *load_assets_from_package_module(cloud_outputs, group_name="cloud_outputs"), - # [cloud_outputs.metadata], group_name="cloud_outputs_metadata" - # ), ] job_be: UnresolvedAssetJobDefinition = define_asset_job( @@ -84,7 +82,7 @@ "staging_res": StagingDirResource( staging_dir=str(Path(__file__).parent.joinpath("staging_dir").resolve()) ), - "general_io_manager": AzureGeneralIOManager(".bin"), + "azure_general_io_manager": AzureGeneralIOManager(".bin"), } resources.update(resources_by_env[os.getenv("ENV", "dev")]) diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index 3081ae4..44f9fb0 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -1 +1 @@ -from . import metadata, geometry +from . 
import metadata, geometry, azure_test diff --git a/python/popgetter/cloud_outputs/azure_test.py b/python/popgetter/cloud_outputs/azure_test.py new file mode 100644 index 0000000..91a302c --- /dev/null +++ b/python/popgetter/cloud_outputs/azure_test.py @@ -0,0 +1,12 @@ +import pandas as pd +from dagster import asset + + +@asset(io_manager_key="azure_general_io_manager") +def test_azure(): + return pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}).to_parquet(None) + + +@asset(io_manager_key="azure_general_io_manager") +def test_azure_large(): + return b"0" * (450 * 1024 * 1024 + 100) diff --git a/python/popgetter/cloud_outputs_old.py b/python/popgetter/cloud_outputs_old.py index 16c1679..2f4dc6c 100644 --- a/python/popgetter/cloud_outputs_old.py +++ b/python/popgetter/cloud_outputs_old.py @@ -213,16 +213,6 @@ def upstream_df(context): raise ValueError(err_msg) -@asset(io_manager_key="general_io_manager") -def test_azure(): - return pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}).to_parquet(None) - - -@asset(io_manager_key="general_io_manager") -def test_azure_large(): - return b"0" * (450 * 1024 * 1024 + 100) - - def df_to_bytes(df: gpd.GeoDataFrame, output_type: str) -> bytes: tmp = tempfile.NamedTemporaryFile(prefix=str(uuid.uuid4())) if output_type.lower() == "parquet": diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 2ea4169..3f67c4a 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -17,7 +17,13 @@ ) -class TopLevelMetadataIOManager(IOManager): +class PopgetterIOManager(IOManager): + def load_input(self, _context: InputContext) -> pd.DataFrame: + err_msg = "This IOManager is only for writing outputs" + raise RuntimeError(err_msg) + + +class TopLevelMetadataMixin(): def get_output_filename( self, obj: CountryMetadata | DataPublisher | SourceDataRelease ) -> str: @@ -47,12 +53,8 @@ def to_binary( df = metadata_to_dataframe([obj]) return df.to_parquet(None) - def load_input(self, _context: InputContext) -> pd.DataFrame: - err_msg = "This IOManager is only for writing outputs" - raise RuntimeError(err_msg) - -class TopLevelGeometryIOManager(IOManager): +class GeometryMixin(): def get_relative_paths( self, context: OutputContext, @@ -81,7 +83,3 @@ def get_relative_path_for_metadata( ) -> UPath: asset_prefix = list(context.partition_key.split("/"))[:-1] return UPath("/".join([*asset_prefix, "geometry_metadata.parquet"])) - - def load_input(self, _context: InputContext) -> pd.DataFrame: - err_msg = "This IOManager is only for writing outputs" - raise RuntimeError(err_msg) diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py index d29d975..ae3427d 100644 --- a/python/popgetter/io_managers/azure.py +++ b/python/popgetter/io_managers/azure.py @@ -18,8 +18,8 @@ ) from dagster import ( Any, - InputContext, IOManager, + InputContext, OutputContext, ) from dagster_azure.adls2.utils import ResourceNotFoundError, create_adls2_client @@ -27,7 +27,7 @@ from icecream import ic from upath import UPath -from . import TopLevelGeometryIOManager, TopLevelMetadataIOManager +from . 
import PopgetterIOManager, TopLevelMetadataMixin, GeometryMixin from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -43,7 +43,7 @@ _CONNECTION_TIMEOUT = 6000 -class AzureIOManager(IOManager): +class AzureMixin: storage_account: str | None = os.getenv("AZURE_STORAGE_ACCOUNT") container: str | None = os.getenv("AZURE_CONTAINER") prefix: str | None = os.getenv("AZURE_DIRECTORY") @@ -142,27 +142,9 @@ def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]: lease_client.release() -class AzureGeneralIOManager(AzureIOManager): - extension: str - - def __init__(self, extension: str | None = None): - super().__init__() - if extension is not None and not extension.startswith("."): - err_msg = f"Provided extension ('{extension}') does not begin with '.'" - raise ValueError(err_msg) - self.extension = "" if extension is None else extension - - def handle_output(self, context: OutputContext, obj: bytes) -> None: - path = self.get_base_path() / ".".join( - [*context.asset_key.path, self.extension] - ) - self.dump_to_path(context, obj, path) - - def load_input(self, context: InputContext) -> Any: - return super().load_input(context) - - -class AzureTopLevelMetadataIOManager(TopLevelMetadataIOManager, AzureIOManager): +class AzureTopLevelMetadataIOManager( + AzureMixin, TopLevelMetadataMixin, PopgetterIOManager +): def handle_output( self, context: OutputContext, @@ -175,7 +157,7 @@ def handle_output( self.dump_to_path(context, df.to_parquet(None), full_path) -class AzureGeoIOManager(TopLevelGeometryIOManager, AzureIOManager): +class AzureGeoIOManager(AzureMixin, GeometryMixin, PopgetterIOManager): @staticmethod def geo_df_to_bytes(context, geo_df: gpd.GeoDataFrame, output_type: str) -> bytes: tmp = tempfile.NamedTemporaryFile() @@ -228,6 +210,27 @@ def handle_output( # Handle metadata separately since they all get put into one dataframe metadata_df_filepath = base_path / self.get_relative_path_for_metadata(context) metadata_df = metadata_to_dataframe([md for md, _, _ in obj]) - self.dump_to_path( - context, metadata_df.to_parquet(None), metadata_df_filepath + self.dump_to_path(context, metadata_df.to_parquet(None), metadata_df_filepath) + + +class AzureGeneralIOManager(AzureMixin, IOManager): + """This class is used only for an asset which tests the Azure functionality + (see cloud_outputs/azure_test.py). It is not used for publishing any + popgetter data.""" + extension: str + + def __init__(self, extension: str | None = None): + super().__init__() + if extension is not None and not extension.startswith("."): + err_msg = f"Provided extension ('{extension}') does not begin with '.'" + raise ValueError(err_msg) + self.extension = "" if extension is None else extension + + def handle_output(self, context: OutputContext, obj: bytes) -> None: + path = self.get_base_path() / ".".join( + [*context.asset_key.path, self.extension] ) + self.dump_to_path(context, obj, path) + + def load_input(self, context: InputContext) -> Any: + return super().load_input(context) diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py index 001f465..e32b200 100644 --- a/python/popgetter/io_managers/local.py +++ b/python/popgetter/io_managers/local.py @@ -7,7 +7,7 @@ from dagster import OutputContext, MetadataValue from upath import UPath -from . import TopLevelGeometryIOManager, TopLevelMetadataIOManager +from . 
import PopgetterIOManager, GeometryMixin, TopLevelMetadataMixin from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -18,30 +18,32 @@ ) -class DagsterHomeMixin: +class LocalMixin: dagster_home: str | None = os.getenv("DAGSTER_HOME") - def get_base_path_local(self) -> UPath: + def get_base_path(self) -> UPath: if not self.dagster_home: raise ValueError("The DAGSTER_HOME environment variable must be set.") return UPath(self.dagster_home) / "cloud_outputs" -class LocalTopLevelMetadataIOManager(TopLevelMetadataIOManager, DagsterHomeMixin): +class LocalTopLevelMetadataIOManager( + LocalMixin, TopLevelMetadataMixin, PopgetterIOManager +): def handle_output( self, context: OutputContext, obj: CountryMetadata | DataPublisher | SourceDataRelease, ) -> None: rel_path = self.get_relative_path(context, obj) - full_path = self.get_base_path_local() / rel_path + full_path = self.get_base_path() / rel_path full_path.parent.mkdir(parents=True, exist_ok=True) context.add_output_metadata(metadata={"parquet_path": str(full_path)}) with full_path.open("wb") as file: file.write(self.to_binary(obj)) -class LocalGeometryIOManager(TopLevelGeometryIOManager, DagsterHomeMixin): +class LocalGeometryIOManager(LocalMixin, GeometryMixin, PopgetterIOManager): def handle_output( self, context: OutputContext, @@ -53,7 +55,7 @@ def handle_output( "geojsonseq_paths": [], "names_paths": [], } - base_path = self.get_base_path_local() + base_path = self.get_base_path() for geo_metadata, gdf, names_df in obj: rel_paths = self.get_relative_paths(context, geo_metadata) From e97730ca0b4af0b5e826e768f61e017eb5dad724 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Wed, 15 May 2024 21:38:30 +0100 Subject: [PATCH 23/35] Refactor classes so that everything is in the right place --- python/popgetter/io_managers/__init__.py | 128 +++++++++++++++++++---- python/popgetter/io_managers/azure.py | 98 +++++++---------- python/popgetter/io_managers/local.py | 82 +++++---------- 3 files changed, 170 insertions(+), 138 deletions(-) diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 3f67c4a..319d79b 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -4,7 +4,7 @@ import geopandas as gpd import pandas as pd -from dagster import InputContext, IOManager, OutputContext +from dagster import InputContext, IOManager, OutputContext, MetadataValue from icecream import ic from upath import UPath from popgetter.metadata import ( @@ -15,15 +15,19 @@ MetricMetadata, metadata_to_dataframe, ) +from dataclasses import dataclass class PopgetterIOManager(IOManager): + def get_base_path(self) -> UPath: + raise NotImplementedError + def load_input(self, _context: InputContext) -> pd.DataFrame: err_msg = "This IOManager is only for writing outputs" raise RuntimeError(err_msg) -class TopLevelMetadataMixin(): +class TopLevelMetadataIOManager(PopgetterIOManager): def get_output_filename( self, obj: CountryMetadata | DataPublisher | SourceDataRelease ) -> str: @@ -37,49 +41,127 @@ def get_output_filename( err_msg = "This IO manager only accepts CountryMetadata, DataPublisher, and SourceDataRelease" raise ValueError(err_msg) - def get_relative_path( + def get_full_path( self, context: OutputContext, obj: CountryMetadata | DataPublisher | SourceDataRelease, ) -> UPath: - ic(context.partition_key) path_prefixes = list(context.partition_key.split("/"))[:-1] filename = self.get_output_filename(obj) - return UPath("/".join([*path_prefixes, filename])) + 
return self.get_base_path() / UPath("/".join([*path_prefixes, filename])) - def to_binary( - self, obj: CountryMetadata | DataPublisher | SourceDataRelease - ) -> bytes: - df = metadata_to_dataframe([obj]) - return df.to_parquet(None) + def handle_df(self, context: OutputContext, df: pd.DataFrame, full_path: UPath) -> None: + raise NotImplementedError + + def handle_output( + self, + context: OutputContext, + obj: CountryMetadata | DataPublisher | SourceDataRelease, + ): + full_path = self.get_full_path(context, obj) + context.add_output_metadata(metadata={"parquet_path": str(full_path)}) + self.handle_df(context, metadata_to_dataframe([obj]), full_path) + + +class GeoIOManager(PopgetterIOManager): + def handle_flatgeobuf( + self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath + ) -> None: + raise NotImplementedError + + def handle_geojsonseq( + self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath + ) -> None: + raise NotImplementedError + + def handle_pmtiles( + self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath + ) -> None: + raise RuntimeError("Pmtiles not currently implemented") + + def handle_names( + self, context: OutputContext, names_df: pd.DataFrame, full_path: UPath + ) -> None: + raise NotImplementedError + + def handle_geo_metadata( + self, context: OutputContext, geo_metadata_df: pd.DataFrame, full_path: UPath + ) -> None: + raise NotImplementedError + @dataclass + class GeometryOutputPaths: + flatgeobuf: str + pmtiles: str + geojsonseq: str + names: str -class GeometryMixin(): - def get_relative_paths( + def get_full_paths_geoms( self, context: OutputContext, geo_metadata: GeometryMetadata, - ) -> dict[str, UPath]: + ) -> GeometryOutputPaths: filename_stem = geo_metadata.filename_stem asset_prefix = list(context.partition_key.split("/"))[:-1] # e.g. 
['be'] - return { - "flatgeobuf": UPath( - "/".join([*asset_prefix, "geometries", f"{filename_stem}.fgb"]) - ), - "pmtiles": UPath( + base_path = self.get_base_path() + return self.GeometryOutputPaths( + flatgeobuf=base_path + / UPath("/".join([*asset_prefix, "geometries", f"{filename_stem}.fgb"])), + pmtiles=base_path + / UPath( "/".join([*asset_prefix, "geometries", f"TODO_{filename_stem}.pmtiles"]) ), - "geojsonseq": UPath( + geojsonseq=base_path + / UPath( "/".join([*asset_prefix, "geometries", f"{filename_stem}.geojsonseq"]) ), - "names": UPath( + names=base_path + / UPath( "/".join([*asset_prefix, "geometries", f"{filename_stem}.parquet"]) ), - } + ) - def get_relative_path_for_metadata( + def get_full_path_metadata( self, context: OutputContext, ) -> UPath: + base_path = self.get_base_path() asset_prefix = list(context.partition_key.split("/"))[:-1] - return UPath("/".join([*asset_prefix, "geometry_metadata.parquet"])) + return base_path / UPath("/".join([*asset_prefix, "geometry_metadata.parquet"])) + + def handle_output( + self, + context: OutputContext, + obj: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]], + ) -> None: + output_metadata = { + "flatgeobuf_paths": [], + "pmtiles_paths": [], + "geojsonseq_paths": [], + "names_paths": [], + } + + for geo_metadata, gdf, names_df in obj: + full_paths = self.get_full_paths_geoms(context, geo_metadata) + + self.handle_flatgeobuf(context, gdf, full_paths.flatgeobuf) + self.handle_geojsonseq(context, gdf, full_paths.geojsonseq) + # TODO self.handle_pmtiles(context, gdf, full_paths.pmtiles) + self.handle_names(context, names_df, full_paths.names) + + output_metadata["flatgeobuf_paths"].append(str(full_paths.flatgeobuf)) + output_metadata["pmtiles_paths"].append(str(full_paths.pmtiles)) + output_metadata["geojsonseq_paths"].append(str(full_paths.geojsonseq)) + output_metadata["names_paths"].append(str(full_paths.names)) + + metadata_df_filepath = self.get_full_path_metadata(context) + metadata_df = metadata_to_dataframe([md for md, _, _ in obj]) + self.handle_geo_metadata(context, metadata_df, metadata_df_filepath) + + context.add_output_metadata( + metadata={ + "metadata_path": str(metadata_df_filepath), + "metadata_preview": MetadataValue.md(metadata_df.head().to_markdown()), + **output_metadata, + } + ) diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py index ae3427d..5f7bc62 100644 --- a/python/popgetter/io_managers/azure.py +++ b/python/popgetter/io_managers/azure.py @@ -1,9 +1,9 @@ from __future__ import annotations import os -import tempfile from collections.abc import Iterator from contextlib import contextmanager +import tempfile from pathlib import Path import geopandas as gpd @@ -27,7 +27,7 @@ from icecream import ic from upath import UPath -from . import PopgetterIOManager, TopLevelMetadataMixin, GeometryMixin +from . 
import PopgetterIOManager, TopLevelMetadataIOManager, GeoIOManager from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -142,81 +142,55 @@ def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]: lease_client.release() -class AzureTopLevelMetadataIOManager( - AzureMixin, TopLevelMetadataMixin, PopgetterIOManager -): - def handle_output( - self, - context: OutputContext, - obj: CountryMetadata | DataPublisher | SourceDataRelease, - ) -> None: - rel_path = self.get_relative_path(context, obj) - full_path = self.get_base_path() / rel_path - context.add_output_metadata(metadata={"parquet_path": str(full_path)}) - df = metadata_to_dataframe([obj]) - self.dump_to_path(context, df.to_parquet(None), full_path) +class AzureTopLevelMetadataIOManager(AzureMixin, TopLevelMetadataIOManager): + def handle_df(self, context: OutputContext, df: pd.DataFrame, path: UPath) -> None: + self.dump_to_path(context, df.to_parquet(None), path) -class AzureGeoIOManager(AzureMixin, GeometryMixin, PopgetterIOManager): - @staticmethod - def geo_df_to_bytes(context, geo_df: gpd.GeoDataFrame, output_type: str) -> bytes: +class AzureGeoIOManager(AzureMixin, GeoIOManager): + def handle_flatgeobuf( + self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath + ) -> None: tmp = tempfile.NamedTemporaryFile() - if output_type.lower() == "parquet": - fname = tmp.name + ".parquet" - geo_df.to_parquet(fname) - elif output_type.lower() == "flatgeobuf": - fname = tmp.name + ".fgb" - geo_df.to_file(fname, driver="FlatGeobuf") - elif output_type.lower() == "geojsonseq": - fname = tmp.name + ".geojsonseq" - geo_df.to_file(fname, driver="GeoJSONSeq") - elif output_type.lower() == "pmtiles": - err_msg = "pmtiles not currently implemented" - raise ValueError(err_msg) - else: - value_error: str = f"'{output_type}' is not currently supported." 
- raise ValueError(value_error) + fname = tmp.name + ".fgb" + geo_df.to_file(fname, driver="FlatGeobuf") with Path(fname).open(mode="rb") as f: b: bytes = f.read() context.log.debug(ic(f"Size: {len(b) / (1_024 * 1_024):.3f}MB")) - return b + self.dump_to_path(context, b, full_path) - def handle_output( - self, - context: OutputContext, - obj: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]], + def handle_geojsonseq( + self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath ) -> None: - base_path = self.get_base_path() - - for geo_metadata, gdf, names_df in obj: - rel_paths = self.get_relative_paths(context, geo_metadata) - full_paths = { - key: base_path / rel_path for key, rel_path in rel_paths.items() - } - - self.dump_to_path( - context, - self.geo_df_to_bytes(context, gdf, "flatgeobuf"), - full_paths["flatgeobuf"], - ) - self.dump_to_path( - context, - self.geo_df_to_bytes(context, gdf, "geojsonseq"), - full_paths["geojsonseq"], - ) - # TODO: generate pmtiles - self.dump_to_path(context, names_df.to_parquet(None), full_paths["names"]) + tmp = tempfile.NamedTemporaryFile() + fname = tmp.name + ".geojsonseq" + geo_df.to_file(fname, driver="GeoJSONSeq") + with Path(fname).open(mode="rb") as f: + b: bytes = f.read() + context.log.debug(ic(f"Size: {len(b) / (1_024 * 1_024):.3f}MB")) + self.dump_to_path(context, b, full_path) + + def handle_pmtiles( + self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath + ) -> None: + raise RuntimeError("Pmtiles not currently implemented") - # Handle metadata separately since they all get put into one dataframe - metadata_df_filepath = base_path / self.get_relative_path_for_metadata(context) - metadata_df = metadata_to_dataframe([md for md, _, _ in obj]) - self.dump_to_path(context, metadata_df.to_parquet(None), metadata_df_filepath) + def handle_names( + self, context: OutputContext, names_df: pd.DataFrame, full_path: UPath + ) -> None: + self.dump_to_path(context, names_df.to_parquet(None), full_path) + + def handle_geo_metadata( + self, context: OutputContext, geo_metadata_df: pd.DataFrame, full_path: UPath + ) -> None: + self.dump_to_path(context, geo_metadata_df.to_parquet(None), full_path) class AzureGeneralIOManager(AzureMixin, IOManager): """This class is used only for an asset which tests the Azure functionality (see cloud_outputs/azure_test.py). It is not used for publishing any popgetter data.""" + extension: str def __init__(self, extension: str | None = None): diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py index e32b200..7c07ed4 100644 --- a/python/popgetter/io_managers/local.py +++ b/python/popgetter/io_managers/local.py @@ -7,7 +7,7 @@ from dagster import OutputContext, MetadataValue from upath import UPath -from . import PopgetterIOManager, GeometryMixin, TopLevelMetadataMixin +from . 
import PopgetterIOManager, GeoIOManager, TopLevelMetadataIOManager from popgetter.metadata import ( CountryMetadata, DataPublisher, @@ -26,65 +26,41 @@ def get_base_path(self) -> UPath: raise ValueError("The DAGSTER_HOME environment variable must be set.") return UPath(self.dagster_home) / "cloud_outputs" + def make_parent_dirs(self, full_path: UPath) -> None: + full_path.parent.mkdir(parents=True, exist_ok=True) + class LocalTopLevelMetadataIOManager( - LocalMixin, TopLevelMetadataMixin, PopgetterIOManager + LocalMixin, TopLevelMetadataIOManager, PopgetterIOManager ): - def handle_output( - self, - context: OutputContext, - obj: CountryMetadata | DataPublisher | SourceDataRelease, + def handle_df( + self, context: OutputContext, df: pd.DataFrame, full_path: UPath ) -> None: - rel_path = self.get_relative_path(context, obj) - full_path = self.get_base_path() / rel_path - full_path.parent.mkdir(parents=True, exist_ok=True) - context.add_output_metadata(metadata={"parquet_path": str(full_path)}) - with full_path.open("wb") as file: - file.write(self.to_binary(obj)) + self.make_parent_dirs(full_path) + df.to_parquet(full_path) -class LocalGeometryIOManager(LocalMixin, GeometryMixin, PopgetterIOManager): - def handle_output( - self, - context: OutputContext, - obj: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]], +class LocalGeometryIOManager(LocalMixin, GeoIOManager, PopgetterIOManager): + def handle_flatgeobuf( + self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath ) -> None: - output_metadata = { - "flatgeobuf_paths": [], - "pmtiles_paths": [], - "geojsonseq_paths": [], - "names_paths": [], - } - base_path = self.get_base_path() - - for geo_metadata, gdf, names_df in obj: - rel_paths = self.get_relative_paths(context, geo_metadata) - full_paths = { - key: base_path / rel_path for key, rel_path in rel_paths.items() - } - for path in full_paths.values(): - path.parent.mkdir(parents=True, exist_ok=True) + self.make_parent_dirs(full_path) + geo_df.to_file(full_path, driver="FlatGeobuf") - output_metadata["flatgeobuf_paths"].append(str(full_paths["flatgeobuf"])) - output_metadata["pmtiles_paths"].append(str(full_paths["pmtiles"])) - output_metadata["geojsonseq_paths"].append(str(full_paths["geojsonseq"])) - output_metadata["names_paths"].append(str(full_paths["names"])) - - gdf.to_file(full_paths["flatgeobuf"], driver="FlatGeobuf") - gdf.to_file(full_paths["geojsonseq"], driver="GeoJSONSeq") - # TODO: generate pmtiles - names_df.to_parquet(full_paths["names"]) + def handle_geojsonseq( + self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath + ) -> None: + self.make_parent_dirs(full_path) + geo_df.to_file(full_path, driver="GeoJSONSeq") - # Metadata has to be serialised separately since they all get put in - # the same path. 
- metadata_df_filepath = base_path / self.get_relative_path_for_metadata(context) - metadata_df = metadata_to_dataframe([md for md, _, _ in obj]) - metadata_df.to_parquet(metadata_df_filepath) + def handle_names( + self, context: OutputContext, names_df: pd.DataFrame, full_path: UPath + ) -> None: + self.make_parent_dirs(full_path) + names_df.to_parquet(full_path) - context.add_output_metadata( - metadata={ - "metadata_path": MetadataValue.md(str(metadata_df_filepath)), - "metadata_preview": MetadataValue.md(metadata_df.head().to_markdown()), - **output_metadata, - } - ) + def handle_geo_metadata( + self, context: OutputContext, geo_metadata_df: pd.DataFrame, full_path: UPath + ) -> None: + self.make_parent_dirs(full_path) + geo_metadata_df.to_parquet(full_path) From 9c93b2235e8f14d3a614b9d7b35737f5f9f8d3ee Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Wed, 15 May 2024 22:00:16 +0100 Subject: [PATCH 24/35] =?UTF-8?q?Fix=20all=20the=20errors=20=F0=9F=A4=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- output_spec.md | 8 ++-- python/popgetter/__init__.py | 1 - python/popgetter/assets/be/__init__.py | 9 ----- python/popgetter/assets/be/census_derived.py | 21 +++++++++++ python/popgetter/assets/be/census_geometry.py | 19 +++++----- python/popgetter/assets/be/census_tables.py | 5 --- python/popgetter/cloud_outputs/__init__.py | 6 ++- python/popgetter/cloud_outputs/azure_test.py | 2 + python/popgetter/cloud_outputs/geometry.py | 16 ++++---- python/popgetter/cloud_outputs/metadata.py | 16 ++++---- python/popgetter/cloud_outputs_old.py | 1 - python/popgetter/io_managers/__init__.py | 37 ++++++++++--------- python/popgetter/io_managers/azure.py | 17 ++------- python/popgetter/io_managers/local.py | 25 +++++-------- python/popgetter/metadata.py | 3 +- 15 files changed, 92 insertions(+), 94 deletions(-) diff --git a/output_spec.md b/output_spec.md index e6b19e5..96caf76 100644 --- a/output_spec.md +++ b/output_spec.md @@ -65,13 +65,13 @@ Polars does not have the concept of an index column.) These dataframes are then serialised as parquet files, and can be given any filename, as the `MetricMetadata` struct should contain the filepath to them. -Likewise, geometries should be placed in the `geometries` subdirectory. Each -set of geometries should consist of four files, with the same filename stem and +Likewise, geometries should be placed in the `geometries` subdirectory. Each set +of geometries should consist of four files, with the same filename stem and different extensions. The filename stem is specified inside the `GeometryMetadata` struct. 
-- `{filename}.flatgeobuf` - a FlatGeobuf file with the geoIDs stored in the `GEO_ID` - column +- `{filename}.flatgeobuf` - a FlatGeobuf file with the geoIDs stored in the + `GEO_ID` column - `{filename}.geojsonseq` - corresponding GeoJSONSeq file - `{filename}.pmtiles` - corresponding PMTiles file - `{filename}.parquet` - a serialised dataframe storing the names of the diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 8dd0344..95aa1f9 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -28,7 +28,6 @@ PipesSubprocessClient, SourceAsset, define_asset_job, - load_assets_from_modules, load_assets_from_package_module, ) from dagster._core.definitions.cacheable_assets import ( diff --git a/python/popgetter/assets/be/__init__.py b/python/popgetter/assets/be/__init__.py index 9a961ea..46f9119 100755 --- a/python/popgetter/assets/be/__init__.py +++ b/python/popgetter/assets/be/__init__.py @@ -1,20 +1,11 @@ #!/usr/bin/python3 from __future__ import annotations -from dagster import ( - asset, -) - -from popgetter.metadata import ( - CountryMetadata, -) - from . import ( census_derived, # noqa: F401 census_geometry, # noqa: F401 census_tables, # noqa: F401 ) -from .belgium import asset_prefix, country # @asset(key_prefix=asset_prefix) diff --git a/python/popgetter/assets/be/census_derived.py b/python/popgetter/assets/be/census_derived.py index 9264da1..baad50a 100644 --- a/python/popgetter/assets/be/census_derived.py +++ b/python/popgetter/assets/be/census_derived.py @@ -176,6 +176,27 @@ def filter_needed_catalog( return needed_df +def census_table_metadata(catalog_row: dict) -> MetricMetadata: + return MetricMetadata( + human_readable_name=catalog_row["human_readable_name"], + source_download_url=catalog_row["source_download_url"], + source_archive_file_path=catalog_row["source_archive_file_path"], + source_documentation_url=catalog_row["source_documentation_url"], + source_data_release_id="TODO", + # TODO - this is a placeholder + parent_metric_id="unknown_at_this_stage", + potential_denominator_ids=None, + parquet_margin_of_error_file=None, + parquet_margin_of_error_column=None, + parquet_column_name=catalog_row["source_column"], + # TODO - this is a placeholder + metric_parquet_file_url="unknown_at_this_stage", + hxl_tag=catalog_row["hxltag"], + description=catalog_row["description"], + source_metric_id=catalog_row["hxltag"], + ) + + @multi_asset( ins={ "individual_census_table": AssetIn( diff --git a/python/popgetter/assets/be/census_geometry.py b/python/popgetter/assets/be/census_geometry.py index 1e0b384..28bc4ec 100644 --- a/python/popgetter/assets/be/census_geometry.py +++ b/python/popgetter/assets/be/census_geometry.py @@ -1,26 +1,26 @@ from __future__ import annotations +from dataclasses import dataclass +from datetime import date + import geopandas as gpd import matplotlib.pyplot as plt import pandas as pd from dagster import ( + AssetIn, MetadataValue, SpecificPartitionsPartitionMapping, - AssetIn, asset, ) from icecream import ic -from popgetter.utils import markdown_from_plot from popgetter.metadata import ( GeometryMetadata, - metadata_to_dataframe, SourceDataRelease, ) -from datetime import date +from popgetter.utils import markdown_from_plot from .belgium import asset_prefix -from dataclasses import dataclass from .census_tables import publisher @@ -172,14 +172,15 @@ def source_data_release( Returns the SourceDataRelease corresponding to the municipality level. 
""" # Pick out the municipality geometry and get its ID - for geo_metadata, gdf, names_df in geometry: + for geo_metadata, _, _ in geometry: if geo_metadata.level == "municipality": geo_metadata_id = geo_metadata.id break else: - raise ValueError("No municipality geometry found") + err = "No municipality geometry found" + raise ValueError(err) - source = SourceDataRelease( + return SourceDataRelease( name="StatBel Open Data", date_published=date(2015, 10, 22), reference_period_start=date(2015, 10, 22), @@ -192,5 +193,3 @@ def source_data_release( data_publisher_id=publisher.id, geometry_metadata_id=geo_metadata_id, ) - - return source diff --git a/python/popgetter/assets/be/census_tables.py b/python/popgetter/assets/be/census_tables.py index e74be22..d39035e 100644 --- a/python/popgetter/assets/be/census_tables.py +++ b/python/popgetter/assets/be/census_tables.py @@ -3,7 +3,6 @@ import csv import sqlite3 import zipfile -from datetime import date from pathlib import Path, PurePath from tempfile import TemporaryDirectory from urllib.parse import urlparse @@ -24,15 +23,11 @@ from popgetter.metadata import ( CountryMetadata, DataPublisher, - SourceDataRelease, - GeometryMetadata, - metadata_to_dataframe, ) from popgetter.utils import extract_main_file_from_zip, markdown_from_plot from .belgium import asset_prefix, country - publisher: DataPublisher = DataPublisher( name="Statbel", url="https://statbel.fgov.be/en", diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index 44f9fb0..4f15e72 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -1 +1,5 @@ -from . import metadata, geometry, azure_test +from __future__ import annotations + +from . import geometry, metadata + +__all__ = ["geometry", "metadata"] diff --git a/python/popgetter/cloud_outputs/azure_test.py b/python/popgetter/cloud_outputs/azure_test.py index 91a302c..41380d0 100644 --- a/python/popgetter/cloud_outputs/azure_test.py +++ b/python/popgetter/cloud_outputs/azure_test.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pandas as pd from dagster import asset diff --git a/python/popgetter/cloud_outputs/geometry.py b/python/popgetter/cloud_outputs/geometry.py index 2dedd5a..88eda65 100644 --- a/python/popgetter/cloud_outputs/geometry.py +++ b/python/popgetter/cloud_outputs/geometry.py @@ -1,18 +1,20 @@ +from __future__ import annotations + +from functools import reduce + from dagster import ( - multi_asset_sensor, - asset, - AssetSelection, AssetExecutionContext, - AssetKey, - StaticPartitionsDefinition, + AssetSelection, DefaultSensorStatus, - RunRequest, Output, + RunRequest, + StaticPartitionsDefinition, + asset, define_asset_job, load_assets_from_current_module, + multi_asset_sensor, ) from icecream import ic -from functools import reduce cloud_assets_geometry = load_assets_from_current_module( group_name="cloud_assets_geometry" diff --git a/python/popgetter/cloud_outputs/metadata.py b/python/popgetter/cloud_outputs/metadata.py index 76e0ea0..8635057 100644 --- a/python/popgetter/cloud_outputs/metadata.py +++ b/python/popgetter/cloud_outputs/metadata.py @@ -1,18 +1,20 @@ +from __future__ import annotations + +from functools import reduce + from dagster import ( - multi_asset_sensor, - asset, - AssetSelection, AssetExecutionContext, - AssetKey, - StaticPartitionsDefinition, + AssetSelection, DefaultSensorStatus, - RunRequest, Output, + RunRequest, + StaticPartitionsDefinition, + asset, define_asset_job, 
load_assets_from_current_module, + multi_asset_sensor, ) from icecream import ic -from functools import reduce cloud_assets_metadata = load_assets_from_current_module( group_name="cloud_assets_metadata" diff --git a/python/popgetter/cloud_outputs_old.py b/python/popgetter/cloud_outputs_old.py index 2f4dc6c..1d885e7 100644 --- a/python/popgetter/cloud_outputs_old.py +++ b/python/popgetter/cloud_outputs_old.py @@ -5,7 +5,6 @@ from pathlib import Path import geopandas as gpd -import pandas as pd from dagster import ( AssetKey, AssetSelection, diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 319d79b..bd7aedd 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -1,21 +1,19 @@ from __future__ import annotations -from typing import ClassVar +from dataclasses import dataclass import geopandas as gpd import pandas as pd -from dagster import InputContext, IOManager, OutputContext, MetadataValue -from icecream import ic +from dagster import InputContext, IOManager, MetadataValue, OutputContext from upath import UPath + from popgetter.metadata import ( CountryMetadata, DataPublisher, - SourceDataRelease, GeometryMetadata, - MetricMetadata, + SourceDataRelease, metadata_to_dataframe, ) -from dataclasses import dataclass class PopgetterIOManager(IOManager): @@ -33,13 +31,13 @@ def get_output_filename( ) -> str: if isinstance(obj, CountryMetadata): return "country_metadata.parquet" - elif isinstance(obj, DataPublisher): + if isinstance(obj, DataPublisher): return "data_publishers.parquet" - elif isinstance(obj, SourceDataRelease): + if isinstance(obj, SourceDataRelease): return "source_data_releases.parquet" - else: - err_msg = "This IO manager only accepts CountryMetadata, DataPublisher, and SourceDataRelease" - raise ValueError(err_msg) + + err_msg = "This IO manager only accepts CountryMetadata, DataPublisher, and SourceDataRelease" + raise ValueError(err_msg) def get_full_path( self, @@ -50,7 +48,9 @@ def get_full_path( filename = self.get_output_filename(obj) return self.get_base_path() / UPath("/".join([*path_prefixes, filename])) - def handle_df(self, context: OutputContext, df: pd.DataFrame, full_path: UPath) -> None: + def handle_df( + self, context: OutputContext, df: pd.DataFrame, full_path: UPath + ) -> None: raise NotImplementedError def handle_output( @@ -75,9 +75,10 @@ def handle_geojsonseq( raise NotImplementedError def handle_pmtiles( - self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath + self, _context: OutputContext, _geo_df: gpd.GeoDataFrame, _full_path: UPath ) -> None: - raise RuntimeError("Pmtiles not currently implemented") + err_msg = "Pmtiles not currently implemented. You shouldn't be calling this." 
+ raise RuntimeError(err_msg) def handle_names( self, context: OutputContext, names_df: pd.DataFrame, full_path: UPath @@ -91,10 +92,10 @@ def handle_geo_metadata( @dataclass class GeometryOutputPaths: - flatgeobuf: str - pmtiles: str - geojsonseq: str - names: str + flatgeobuf: UPath + pmtiles: UPath + geojsonseq: UPath + names: UPath def get_full_paths_geoms( self, diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py index 5f7bc62..8396a41 100644 --- a/python/popgetter/io_managers/azure.py +++ b/python/popgetter/io_managers/azure.py @@ -1,9 +1,9 @@ from __future__ import annotations import os +import tempfile from collections.abc import Iterator from contextlib import contextmanager -import tempfile from pathlib import Path import geopandas as gpd @@ -18,8 +18,8 @@ ) from dagster import ( Any, - IOManager, InputContext, + IOManager, OutputContext, ) from dagster_azure.adls2.utils import ResourceNotFoundError, create_adls2_client @@ -27,13 +27,7 @@ from icecream import ic from upath import UPath -from . import PopgetterIOManager, TopLevelMetadataIOManager, GeoIOManager -from popgetter.metadata import ( - CountryMetadata, - DataPublisher, - SourceDataRelease, - metadata_to_dataframe, -) +from . import GeoIOManager, TopLevelMetadataIOManager # Set no time limit on lease duration to enable large files to be uploaded _LEASE_DURATION = -1 @@ -170,11 +164,6 @@ def handle_geojsonseq( context.log.debug(ic(f"Size: {len(b) / (1_024 * 1_024):.3f}MB")) self.dump_to_path(context, b, full_path) - def handle_pmtiles( - self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath - ) -> None: - raise RuntimeError("Pmtiles not currently implemented") - def handle_names( self, context: OutputContext, names_df: pd.DataFrame, full_path: UPath ) -> None: diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py index 7c07ed4..9d972e2 100644 --- a/python/popgetter/io_managers/local.py +++ b/python/popgetter/io_managers/local.py @@ -4,18 +4,10 @@ import geopandas as gpd import pandas as pd -from dagster import OutputContext, MetadataValue +from dagster import OutputContext from upath import UPath -from . import PopgetterIOManager, GeoIOManager, TopLevelMetadataIOManager -from popgetter.metadata import ( - CountryMetadata, - DataPublisher, - SourceDataRelease, - GeometryMetadata, - MetricMetadata, - metadata_to_dataframe, -) +from . import GeoIOManager, PopgetterIOManager, TopLevelMetadataIOManager class LocalMixin: @@ -23,7 +15,8 @@ class LocalMixin: def get_base_path(self) -> UPath: if not self.dagster_home: - raise ValueError("The DAGSTER_HOME environment variable must be set.") + err = "The DAGSTER_HOME environment variable must be set." 
+ raise ValueError(err) return UPath(self.dagster_home) / "cloud_outputs" def make_parent_dirs(self, full_path: UPath) -> None: @@ -34,7 +27,7 @@ class LocalTopLevelMetadataIOManager( LocalMixin, TopLevelMetadataIOManager, PopgetterIOManager ): def handle_df( - self, context: OutputContext, df: pd.DataFrame, full_path: UPath + self, _context: OutputContext, df: pd.DataFrame, full_path: UPath ) -> None: self.make_parent_dirs(full_path) df.to_parquet(full_path) @@ -42,25 +35,25 @@ def handle_df( class LocalGeometryIOManager(LocalMixin, GeoIOManager, PopgetterIOManager): def handle_flatgeobuf( - self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath + self, _context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath ) -> None: self.make_parent_dirs(full_path) geo_df.to_file(full_path, driver="FlatGeobuf") def handle_geojsonseq( - self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath + self, _context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath ) -> None: self.make_parent_dirs(full_path) geo_df.to_file(full_path, driver="GeoJSONSeq") def handle_names( - self, context: OutputContext, names_df: pd.DataFrame, full_path: UPath + self, _context: OutputContext, names_df: pd.DataFrame, full_path: UPath ) -> None: self.make_parent_dirs(full_path) names_df.to_parquet(full_path) def handle_geo_metadata( - self, context: OutputContext, geo_metadata_df: pd.DataFrame, full_path: UPath + self, _context: OutputContext, geo_metadata_df: pd.DataFrame, full_path: UPath ) -> None: self.make_parent_dirs(full_path) geo_metadata_df.to_parquet(full_path) diff --git a/python/popgetter/metadata.py b/python/popgetter/metadata.py index 7b1d288..ce178d0 100644 --- a/python/popgetter/metadata.py +++ b/python/popgetter/metadata.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections.abc import Sequence from datetime import date from hashlib import sha256 from typing import Self @@ -29,7 +30,7 @@ def hash_class_vars(class_instance): def metadata_to_dataframe( - metadata_instances: list[BaseModel], + metadata_instances: Sequence[BaseModel], ): """ Convert a list of metadata instances to a pandas DataFrame. 
Any of the four From bd1ad96f3cf911278c884ce1742ccc39019314e1 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Wed, 15 May 2024 22:11:03 +0100 Subject: [PATCH 25/35] Fix even more errors --- python/popgetter/__init__.py | 4 ++-- python/popgetter/cloud_outputs/geometry.py | 3 +-- python/popgetter/cloud_outputs/metadata.py | 3 +-- python/popgetter/io_managers/__init__.py | 24 +++++++--------------- python/popgetter/io_managers/azure.py | 6 ++++-- python/popgetter/io_managers/local.py | 24 ++++++---------------- 6 files changed, 21 insertions(+), 43 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 95aa1f9..f367725 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -9,7 +9,7 @@ AzureTopLevelMetadataIOManager, ) from popgetter.io_managers.local import ( - LocalGeometryIOManager, + LocalGeoIOManager, LocalTopLevelMetadataIOManager, ) from popgetter.utils import StagingDirResource @@ -72,7 +72,7 @@ }, "dev": { "publishing_io_manager": LocalTopLevelMetadataIOManager(), - "geometry_io_manager": LocalGeometryIOManager(), + "geometry_io_manager": LocalGeoIOManager(), }, } diff --git a/python/popgetter/cloud_outputs/geometry.py b/python/popgetter/cloud_outputs/geometry.py index 88eda65..d18ed6d 100644 --- a/python/popgetter/cloud_outputs/geometry.py +++ b/python/popgetter/cloud_outputs/geometry.py @@ -3,7 +3,6 @@ from functools import reduce from dagster import ( - AssetExecutionContext, AssetSelection, DefaultSensorStatus, Output, @@ -35,7 +34,7 @@ partitions_def=geometry_partition, io_manager_key="geometry_io_manager", ) -def publish_geometry(context: AssetExecutionContext): +def publish_geometry(context): # Get the output of the asset from popgetter import defs as popgetter_defs diff --git a/python/popgetter/cloud_outputs/metadata.py b/python/popgetter/cloud_outputs/metadata.py index 8635057..d3f60dd 100644 --- a/python/popgetter/cloud_outputs/metadata.py +++ b/python/popgetter/cloud_outputs/metadata.py @@ -3,7 +3,6 @@ from functools import reduce from dagster import ( - AssetExecutionContext, AssetSelection, DefaultSensorStatus, Output, @@ -37,7 +36,7 @@ partitions_def=toplevel_metadata_partition, io_manager_key="publishing_io_manager", ) -def publish_toplevel_metadata(context: AssetExecutionContext): +def publish_toplevel_metadata(context): # Get the output of the asset from popgetter import defs as popgetter_defs diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index bd7aedd..5d1ed52 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -20,6 +20,11 @@ class PopgetterIOManager(IOManager): def get_base_path(self) -> UPath: raise NotImplementedError + def handle_df( + self, context: OutputContext, df: pd.DataFrame, full_path: UPath + ) -> None: + raise NotImplementedError + def load_input(self, _context: InputContext) -> pd.DataFrame: err_msg = "This IOManager is only for writing outputs" raise RuntimeError(err_msg) @@ -48,11 +53,6 @@ def get_full_path( filename = self.get_output_filename(obj) return self.get_base_path() / UPath("/".join([*path_prefixes, filename])) - def handle_df( - self, context: OutputContext, df: pd.DataFrame, full_path: UPath - ) -> None: - raise NotImplementedError - def handle_output( self, context: OutputContext, @@ -80,16 +80,6 @@ def handle_pmtiles( err_msg = "Pmtiles not currently implemented. You shouldn't be calling this." 
raise RuntimeError(err_msg) - def handle_names( - self, context: OutputContext, names_df: pd.DataFrame, full_path: UPath - ) -> None: - raise NotImplementedError - - def handle_geo_metadata( - self, context: OutputContext, geo_metadata_df: pd.DataFrame, full_path: UPath - ) -> None: - raise NotImplementedError - @dataclass class GeometryOutputPaths: flatgeobuf: UPath @@ -148,7 +138,7 @@ def handle_output( self.handle_flatgeobuf(context, gdf, full_paths.flatgeobuf) self.handle_geojsonseq(context, gdf, full_paths.geojsonseq) # TODO self.handle_pmtiles(context, gdf, full_paths.pmtiles) - self.handle_names(context, names_df, full_paths.names) + self.handle_df(context, names_df, full_paths.names) output_metadata["flatgeobuf_paths"].append(str(full_paths.flatgeobuf)) output_metadata["pmtiles_paths"].append(str(full_paths.pmtiles)) @@ -157,7 +147,7 @@ def handle_output( metadata_df_filepath = self.get_full_path_metadata(context) metadata_df = metadata_to_dataframe([md for md, _, _ in obj]) - self.handle_geo_metadata(context, metadata_df, metadata_df_filepath) + self.handle_df(context, metadata_df, metadata_df_filepath) context.add_output_metadata( metadata={ diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py index 8396a41..a8e4f2b 100644 --- a/python/popgetter/io_managers/azure.py +++ b/python/popgetter/io_managers/azure.py @@ -135,12 +135,14 @@ def _acquire_lease(self, client: Any, is_rm: bool = False) -> Iterator[str]: if not is_rm: lease_client.release() - -class AzureTopLevelMetadataIOManager(AzureMixin, TopLevelMetadataIOManager): def handle_df(self, context: OutputContext, df: pd.DataFrame, path: UPath) -> None: self.dump_to_path(context, df.to_parquet(None), path) +class AzureTopLevelMetadataIOManager(AzureMixin, TopLevelMetadataIOManager): + pass + + class AzureGeoIOManager(AzureMixin, GeoIOManager): def handle_flatgeobuf( self, context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py index 9d972e2..4a04ed0 100644 --- a/python/popgetter/io_managers/local.py +++ b/python/popgetter/io_managers/local.py @@ -7,7 +7,7 @@ from dagster import OutputContext from upath import UPath -from . import GeoIOManager, PopgetterIOManager, TopLevelMetadataIOManager +from . 
import GeoIOManager, TopLevelMetadataIOManager class LocalMixin: @@ -22,10 +22,6 @@ def get_base_path(self) -> UPath: def make_parent_dirs(self, full_path: UPath) -> None: full_path.parent.mkdir(parents=True, exist_ok=True) - -class LocalTopLevelMetadataIOManager( - LocalMixin, TopLevelMetadataIOManager, PopgetterIOManager -): def handle_df( self, _context: OutputContext, df: pd.DataFrame, full_path: UPath ) -> None: @@ -33,7 +29,11 @@ def handle_df( df.to_parquet(full_path) -class LocalGeometryIOManager(LocalMixin, GeoIOManager, PopgetterIOManager): +class LocalTopLevelMetadataIOManager(LocalMixin, TopLevelMetadataIOManager): + pass + + +class LocalGeoIOManager(LocalMixin, GeoIOManager): def handle_flatgeobuf( self, _context: OutputContext, geo_df: gpd.GeoDataFrame, full_path: UPath ) -> None: @@ -45,15 +45,3 @@ def handle_geojsonseq( ) -> None: self.make_parent_dirs(full_path) geo_df.to_file(full_path, driver="GeoJSONSeq") - - def handle_names( - self, _context: OutputContext, names_df: pd.DataFrame, full_path: UPath - ) -> None: - self.make_parent_dirs(full_path) - names_df.to_parquet(full_path) - - def handle_geo_metadata( - self, _context: OutputContext, geo_metadata_df: pd.DataFrame, full_path: UPath - ) -> None: - self.make_parent_dirs(full_path) - geo_metadata_df.to_parquet(full_path) From 1de2a882011d4852d27770bffe8ce277bdc7f15c Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Wed, 15 May 2024 23:49:43 +0100 Subject: [PATCH 26/35] Create a factory class that generates both sensors --- python/popgetter/__init__.py | 8 +- python/popgetter/cloud_outputs/__init__.py | 28 +++++- python/popgetter/cloud_outputs/geometry.py | 70 ------------- python/popgetter/cloud_outputs/metadata.py | 72 -------------- .../popgetter/cloud_outputs/sensor_class.py | 98 +++++++++++++++++++ 5 files changed, 128 insertions(+), 148 deletions(-) delete mode 100644 python/popgetter/cloud_outputs/geometry.py delete mode 100644 python/popgetter/cloud_outputs/metadata.py create mode 100644 python/popgetter/cloud_outputs/sensor_class.py diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index f367725..7796a0b 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -67,11 +67,11 @@ resources_by_env = { "prod": { - "publishing_io_manager": AzureTopLevelMetadataIOManager(), + "metadata_io_manager": AzureTopLevelMetadataIOManager(), "geometry_io_manager": AzureGeoIOManager(), }, "dev": { - "publishing_io_manager": LocalTopLevelMetadataIOManager(), + "metadata_io_manager": LocalTopLevelMetadataIOManager(), "geometry_io_manager": LocalGeoIOManager(), }, } @@ -90,8 +90,8 @@ assets=all_assets, schedules=[], sensors=[ - cloud_outputs.metadata.metadata_sensor, - cloud_outputs.geometry.geometry_sensor, + cloud_outputs.metadata_sensor, + cloud_outputs.geometry_sensor, ], resources=resources, jobs=[job_be, job_us, job_uk], diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index 4f15e72..d847963 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -1,5 +1,29 @@ from __future__ import annotations -from . 
import geometry, metadata +from .sensor_class import CloudAssetSensor -__all__ = ["geometry", "metadata"] +metadata_factory = CloudAssetSensor( + asset_names_to_monitor=[ + "be/country_metadata", + "be/data_publisher", + "be/source_data_release", + ], + io_manager_key="metadata_io_manager", + prefix="metadata", + interval=20, +) + +metadata_sensor = metadata_factory.create_sensor() +metadata_asset = metadata_factory.create_publishing_asset() + +geometry_factory = CloudAssetSensor( + asset_names_to_monitor=[ + "be/geometry", + ], + io_manager_key="geometry_io_manager", + prefix="geometry", + interval=60, +) + +geometry_sensor = geometry_factory.create_sensor() +geometry_asset = geometry_factory.create_publishing_asset() diff --git a/python/popgetter/cloud_outputs/geometry.py b/python/popgetter/cloud_outputs/geometry.py deleted file mode 100644 index d18ed6d..0000000 --- a/python/popgetter/cloud_outputs/geometry.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import annotations - -from functools import reduce - -from dagster import ( - AssetSelection, - DefaultSensorStatus, - Output, - RunRequest, - StaticPartitionsDefinition, - asset, - define_asset_job, - load_assets_from_current_module, - multi_asset_sensor, -) -from icecream import ic - -cloud_assets_geometry = load_assets_from_current_module( - group_name="cloud_assets_geometry" -) - -# TODO: is there a better way to do this than to manually list them here? -geometry_asset_names = [ - "be/geometry", -] -geometry_partition = StaticPartitionsDefinition(geometry_asset_names) -geometry_assets_to_monitor = reduce( - lambda x, y: x | y, - [AssetSelection.keys(k) for k in geometry_asset_names], -) - - -@asset( - partitions_def=geometry_partition, - io_manager_key="geometry_io_manager", -) -def publish_geometry(context): - # Get the output of the asset - from popgetter import defs as popgetter_defs - - ic(context.partition_key) - - # load_asset_value expects a list of strings - output = popgetter_defs.load_asset_value(context.partition_key.split("/")) - ic(output) - - return Output(output) - - -geometry_job = define_asset_job( - name="geometry_job", - selection="publish_geometry", - partitions_def=geometry_partition, -) - - -@multi_asset_sensor( - monitored_assets=geometry_assets_to_monitor, - job=geometry_job, - minimum_interval_seconds=60, - default_status=DefaultSensorStatus.RUNNING, -) -def geometry_sensor(context): - asset_events = context.latest_materialization_records_by_key() - for asset_key, execution_value in asset_events.items(): - yield RunRequest( - run_key=None, - partition_key="/".join(asset_key.path), - ) - context.advance_cursor({asset_key: execution_value}) diff --git a/python/popgetter/cloud_outputs/metadata.py b/python/popgetter/cloud_outputs/metadata.py deleted file mode 100644 index d3f60dd..0000000 --- a/python/popgetter/cloud_outputs/metadata.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import annotations - -from functools import reduce - -from dagster import ( - AssetSelection, - DefaultSensorStatus, - Output, - RunRequest, - StaticPartitionsDefinition, - asset, - define_asset_job, - load_assets_from_current_module, - multi_asset_sensor, -) -from icecream import ic - -cloud_assets_metadata = load_assets_from_current_module( - group_name="cloud_assets_metadata" -) - -# TODO: is there a better way to do this than to manually list them here? 
-toplevel_metadata_asset_names = [ - "be/country_metadata", - "be/data_publisher", - "be/source_data_release", -] -toplevel_metadata_partition = StaticPartitionsDefinition(toplevel_metadata_asset_names) -toplevel_metadata_assets_to_monitor = reduce( - lambda x, y: x | y, - [AssetSelection.keys(k) for k in toplevel_metadata_asset_names], -) - - -@asset( - partitions_def=toplevel_metadata_partition, - io_manager_key="publishing_io_manager", -) -def publish_toplevel_metadata(context): - # Get the output of the asset - from popgetter import defs as popgetter_defs - - ic(context.partition_key) - - # load_asset_value expects a list of strings - output = popgetter_defs.load_asset_value(context.partition_key.split("/")) - ic(output) - - return Output(output) - - -toplevel_metadata_job = define_asset_job( - name="toplevel_metadata_job", - selection="publish_toplevel_metadata", - partitions_def=toplevel_metadata_partition, -) - - -@multi_asset_sensor( - monitored_assets=toplevel_metadata_assets_to_monitor, - job=toplevel_metadata_job, - minimum_interval_seconds=10, - default_status=DefaultSensorStatus.RUNNING, -) -def metadata_sensor(context): - asset_events = context.latest_materialization_records_by_key() - for asset_key, execution_value in asset_events.items(): - yield RunRequest( - run_key=None, - partition_key="/".join(asset_key.path), - ) - context.advance_cursor({asset_key: execution_value}) diff --git a/python/popgetter/cloud_outputs/sensor_class.py b/python/popgetter/cloud_outputs/sensor_class.py new file mode 100644 index 0000000..5ba76e7 --- /dev/null +++ b/python/popgetter/cloud_outputs/sensor_class.py @@ -0,0 +1,98 @@ +from functools import reduce + +from dagster import ( + AssetSelection, + DefaultSensorStatus, + Output, + RunRequest, + StaticPartitionsDefinition, + asset, + load_assets_from_current_module, + multi_asset_sensor, +) +from icecream import ic + + +class CloudAssetSensor: + """ + Class which scaffolds a cloud sensor. This class defines a publishing + asset, which uses a custom IO manager to publish data either to Azure or + local storage. + + The publishing asset in turn monitors a list of country-specific assets, + which are responsible for generating the data that are to be published. + + Arguments + --------- + + `asset_names_to_monitor`: list[str] + A list of asset names in the country pipelines that we want to monitor. + Each asset name should be a single string and should be prefixed with + the asset group name and a forward slash (e.g. 'be/geometry'). + + `io_manager_key`: str + The key of the IO manager used for publishing. See the 'resources' and + 'resources_by_env' dicts in python/popgetter/__init__.py. + + `prefix`: str + A string used to disambiguate the different assets / sensors that are + being generated here, as Dagster does not allow duplicated names. + + `interval`: int + The minimum interval at which the sensor should run, in seconds. 
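+
+    Example
+    -------
+
+    An illustrative sketch of how the factory is intended to be wired up
+    (this mirrors the usage in python/popgetter/cloud_outputs/__init__.py;
+    the argument values below are examples only):
+
+        geometry_factory = CloudAssetSensor(
+            asset_names_to_monitor=["be/geometry"],
+            io_manager_key="geometry_io_manager",
+            prefix="geometry",
+            interval=60,
+        )
+        geometry_sensor = geometry_factory.create_sensor()
+        geometry_asset = geometry_factory.create_publishing_asset()
+
+    The returned sensor and publishing asset are then registered with the
+    Dagster Definitions object in python/popgetter/__init__.py.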
+ """ + + def __init__( + self, + asset_names_to_monitor: list[str], + io_manager_key: str, + prefix: str, + interval: int, + ): + self.asset_names_to_monitor = asset_names_to_monitor + self.io_manager_key = io_manager_key + self.publishing_asset_name = f"publish_{prefix}" + self.sensor_name = f"sensor_{prefix}" + self.interval = interval + + self.partition_definition = StaticPartitionsDefinition( + self.asset_names_to_monitor + ) + self.assets_to_monitor = reduce( + lambda x, y: x | y, + [AssetSelection.keys(k) for k in self.asset_names_to_monitor], + ) + + def create_publishing_asset(self): + @asset( + name=self.publishing_asset_name, + partitions_def=self.partition_definition, + io_manager_key=self.io_manager_key, + ) + def publish(context): + from popgetter import defs as popgetter_defs + + # load_asset_value expects a list of strings + output = popgetter_defs.load_asset_value(context.partition_key.split("/")) + return Output(output) + + return publish + + def create_sensor(self): + @multi_asset_sensor( + monitored_assets=self.assets_to_monitor, + request_assets=AssetSelection.keys(self.publishing_asset_name), + name=self.sensor_name, + minimum_interval_seconds=self.interval, + default_status=DefaultSensorStatus.RUNNING, + ) + def inner_sensor(context): + asset_events = context.latest_materialization_records_by_key() + for asset_key, execution_value in asset_events.items(): + yield RunRequest( + run_key=None, + partition_key="/".join(asset_key.path), + ) + context.advance_cursor({asset_key: execution_value}) + + return inner_sensor From 72f279c6fe524dd74629ecf96970eb9f15bf5240 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Thu, 16 May 2024 16:15:33 +0100 Subject: [PATCH 27/35] Add one unpartitioned output for Belgian metrics, tidy some code --- python/popgetter/assets/be/census_derived.py | 358 +++++++++--------- python/popgetter/assets/be/census_geometry.py | 2 +- python/popgetter/assets/be/census_tables.py | 4 +- python/popgetter/cloud_outputs/__init__.py | 2 +- .../popgetter/cloud_outputs/sensor_class.py | 4 +- python/popgetter/metadata.py | 6 +- 6 files changed, 183 insertions(+), 193 deletions(-) diff --git a/python/popgetter/assets/be/census_derived.py b/python/popgetter/assets/be/census_derived.py index baad50a..a4e8911 100644 --- a/python/popgetter/assets/be/census_derived.py +++ b/python/popgetter/assets/be/census_derived.py @@ -1,14 +1,18 @@ from __future__ import annotations +from collections.abc import Callable +from dataclasses import dataclass +from functools import reduce + import pandas as pd from dagster import ( AssetIn, - AssetOut, + IdentityPartitionMapping, MetadataValue, + Output, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, - multi_asset, ) from icecream import ic @@ -48,55 +52,66 @@ needed_dataset_mapping = SpecificPartitionsPartitionMapping(_needed_dataset_nodes) needed_dataset_partition = StaticPartitionsDefinition(_needed_dataset_nodes) -# Using HXL tags for variable names (https://hxlstandard.org/standard/1-1final/dictionary/#tag_population) -_derived_columns: list[dict] = [ - { - "node": "https://statbel.fgov.be/node/4689", - "hxltag": "population_children_age5_17", - "group_by_column": "CD_REFNIS", - "filter_func": lambda df: (df["CD_AGE"] >= 5) & (df["CD_AGE"] < 18), - }, - { - "node": "https://statbel.fgov.be/node/4689", - "hxltag": "population_infants_age0_4", - "group_by_column": "CD_REFNIS", - "filter_func": lambda df: (df["CD_AGE"] <= 4), - }, - { - "node": "https://statbel.fgov.be/node/4689", - "hxltag": 
"population_children_age0_17", - "group_by_column": "CD_REFNIS", - "filter_func": lambda df: (df["CD_AGE"] >= 0) & (df["CD_AGE"] < 18), - }, - { - "node": "https://statbel.fgov.be/node/4689", - "hxltag": "population_adults_f", - "group_by_column": "CD_REFNIS", - "filter_func": lambda df: (df["CD_AGE"] > 18) & (df["CD_SEX"] == "F"), - }, - { - "node": "https://statbel.fgov.be/node/4689", - "hxltag": "population_adults_m", - "group_by_column": "CD_REFNIS", - "filter_func": lambda df: (df["CD_AGE"] > 18) & (df["CD_SEX"] == "M"), - }, - { - "node": "https://statbel.fgov.be/node/4689", - "hxltag": "population_adults", - "group_by_column": "CD_REFNIS", - "filter_func": lambda df: (df["CD_AGE"] > 18), - }, - { - "node": "https://statbel.fgov.be/node/4689", - "hxltag": "population_ind", - "group_by_column": "CD_REFNIS", - "filter_func": lambda df: (df["CD_AGE"] >= 0), - }, -] -derived_columns = pd.DataFrame( - _derived_columns, columns=["node", "hxltag", "group_by_column", "filter_func"] -) +@dataclass +class DerivedColumn: + hxltag: str + filter_func: Callable[[pd.DataFrame], pd.DataFrame] + output_column_name: str + human_readable_name: str + + +# The keys of this dict are the nodes (i.e. partition keys). The values are a +# list of all columns of data derived from this node. +DERIVED_COLUMN_SPECIFICATIONS: dict[str, (str, [DerivedColumn])] = { + "https://statbel.fgov.be/node/4689": ( + "CD_REFNIS", + [ + DerivedColumn( + hxltag="#population+children+age5_17", + filter_func=lambda df: df.query("CD_AGE >= 5 and CD_AGE < 18"), + output_column_name="children_5_17", + human_readable_name="Children aged 5 to 17", + ), + DerivedColumn( + hxltag="#population+infants+age0_4", + filter_func=lambda df: df.query("CD_AGE >= 0 and CD_AGE < 5"), + output_column_name="infants_0_4", + human_readable_name="Infants aged 0 to 4", + ), + DerivedColumn( + hxltag="#population+children+age0_17", + filter_func=lambda df: df.query("CD_AGE >= 0 and CD_AGE < 18"), + output_column_name="children_0_17", + human_readable_name="Children aged 0 to 17", + ), + DerivedColumn( + hxltag="#population+adults+f", + filter_func=lambda df: df.query("CD_AGE >= 18 and CD_SEX == 'F'"), + output_column_name="adults_f", + human_readable_name="Female adults", + ), + DerivedColumn( + hxltag="#population+adults+m", + filter_func=lambda df: df.query("CD_AGE >= 18 and CD_SEX == 'M'"), + output_column_name="adults_m", + human_readable_name="Male adults", + ), + DerivedColumn( + hxltag="#population+adults", + filter_func=lambda df: df.query("CD_AGE >= 18"), + output_column_name="adults", + human_readable_name="Adults", + ), + DerivedColumn( + hxltag="#population+ind", + filter_func=lambda df: df, + output_column_name="individuals", + human_readable_name="Total individuals", + ), + ], + ) +} @asset(key_prefix=asset_prefix) @@ -132,17 +147,15 @@ def make_census_table_metadata( source_archive_file_path=catalog_row["source_archive_file_path"], source_documentation_url=catalog_row["source_documentation_url"], source_data_release_id=source_data_release.id, - # TODO - this is a placeholder - parent_metric_id="unknown_at_this_stage", + parent_metric_id=None, potential_denominator_ids=None, parquet_margin_of_error_file=None, parquet_margin_of_error_column=None, parquet_column_name=catalog_row["source_column"], - # TODO - this is a placeholder - metric_parquet_file_url="unknown_at_this_stage", + metric_parquet_path="__PLACEHOLDER__", hxl_tag=catalog_row["hxltag"], description=catalog_row["description"], - source_metric_id=catalog_row["hxltag"], + 
source_metric_id=catalog_row["source_column"], ) @@ -161,9 +174,7 @@ def filter_needed_catalog( needed_df = needed_datasets.merge(catalog_as_dataframe, how="inner", on="node") - # Now add some metadata to the context context.add_output_metadata( - # Metadata can be any key-value pair metadata={ "num_records": len(needed_df), "columns": MetadataValue.md( @@ -176,169 +187,148 @@ def filter_needed_catalog( return needed_df -def census_table_metadata(catalog_row: dict) -> MetricMetadata: - return MetricMetadata( - human_readable_name=catalog_row["human_readable_name"], - source_download_url=catalog_row["source_download_url"], - source_archive_file_path=catalog_row["source_archive_file_path"], - source_documentation_url=catalog_row["source_documentation_url"], - source_data_release_id="TODO", - # TODO - this is a placeholder - parent_metric_id="unknown_at_this_stage", - potential_denominator_ids=None, - parquet_margin_of_error_file=None, - parquet_margin_of_error_column=None, - parquet_column_name=catalog_row["source_column"], - # TODO - this is a placeholder - metric_parquet_file_url="unknown_at_this_stage", - hxl_tag=catalog_row["hxltag"], - description=catalog_row["description"], - source_metric_id=catalog_row["hxltag"], - ) - - -@multi_asset( +@asset( ins={ "individual_census_table": AssetIn( key_prefix=asset_prefix, partition_mapping=needed_dataset_mapping ), "filter_needed_catalog": AssetIn(key_prefix=asset_prefix), - }, - outs={ - "source_table": AssetOut(key_prefix=asset_prefix), - "source_mmd": AssetOut(key_prefix=asset_prefix), + "source_data_release_munip": AssetIn(key_prefix=asset_prefix), }, partitions_def=dataset_node_partition, + key_prefix=asset_prefix, ) -def get_enriched_tables( - context, individual_census_table, filter_needed_catalog -) -> tuple[pd.DataFrame, MetricMetadata]: - ic(context) - partition_keys = context.asset_partition_keys_for_input( +def source_metrics_by_partition( + context, + individual_census_table: dict[str, pd.DataFrame], + filter_needed_catalog: pd.DataFrame, + # TODO: generalise to list or dict of SourceDataReleases as there may be + # tables in here that are not at the same release level + source_data_release_munip: SourceDataRelease, + # TODO: return an intermediate type instead of MetricMetadata +) -> tuple[MetricMetadata, pd.DataFrame]: + input_partition_keys = context.asset_partition_keys_for_input( input_name="individual_census_table" ) - output_partition = context.asset_partition_key_for_output("source_table") - ic(partition_keys) - ic(len(partition_keys)) - ic(output_partition) - ic(type(output_partition)) - - if output_partition not in partition_keys: - err_msg = f"Requested partition {output_partition} not found in the subset of 'needed' partitions {partition_keys}" - raise ValueError(err_msg) + output_partition_key = context.partition_key - if output_partition not in individual_census_table: + if output_partition_key not in input_partition_keys: + skip_reason = f"Skipping as requested partition {output_partition_key} is not part of the 'needed' partitions {input_partition_keys}" + context.log.warning(skip_reason) + raise RuntimeError(skip_reason) + + try: + result_df = individual_census_table[output_partition_key] + except KeyError: err_msg = ( - f"Partition key {output_partition} not found in individual_census_table\n" + f"Partition key {output_partition_key} not found in individual_census_table\n" f"Available keys are {individual_census_table.keys()}" ) raise ValueError(err_msg) - result_df = individual_census_table[output_partition] 
catalog_row = filter_needed_catalog[ - filter_needed_catalog["node"].eq(output_partition) - ] - ic(catalog_row) - ic(type(catalog_row)) - catalog_row = catalog_row.to_dict(orient="index") - ic(catalog_row) - ic(type(catalog_row)) - catalog_row = catalog_row.popitem()[1] - ic(catalog_row) - ic(type(catalog_row)) + filter_needed_catalog["node"] == output_partition_key + ].to_dict(orient="records")[0] - result_mmd = census_table_metadata(catalog_row) + result_mmd = make_census_table_metadata(catalog_row, source_data_release_munip) - return result_df, result_mmd + return result_mmd, result_df -@multi_asset( +@asset( partitions_def=dataset_node_partition, ins={ - "source_data_release": AssetIn(key_prefix=asset_prefix), - "source_table": AssetIn( - key_prefix=asset_prefix, partition_mapping=needed_dataset_mapping - ), - "source_mmd": AssetIn( - key_prefix=asset_prefix, partition_mapping=needed_dataset_mapping + "source_metrics_by_partition": AssetIn( + key_prefix=asset_prefix, partition_mapping=IdentityPartitionMapping() ), }, - outs={ - "derived_table": AssetOut(key_prefix=asset_prefix), - "derived_mmds": AssetOut(key_prefix=asset_prefix), - }, + key_prefix=asset_prefix, ) -def pivot_data( +def derived_metrics_by_partition( context, - source_data_release: SourceDataRelease, - source_table: dict[str, pd.DataFrame], - source_mmd: dict[str, MetricMetadata], -) -> tuple[pd.DataFrame, list[MetricMetadata]]: - node = context.asset_partition_key_for_output("derived_table") - - census_table = source_table[node] - parent_mmd = source_mmd[node] - - ic(census_table.columns) - ic(parent_mmd.parquet_column_name) - assert parent_mmd.parquet_column_name in census_table.columns - assert len(census_table) > 0 - - ic(census_table.head()) - ic(parent_mmd) - - source_column = parent_mmd.parquet_column_name - metrics = derived_columns[derived_columns["node"].eq(node)] - - # TODO, check whether it is necessary to forcibly remove columns that are not - # meaningful for the aggregation. 
- - new_table: pd.DataFrame = pd.DataFrame() - - new_mmds: list[MetricMetadata] = [] - - for row_tuple in metrics.itertuples(): - ic(row_tuple) - _, _, col_name, group_by_column, filter = row_tuple - new_col_def = {col_name: pd.NamedAgg(column=source_column, aggfunc="sum")} - subset = census_table.loc[filter] - ic(subset.head()) - ic(len(subset)) - temp_table: pd.DataFrame = subset.groupby( - by=group_by_column, as_index=True - ).agg( - func=None, - **new_col_def, # type: ignore TODO, don't know why pyright is complaining here + source_metrics_by_partition: tuple[MetricMetadata, pd.DataFrame], +) -> tuple[list[MetricMetadata], pd.DataFrame]: + node = context.partition_key + + source_mmd, source_table = source_metrics_by_partition + source_column = source_mmd.parquet_column_name + assert source_column in source_table.columns + assert len(source_table) > 0 + + try: + geo_id_col_name, metric_specs = DERIVED_COLUMN_SPECIFICATIONS[node] + except KeyError: + skip_reason = ( + f"Skipping as no derived columns are to be created for node {node}" ) + context.log.warning(skip_reason) + raise RuntimeError(skip_reason) - new_mmd = parent_mmd.copy() - new_mmd.source_data_release_id = source_data_release.id - new_mmd.parent_metric_id = parent_mmd.source_metric_id - new_mmd.hxl_tag = col_name - new_mmds.append(new_mmd) + # Rename the geoID column to GEO_ID + source_table = source_table.rename(columns={geo_id_col_name: "GEO_ID"}) - if len(new_table) == 0: - new_table = temp_table - else: - new_table = new_table.merge( - temp_table, left_index=True, right_index=True, how="inner" - ) + derived_metrics: list[pd.DataFrame] = [] + derived_mmd: list[MetricMetadata] = [] - context.add_output_metadata( - output_name="derived_table", - metadata={ - "num_records": len(new_table), - "metrics_preview": MetadataValue.md(new_table.head().to_markdown()), - }, + parquet_file_name = "".join(c for c in node if c.isalnum()) + ".parquet" + + for metric_spec in metric_specs: + new_table = ( + source_table.pipe(metric_spec.filter_func) + .groupby(by="GEO_ID", as_index=True) + .sum() + .rename(columns={source_column: metric_spec.output_column_name}) + .filter(items=["GEO_ID", metric_spec.output_column_name]) + ) + derived_metrics.append(new_table) + + new_mmd = source_mmd.copy() + new_mmd.parent_metric_id = source_mmd.source_metric_id + new_mmd.metric_parquet_path = parquet_file_name + new_mmd.hxl_tag = metric_spec.hxltag + new_mmd.parquet_column_name = metric_spec.output_column_name + new_mmd.human_readable_name = metric_spec.human_readable_name + derived_mmd.append(new_mmd) + + joined_metrics = reduce( + lambda left, right: pd.merge( + left, right, on="GEO_ID", how="inner", validate="one_to_one" + ), + derived_metrics, ) + context.add_output_metadata( - output_name="derived_mmds", metadata={ - "num_records": len(new_mmds), "metadata_preview": MetadataValue.md( - metadata_to_dataframe(new_mmds).head().to_markdown() + metadata_to_dataframe(derived_mmd).head().to_markdown() ), + "metrics_shape": f"{joined_metrics.shape[0]} rows x {joined_metrics.shape[1]} columns", + "metrics_preview": MetadataValue.md(joined_metrics.head().to_markdown()), }, ) - return new_table, new_mmds + return derived_mmd, joined_metrics + + +@asset( + ins={ + "derived_metrics_by_partition": AssetIn( + key_prefix=asset_prefix, + partition_mapping=SpecificPartitionsPartitionMapping( + ["https://statbel.fgov.be/node/4689"] + ), + ), + }, + key_prefix=asset_prefix, +) +def metrics( + context, derived_metrics_by_partition: tuple[list[MetricMetadata], 
pd.DataFrame] +) -> list[tuple[list[MetricMetadata], pd.DataFrame]]: + """ + This asset exists solely to aggregate all the derived tables into one + single unpartitioned asset, which the downstream publishing tasks can use. + + Right now it is a bit boring because it only relies on one partition, but + it could be extended when we have more data products. + """ + return [derived_metrics_by_partition] diff --git a/python/popgetter/assets/be/census_geometry.py b/python/popgetter/assets/be/census_geometry.py index 28bc4ec..5a7c275 100644 --- a/python/popgetter/assets/be/census_geometry.py +++ b/python/popgetter/assets/be/census_geometry.py @@ -165,7 +165,7 @@ def geometry( @asset(key_prefix=asset_prefix) -def source_data_release( +def source_data_release_munip( geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] ) -> SourceDataRelease: """ diff --git a/python/popgetter/assets/be/census_tables.py b/python/popgetter/assets/be/census_tables.py index d39035e..fabd0f2 100644 --- a/python/popgetter/assets/be/census_tables.py +++ b/python/popgetter/assets/be/census_tables.py @@ -94,7 +94,7 @@ def catalog_as_dataframe(context, opendata_dataset_list: Graph) -> pd.DataFrame: "node": [], "human_readable_name": [], "description": [], - "metric_parquet_file_url": [], + "metric_parquet_path": [], "parquet_column_name": [], "parquet_margin_of_error_column": [], "parquet_margin_of_error_file": [], @@ -125,7 +125,7 @@ def catalog_as_dataframe(context, opendata_dataset_list: Graph) -> pd.DataFrame: ) # This is unknown at this stage - catalog_summary["metric_parquet_file_url"].append(None) + catalog_summary["metric_parquet_path"].append(None) catalog_summary["parquet_margin_of_error_column"].append(None) catalog_summary["parquet_margin_of_error_file"].append(None) catalog_summary["potential_denominator_ids"].append(None) diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index d847963..a6904ec 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -6,7 +6,7 @@ asset_names_to_monitor=[ "be/country_metadata", "be/data_publisher", - "be/source_data_release", + "be/source_data_release_munip", ], io_manager_key="metadata_io_manager", prefix="metadata", diff --git a/python/popgetter/cloud_outputs/sensor_class.py b/python/popgetter/cloud_outputs/sensor_class.py index 5ba76e7..1c32d64 100644 --- a/python/popgetter/cloud_outputs/sensor_class.py +++ b/python/popgetter/cloud_outputs/sensor_class.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from functools import reduce from dagster import ( @@ -7,10 +9,8 @@ RunRequest, StaticPartitionsDefinition, asset, - load_assets_from_current_module, multi_asset_sensor, ) -from icecream import ic class CloudAssetSensor: diff --git a/python/popgetter/metadata.py b/python/popgetter/metadata.py index ce178d0..8a324be 100644 --- a/python/popgetter/metadata.py +++ b/python/popgetter/metadata.py @@ -163,7 +163,7 @@ def id(self) -> str: description='A human readable name for the metric, something like "Total Population under 12 years old"' ) source_metric_id: str = Field( - description='The name of the metric that comes from the source dataset ( for example in the ACS this might be "B001_E001" or something similar' + description='The name of the metric that comes from the source dataset (for example in the ACS this might be "B001_E001" or something similar)' ) description: str = Field( description="A longer description of the metric which might include info on 
the caveats for the metric" @@ -171,8 +171,8 @@ def id(self) -> str: hxl_tag: str = Field( description="Field description using the Humanitarian eXchange Language (HXL) standard" ) - metric_parquet_file_url: str | None = Field( - description="The relative path output file that contains this metric value. This should be relative to the root of a base URL defined at project level and should NOT include the file extension" + metric_parquet_path: str | None = Field( + description="The path to the parquet file that contains the metric" ) parquet_column_name: str = Field( description="Name of column in the outputted parquet file which contains the metric" From bca1f450656060b18890629c6b0e356a2bc9acdc Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Thu, 16 May 2024 16:17:30 +0100 Subject: [PATCH 28/35] rename 'TopLevelMetadata' -> just 'Metadata' --- python/popgetter/__init__.py | 8 ++++---- python/popgetter/io_managers/__init__.py | 2 +- python/popgetter/io_managers/azure.py | 4 ++-- python/popgetter/io_managers/local.py | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 7796a0b..92bbcd3 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -6,11 +6,11 @@ from popgetter.io_managers.azure import ( AzureGeneralIOManager, AzureGeoIOManager, - AzureTopLevelMetadataIOManager, + AzureMetadataIOManager, ) from popgetter.io_managers.local import ( LocalGeoIOManager, - LocalTopLevelMetadataIOManager, + LocalMetadataIOManager, ) from popgetter.utils import StagingDirResource @@ -67,11 +67,11 @@ resources_by_env = { "prod": { - "metadata_io_manager": AzureTopLevelMetadataIOManager(), + "metadata_io_manager": AzureMetadataIOManager(), "geometry_io_manager": AzureGeoIOManager(), }, "dev": { - "metadata_io_manager": LocalTopLevelMetadataIOManager(), + "metadata_io_manager": LocalMetadataIOManager(), "geometry_io_manager": LocalGeoIOManager(), }, } diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 5d1ed52..f2e1752 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -30,7 +30,7 @@ def load_input(self, _context: InputContext) -> pd.DataFrame: raise RuntimeError(err_msg) -class TopLevelMetadataIOManager(PopgetterIOManager): +class MetadataIOManager(PopgetterIOManager): def get_output_filename( self, obj: CountryMetadata | DataPublisher | SourceDataRelease ) -> str: diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py index a8e4f2b..996bb0b 100644 --- a/python/popgetter/io_managers/azure.py +++ b/python/popgetter/io_managers/azure.py @@ -27,7 +27,7 @@ from icecream import ic from upath import UPath -from . import GeoIOManager, TopLevelMetadataIOManager +from . 
import GeoIOManager, MetadataIOManager # Set no time limit on lease duration to enable large files to be uploaded _LEASE_DURATION = -1 @@ -139,7 +139,7 @@ def handle_df(self, context: OutputContext, df: pd.DataFrame, path: UPath) -> No self.dump_to_path(context, df.to_parquet(None), path) -class AzureTopLevelMetadataIOManager(AzureMixin, TopLevelMetadataIOManager): +class AzureMetadataIOManager(AzureMixin, MetadataIOManager): pass diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py index 4a04ed0..64f06a8 100644 --- a/python/popgetter/io_managers/local.py +++ b/python/popgetter/io_managers/local.py @@ -7,7 +7,7 @@ from dagster import OutputContext from upath import UPath -from . import GeoIOManager, TopLevelMetadataIOManager +from . import GeoIOManager, MetadataIOManager class LocalMixin: @@ -29,7 +29,7 @@ def handle_df( df.to_parquet(full_path) -class LocalTopLevelMetadataIOManager(LocalMixin, TopLevelMetadataIOManager): +class LocalMetadataIOManager(LocalMixin, MetadataIOManager): pass From d5d5ef1f8e042271c8517a57ab36b38cf433a2a0 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Thu, 16 May 2024 16:42:04 +0100 Subject: [PATCH 29/35] Metrics IO manager + cloud sensor --- python/popgetter/__init__.py | 5 +++ python/popgetter/assets/be/census_derived.py | 13 ++++--- python/popgetter/cloud_outputs/__init__.py | 12 ++++++ python/popgetter/io_managers/__init__.py | 39 ++++++++++++++++++++ python/popgetter/io_managers/azure.py | 6 ++- python/popgetter/io_managers/local.py | 6 ++- 6 files changed, 73 insertions(+), 8 deletions(-) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 92bbcd3..66ac17a 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -7,10 +7,12 @@ AzureGeneralIOManager, AzureGeoIOManager, AzureMetadataIOManager, + AzureMetricsIOManager, ) from popgetter.io_managers.local import ( LocalGeoIOManager, LocalMetadataIOManager, + LocalMetricsIOManager, ) from popgetter.utils import StagingDirResource @@ -69,10 +71,12 @@ "prod": { "metadata_io_manager": AzureMetadataIOManager(), "geometry_io_manager": AzureGeoIOManager(), + "metrics_io_manager": AzureMetricsIOManager(), }, "dev": { "metadata_io_manager": LocalMetadataIOManager(), "geometry_io_manager": LocalGeoIOManager(), + "metrics_io_manager": LocalMetricsIOManager(), }, } @@ -92,6 +96,7 @@ sensors=[ cloud_outputs.metadata_sensor, cloud_outputs.geometry_sensor, + cloud_outputs.metrics_sensor, ], resources=resources, jobs=[job_be, job_us, job_uk], diff --git a/python/popgetter/assets/be/census_derived.py b/python/popgetter/assets/be/census_derived.py index a4e8911..e594afc 100644 --- a/python/popgetter/assets/be/census_derived.py +++ b/python/popgetter/assets/be/census_derived.py @@ -9,7 +9,6 @@ AssetIn, IdentityPartitionMapping, MetadataValue, - Output, SpecificPartitionsPartitionMapping, StaticPartitionsDefinition, asset, @@ -224,7 +223,7 @@ def source_metrics_by_partition( f"Partition key {output_partition_key} not found in individual_census_table\n" f"Available keys are {individual_census_table.keys()}" ) - raise ValueError(err_msg) + raise ValueError(err_msg) from None catalog_row = filter_needed_catalog[ filter_needed_catalog["node"] == output_partition_key @@ -291,8 +290,8 @@ def derived_metrics_by_partition( derived_mmd.append(new_mmd) joined_metrics = reduce( - lambda left, right: pd.merge( - left, right, on="GEO_ID", how="inner", validate="one_to_one" + lambda left, right: left.merge( + right, on="GEO_ID", how="inner", 
validate="one_to_one" ), derived_metrics, ) @@ -323,7 +322,7 @@ def derived_metrics_by_partition( ) def metrics( context, derived_metrics_by_partition: tuple[list[MetricMetadata], pd.DataFrame] -) -> list[tuple[list[MetricMetadata], pd.DataFrame]]: +) -> list[tuple[str, list[MetricMetadata], pd.DataFrame]]: """ This asset exists solely to aggregate all the derived tables into one single unpartitioned asset, which the downstream publishing tasks can use. @@ -331,4 +330,6 @@ def metrics( Right now it is a bit boring because it only relies on one partition, but it could be extended when we have more data products. """ - return [derived_metrics_by_partition] + mmds, table = derived_metrics_by_partition + filepath = mmds[0].metric_parquet_path + return [(filepath, mmds, table)] diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index a6904ec..96ba6f8 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -27,3 +27,15 @@ geometry_sensor = geometry_factory.create_sensor() geometry_asset = geometry_factory.create_publishing_asset() + +metrics_factory = CloudAssetSensor( + asset_names_to_monitor=[ + "be/metrics", + ], + io_manager_key="metrics_io_manager", + prefix="metrics", + interval=60, +) + +metrics_sensor = metrics_factory.create_sensor() +metrics_asset = metrics_factory.create_publishing_asset() diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index f2e1752..5bf6dd1 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -11,6 +11,7 @@ CountryMetadata, DataPublisher, GeometryMetadata, + MetricMetadata, SourceDataRelease, metadata_to_dataframe, ) @@ -156,3 +157,41 @@ def handle_output( **output_metadata, } ) + + +class MetricsIOManager(PopgetterIOManager): + def get_full_path_metadata( + self, + context: OutputContext, + ) -> UPath: + base_path = self.get_base_path() + asset_prefix = list(context.partition_key.split("/"))[:-1] + return base_path / UPath("/".join([*asset_prefix, "metric_metadata.parquet"])) + + def get_full_path_metrics( + self, + context: OutputContext, + parquet_path: str, + ) -> UPath: + base_path = self.get_base_path() + asset_prefix = list(context.partition_key.split("/"))[:-1] + return base_path / UPath("/".join([*asset_prefix, "metrics", parquet_path])) + + def handle_output( + self, + context: OutputContext, + obj: list[tuple[str, list[MetricMetadata], pd.DataFrame]], + ) -> None: + # Aggregate all the MetricMetadatas into a single dataframe, then + # serialise + all_metadatas_df = metadata_to_dataframe( + [md for _, per_file_metadatas, _ in obj for md in per_file_metadatas] + ) + metadata_df_filepath = self.get_full_path_metadata(context) + self.handle_df(context, all_metadatas_df, metadata_df_filepath) + + # Write dataframes to the parquet files specified in the first element + # of the tuple + for rel_path, _, df in obj: + full_path = self.get_full_path_metrics(context, rel_path) + self.handle_df(context, df, full_path) diff --git a/python/popgetter/io_managers/azure.py b/python/popgetter/io_managers/azure.py index 996bb0b..fdc8155 100644 --- a/python/popgetter/io_managers/azure.py +++ b/python/popgetter/io_managers/azure.py @@ -27,7 +27,7 @@ from icecream import ic from upath import UPath -from . import GeoIOManager, MetadataIOManager +from . 
import GeoIOManager, MetadataIOManager, MetricsIOManager # Set no time limit on lease duration to enable large files to be uploaded _LEASE_DURATION = -1 @@ -177,6 +177,10 @@ def handle_geo_metadata( self.dump_to_path(context, geo_metadata_df.to_parquet(None), full_path) +class AzureMetricsIOManager(AzureMixin, MetricsIOManager): + pass + + class AzureGeneralIOManager(AzureMixin, IOManager): """This class is used only for an asset which tests the Azure functionality (see cloud_outputs/azure_test.py). It is not used for publishing any diff --git a/python/popgetter/io_managers/local.py b/python/popgetter/io_managers/local.py index 64f06a8..a1f6163 100644 --- a/python/popgetter/io_managers/local.py +++ b/python/popgetter/io_managers/local.py @@ -7,7 +7,7 @@ from dagster import OutputContext from upath import UPath -from . import GeoIOManager, MetadataIOManager +from . import GeoIOManager, MetadataIOManager, MetricsIOManager class LocalMixin: @@ -45,3 +45,7 @@ def handle_geojsonseq( ) -> None: self.make_parent_dirs(full_path) geo_df.to_file(full_path, driver="GeoJSONSeq") + + +class LocalMetricsIOManager(LocalMixin, MetricsIOManager): + pass From 3f3e62e441605dc1dcd510cf3b09a8a90e50fca2 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Fri, 17 May 2024 00:55:10 +0100 Subject: [PATCH 30/35] Set $IGNORE_EXPERIMENTAL_WARNINGS to any nonempty value to suppress them --- python/popgetter/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 66ac17a..2ae8718 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -1,8 +1,12 @@ from __future__ import annotations +import os +import warnings from collections.abc import Sequence from pathlib import Path +from dagster import ExperimentalWarning + from popgetter.io_managers.azure import ( AzureGeneralIOManager, AzureGeoIOManager, @@ -21,6 +25,10 @@ __all__ = ["__version__"] +if "IGNORE_EXPERIMENTAL_WARNINGS" in os.environ: + warnings.filterwarnings("ignore", category=ExperimentalWarning) + + import os from dagster import ( From fd5cabbd001a7b6010cfe664bf34ad997392f068 Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Fri, 17 May 2024 01:00:48 +0100 Subject: [PATCH 31/35] Add output metadata to MetricsIOManager --- python/popgetter/io_managers/__init__.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 5bf6dd1..0fb2ccb 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -195,3 +195,21 @@ def handle_output( for rel_path, _, df in obj: full_path = self.get_full_path_metrics(context, rel_path) self.handle_df(context, df, full_path) + + # Add metadata + context.add_output_metadata( + metadata={ + "metric_parquet_paths": [ + str(self.get_full_path_metrics(context, rel_path)) + for rel_path, _, _ in obj + ], + "num_metrics": len(all_metadatas_df), + "metric_human_readable_names": all_metadatas_df[ + "human_readable_name" + ].tolist(), + "metadata_parquet_path": str(metadata_df_filepath), + "metadata_preview": MetadataValue.md( + all_metadatas_df.head().to_markdown() + ), + } + ) From 77e645dc4e4a47284d54efde08da42c28df280bc Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Fri, 17 May 2024 01:01:05 +0100 Subject: [PATCH 32/35] Stop the sensor from rerunning on assets that aren't freshly materialised --- python/popgetter/cloud_outputs/sensor_class.py | 15 ++++++++++----- 1 file changed, 10 
insertions(+), 5 deletions(-) diff --git a/python/popgetter/cloud_outputs/sensor_class.py b/python/popgetter/cloud_outputs/sensor_class.py index 1c32d64..d0b75fe 100644 --- a/python/popgetter/cloud_outputs/sensor_class.py +++ b/python/popgetter/cloud_outputs/sensor_class.py @@ -89,10 +89,15 @@ def create_sensor(self): def inner_sensor(context): asset_events = context.latest_materialization_records_by_key() for asset_key, execution_value in asset_events.items(): - yield RunRequest( - run_key=None, - partition_key="/".join(asset_key.path), - ) - context.advance_cursor({asset_key: execution_value}) + # Assets which were materialised since the last time the sensor + # was run will have non-None execution_values (it will be an + # dagster.EventLogRecord class, which contains more information + # if needed). + if execution_value is not None: + yield RunRequest( + run_key=None, + partition_key="/".join(asset_key.path), + ) + context.advance_cursor({asset_key: execution_value}) return inner_sensor From 697efbcd34f610a53c13d6789b3fae2af6df620e Mon Sep 17 00:00:00 2001 From: Jonathan Yong Date: Fri, 17 May 2024 01:06:51 +0100 Subject: [PATCH 33/35] Fix type errors --- python/popgetter/assets/be/census_derived.py | 12 ++++++++++-- python/popgetter/metadata.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/python/popgetter/assets/be/census_derived.py b/python/popgetter/assets/be/census_derived.py index e594afc..e825c1a 100644 --- a/python/popgetter/assets/be/census_derived.py +++ b/python/popgetter/assets/be/census_derived.py @@ -62,7 +62,7 @@ class DerivedColumn: # The keys of this dict are the nodes (i.e. partition keys). The values are a # list of all columns of data derived from this node. -DERIVED_COLUMN_SPECIFICATIONS: dict[str, (str, [DerivedColumn])] = { +DERIVED_COLUMN_SPECIFICATIONS: dict[str, tuple[str, list[DerivedColumn]]] = { "https://statbel.fgov.be/node/4689": ( "CD_REFNIS", [ @@ -261,7 +261,7 @@ def derived_metrics_by_partition( f"Skipping as no derived columns are to be created for node {node}" ) context.log.warning(skip_reason) - raise RuntimeError(skip_reason) + raise RuntimeError(skip_reason) from None # Rename the geoID column to GEO_ID source_table = source_table.rename(columns={geo_id_col_name: "GEO_ID"}) @@ -332,4 +332,12 @@ def metrics( """ mmds, table = derived_metrics_by_partition filepath = mmds[0].metric_parquet_path + + context.add_output_metadata( + metadata={ + "num_metrics": len(mmds), + "num_parquets": 1, + }, + ) + return [(filepath, mmds, table)] diff --git a/python/popgetter/metadata.py b/python/popgetter/metadata.py index 8a324be..c8d148a 100644 --- a/python/popgetter/metadata.py +++ b/python/popgetter/metadata.py @@ -171,7 +171,7 @@ def id(self) -> str: hxl_tag: str = Field( description="Field description using the Humanitarian eXchange Language (HXL) standard" ) - metric_parquet_path: str | None = Field( + metric_parquet_path: str = Field( description="The path to the parquet file that contains the metric" ) parquet_column_name: str = Field( From 1ff9abb7251298d9508415a28a6cc1db5fb58709 Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 17 May 2024 12:43:30 +0100 Subject: [PATCH 34/35] Revise metadata IO manager to handle individual, list or dict metadata Co-authored-by: Jonathan Yong --- python/popgetter/assets/be/census_derived.py | 11 +++-- python/popgetter/assets/be/census_geometry.py | 44 ++++++++----------- python/popgetter/cloud_outputs/__init__.py | 2 +- python/popgetter/io_managers/__init__.py | 43 ++++++++++++++---- 4 files 
changed, 62 insertions(+), 38 deletions(-) diff --git a/python/popgetter/assets/be/census_derived.py b/python/popgetter/assets/be/census_derived.py index e825c1a..8e3bbcc 100644 --- a/python/popgetter/assets/be/census_derived.py +++ b/python/popgetter/assets/be/census_derived.py @@ -192,7 +192,7 @@ def filter_needed_catalog( key_prefix=asset_prefix, partition_mapping=needed_dataset_mapping ), "filter_needed_catalog": AssetIn(key_prefix=asset_prefix), - "source_data_release_munip": AssetIn(key_prefix=asset_prefix), + "source_data_releases": AssetIn(key_prefix=asset_prefix), }, partitions_def=dataset_node_partition, key_prefix=asset_prefix, @@ -203,7 +203,8 @@ def source_metrics_by_partition( filter_needed_catalog: pd.DataFrame, # TODO: generalise to list or dict of SourceDataReleases as there may be # tables in here that are not at the same release level - source_data_release_munip: SourceDataRelease, + # E.g. keys as Geography level ID + source_data_releases: dict[str, SourceDataRelease], # TODO: return an intermediate type instead of MetricMetadata ) -> tuple[MetricMetadata, pd.DataFrame]: input_partition_keys = context.asset_partition_keys_for_input( @@ -229,7 +230,11 @@ def source_metrics_by_partition( filter_needed_catalog["node"] == output_partition_key ].to_dict(orient="records")[0] - result_mmd = make_census_table_metadata(catalog_row, source_data_release_munip) + # TODO: refine upon more general level handling with derived column config. + # This config is currently called `DERIVED_COLUMN_SPECIFICATIONS` here and the + # level can also be included there. + key = "municipality" + result_mmd = make_census_table_metadata(catalog_row, source_data_releases[key]) return result_mmd, result_df diff --git a/python/popgetter/assets/be/census_geometry.py b/python/popgetter/assets/be/census_geometry.py index 5a7c275..f58a12a 100644 --- a/python/popgetter/assets/be/census_geometry.py +++ b/python/popgetter/assets/be/census_geometry.py @@ -165,31 +165,25 @@ def geometry( @asset(key_prefix=asset_prefix) -def source_data_release_munip( +def source_data_releases( geometry: list[tuple[GeometryMetadata, gpd.GeoDataFrame, pd.DataFrame]] -) -> SourceDataRelease: +) -> dict[str, SourceDataRelease]: """ - Returns the SourceDataRelease corresponding to the municipality level. + Returns all SourceDataReleases for each geometry level. 
""" - # Pick out the municipality geometry and get its ID - for geo_metadata, _, _ in geometry: - if geo_metadata.level == "municipality": - geo_metadata_id = geo_metadata.id - break - else: - err = "No municipality geometry found" - raise ValueError(err) - - return SourceDataRelease( - name="StatBel Open Data", - date_published=date(2015, 10, 22), - reference_period_start=date(2015, 10, 22), - reference_period_end=date(2015, 10, 22), - collection_period_start=date(2015, 10, 22), - collection_period_end=date(2015, 10, 22), - expect_next_update=date(2022, 1, 1), - url="https://statbel.fgov.be/en/open-data", - description="TBC", - data_publisher_id=publisher.id, - geometry_metadata_id=geo_metadata_id, - ) + return { + geo_metadata.level: SourceDataRelease( + name="StatBel Open Data", + date_published=date(2015, 10, 22), + reference_period_start=date(2015, 10, 22), + reference_period_end=date(2015, 10, 22), + collection_period_start=date(2015, 10, 22), + collection_period_end=date(2015, 10, 22), + expect_next_update=date(2022, 1, 1), + url="https://statbel.fgov.be/en/open-data", + description="TBC", + data_publisher_id=publisher.id, + geometry_metadata_id=geo_metadata.id, + ) + for geo_metadata, _, _ in geometry + } diff --git a/python/popgetter/cloud_outputs/__init__.py b/python/popgetter/cloud_outputs/__init__.py index 96ba6f8..1256933 100644 --- a/python/popgetter/cloud_outputs/__init__.py +++ b/python/popgetter/cloud_outputs/__init__.py @@ -6,7 +6,7 @@ asset_names_to_monitor=[ "be/country_metadata", "be/data_publisher", - "be/source_data_release_munip", + "be/source_data_releases", ], io_manager_key="metadata_io_manager", prefix="metadata", diff --git a/python/popgetter/io_managers/__init__.py b/python/popgetter/io_managers/__init__.py index 0fb2ccb..985f30b 100644 --- a/python/popgetter/io_managers/__init__.py +++ b/python/popgetter/io_managers/__init__.py @@ -17,6 +17,10 @@ ) +class IOManagerError(Exception): + pass + + class PopgetterIOManager(IOManager): def get_base_path(self) -> UPath: raise NotImplementedError @@ -33,13 +37,13 @@ def load_input(self, _context: InputContext) -> pd.DataFrame: class MetadataIOManager(PopgetterIOManager): def get_output_filename( - self, obj: CountryMetadata | DataPublisher | SourceDataRelease + self, obj_entry: CountryMetadata | DataPublisher | SourceDataRelease ) -> str: - if isinstance(obj, CountryMetadata): + if isinstance(obj_entry, CountryMetadata): return "country_metadata.parquet" - if isinstance(obj, DataPublisher): + if isinstance(obj_entry, DataPublisher): return "data_publishers.parquet" - if isinstance(obj, SourceDataRelease): + if isinstance(obj_entry, SourceDataRelease): return "source_data_releases.parquet" err_msg = "This IO manager only accepts CountryMetadata, DataPublisher, and SourceDataRelease" @@ -48,20 +52,41 @@ def get_output_filename( def get_full_path( self, context: OutputContext, - obj: CountryMetadata | DataPublisher | SourceDataRelease, + obj_entry: CountryMetadata | DataPublisher | SourceDataRelease, ) -> UPath: path_prefixes = list(context.partition_key.split("/"))[:-1] - filename = self.get_output_filename(obj) + filename = self.get_output_filename(obj_entry) return self.get_base_path() / UPath("/".join([*path_prefixes, filename])) def handle_output( self, context: OutputContext, - obj: CountryMetadata | DataPublisher | SourceDataRelease, + obj: ( + CountryMetadata + | DataPublisher + | SourceDataRelease + | list[CountryMetadata] + | list[DataPublisher] + | list[SourceDataRelease] + | dict[str, CountryMetadata] + | 
dict[str, DataPublisher] + | dict[str, SourceDataRelease] + ), ): - full_path = self.get_full_path(context, obj) + # Handling multiple obj types at runtime to simplify the DAG + + # TODO: add handling of: incorrect type of obj passed, empty dictionary and + # any other potential errors + if isinstance(obj, dict): + vals = list(obj.values()) + elif isinstance(obj, list): + vals = obj + else: + vals = [obj] + val = vals[0] + full_path = self.get_full_path(context, val) context.add_output_metadata(metadata={"parquet_path": str(full_path)}) - self.handle_df(context, metadata_to_dataframe([obj]), full_path) + self.handle_df(context, metadata_to_dataframe(vals), full_path) class GeoIOManager(PopgetterIOManager): From 3b2a0c27072f288737e95641c4617a7c85e6db4c Mon Sep 17 00:00:00 2001 From: Sam Greenbury Date: Fri, 17 May 2024 16:09:14 +0100 Subject: [PATCH 35/35] Fix tests and revise resources imported for provided env Co-authored-by: Jonathan Yong --- python/popgetter/__init__.py | 43 ++- .../{cloud_outputs => }/azure_test.py | 0 tests/test_cloud_outputs.py | 324 +++++++++--------- tests/test_metadata.py | 14 +- 4 files changed, 189 insertions(+), 192 deletions(-) rename python/popgetter/{cloud_outputs => }/azure_test.py (100%) diff --git a/python/popgetter/__init__.py b/python/popgetter/__init__.py index 2ae8718..144be45 100644 --- a/python/popgetter/__init__.py +++ b/python/popgetter/__init__.py @@ -38,6 +38,7 @@ PipesSubprocessClient, SourceAsset, define_asset_job, + load_assets_from_modules, load_assets_from_package_module, ) from dagster._core.definitions.cacheable_assets import ( @@ -47,13 +48,18 @@ UnresolvedAssetJobDefinition, ) -from popgetter import assets, cloud_outputs +from popgetter import assets, azure_test, cloud_outputs all_assets: Sequence[AssetsDefinition | SourceAsset | CacheableAssetsDefinition] = [ *load_assets_from_package_module(assets.us, group_name="us"), *load_assets_from_package_module(assets.be, group_name="be"), *load_assets_from_package_module(assets.uk, group_name="uk"), *load_assets_from_package_module(cloud_outputs, group_name="cloud_outputs"), + *( + load_assets_from_modules([azure_test], group_name="azure_test") + if os.getenv("ENV") == "prod" + else [] + ), ] job_be: UnresolvedAssetJobDefinition = define_asset_job( @@ -75,28 +81,35 @@ description="Downloads UK data.", ) -resources_by_env = { - "prod": { - "metadata_io_manager": AzureMetadataIOManager(), - "geometry_io_manager": AzureGeoIOManager(), - "metrics_io_manager": AzureMetricsIOManager(), - }, - "dev": { - "metadata_io_manager": LocalMetadataIOManager(), - "geometry_io_manager": LocalGeoIOManager(), - "metrics_io_manager": LocalMetricsIOManager(), - }, -} + +def resources_by_env(): + env = os.getenv("ENV", "dev") + if env == "prod": + return { + "metadata_io_manager": AzureMetadataIOManager(), + "geometry_io_manager": AzureGeoIOManager(), + "metrics_io_manager": AzureMetricsIOManager(), + "azure_general_io_manager": AzureGeneralIOManager(".bin"), + } + if env == "dev": + return { + "metadata_io_manager": LocalMetadataIOManager(), + "geometry_io_manager": LocalGeoIOManager(), + "metrics_io_manager": LocalMetricsIOManager(), + } + + err = f"$ENV should be either 'dev' or 'prod', but received '{env}'" + raise ValueError(err) + resources = { "pipes_subprocess_client": PipesSubprocessClient(), "staging_res": StagingDirResource( staging_dir=str(Path(__file__).parent.joinpath("staging_dir").resolve()) ), - "azure_general_io_manager": AzureGeneralIOManager(".bin"), } 
-resources.update(resources_by_env[os.getenv("ENV", "dev")]) +resources.update(resources_by_env()) defs: Definitions = Definitions( assets=all_assets, diff --git a/python/popgetter/cloud_outputs/azure_test.py b/python/popgetter/azure_test.py similarity index 100% rename from python/popgetter/cloud_outputs/azure_test.py rename to python/popgetter/azure_test.py diff --git a/tests/test_cloud_outputs.py b/tests/test_cloud_outputs.py index 452a7b5..7e65cfd 100644 --- a/tests/test_cloud_outputs.py +++ b/tests/test_cloud_outputs.py @@ -1,180 +1,168 @@ from __future__ import annotations -from datetime import datetime - import pytest -from dagster import ( - AssetKey, - RunRequest, - SkipReason, - StaticPartitionsDefinition, - asset, - build_asset_context, - build_multi_asset_sensor_context, - instance_for_test, - materialize_to_memory, -) -from icecream import ic # generate_pmtiles, -from popgetter import defs -from popgetter.cloud_outputs import ( - cartography_in_cloud_formats, - country_outputs_sensor, -) - +# from popgetter.cloud_outputs import ( +# cartography_in_cloud_formats, +# country_outputs_sensor, +# ) # generate_pmtiles, -from popgetter.utils import StagingDirResource - # TODO, Move this to a fixture to somewhere more universal from .test_be import demo_sectors # noqa: F401 - -def test_country_outputs_sensor(): - """ "" - This test checks that the country_outputs_sensor function returns a RunRequest - with the correct AssetKey and PartitionKey. - - Three upstream assets are mocked, which different qualities. - The context object is mocked and this also defines which assets are monitored. Hence the test does not attempt to - check that the sensor is correctly monitoring the required production assets. - - Hence this test only checks that the correct config parameters are returned via the RunRequest object(s). 
- """ - - @asset - def irrelevant_asset(): - return 1 - - @asset - def monitored_asset(): - return 2 - - @asset( - partitions_def=StaticPartitionsDefinition(["A", "B", "C"]), - ) - def partitioned_monitored_asset(context): - return ic(context.partition_key) - - # instance = DagsterInstance.ephemeral() - with instance_for_test() as instance: - ctx = build_multi_asset_sensor_context( - monitored_assets=[ - AssetKey("monitored_asset"), - AssetKey("partitioned_monitored_asset"), - ], - instance=instance, - definitions=defs, - ) - - # Nothing is materialized yet - result = list(country_outputs_sensor(ctx)) - assert len(result) == 1 - assert isinstance(result[0], SkipReason) - - # Only the irrelevant asset is materialized - materialize_to_memory([irrelevant_asset], instance=instance) - result = list(country_outputs_sensor(ctx)) - assert len(result) == 1 - assert isinstance(result[0], SkipReason) - - # Only the monitored asset is materialized - materialize_to_memory([monitored_asset], instance=instance) - result = list(country_outputs_sensor(ctx)) - assert len(result) == 1 - assert isinstance(result[0], RunRequest) - - # Both non-partitioned assets are materialized - materialize_to_memory([monitored_asset, irrelevant_asset], instance=instance) - result = list(country_outputs_sensor(ctx)) - assert len(result) == 1 - assert isinstance(result[0], RunRequest) - assert result[0].partition_key == "monitored-asset" - assert ( - result[0].run_config["ops"]["upstream_df"]["config"]["asset_to_load"] - == "monitored_asset" - ) - assert ( - result[0].run_config["ops"]["upstream_df"]["config"]["partition_to_load"] - is None - ) - - # All three assets are materialized - materialize_to_memory( - [monitored_asset, irrelevant_asset, partitioned_monitored_asset], - instance=instance, - partition_key="A", - ) - result = list(country_outputs_sensor(ctx)) - assert len(result) == 2 - - # The order of the results does not need to be guaranteed - # These two functions are used to check the results below - def assert_for_non_partitioned_assets(r): - assert isinstance(r, RunRequest) - assert r.partition_key == "monitored-asset" - assert ( - r.run_config["ops"]["upstream_df"]["config"]["asset_to_load"] - == "monitored_asset" - ) - assert ( - r.run_config["ops"]["upstream_df"]["config"]["partition_to_load"] - is None - ) - - def assert_for_partitioned_assets(r): - assert r.partition_key == "partitioned-monitored-asset-a" - assert ( - r.run_config["ops"]["upstream_df"]["config"]["asset_to_load"] - == "partitioned_monitored_asset" - ) - assert ( - r.run_config["ops"]["upstream_df"]["config"]["partition_to_load"] == "A" - ) - - # Now check that the results include one partitioned and one non-partitioned asset - try: - assert_for_non_partitioned_assets(result[0]) - assert_for_partitioned_assets(result[1]) - except AssertionError: - assert_for_non_partitioned_assets(result[1]) - assert_for_partitioned_assets(result[0]) - - -# TODO: The no QA comment below is pending moving the fixture to a more -# universal location -def test_cartography_in_cloud_formats(tmp_path, demo_sectors): # noqa: F811 - # This test is outputs each of the cartography outputs to individual files - # in the staging directory. The test then checks that the files exist and - # that they were created after the test started. 
- time_at_start = datetime.now() - - staging_res = StagingDirResource(staging_dir=str(tmp_path)) - resources_for_test = { - "staging_res": staging_res, - "unit_test_key": "test_cartography_in_cloud_formats", - } - - with ( - instance_for_test() as instance, - build_asset_context( - resources=resources_for_test, - instance=instance, - partition_key="historic-european-region", - ) as context, - ): - # Collect the results - # Results should be a generator which produces three Output objects - results = cartography_in_cloud_formats(context, demo_sectors) - - output_paths = [r.value for r in list(results)] - # There should be 3 outputs - assert len(output_paths) == 3 - - # Check that the output paths exist and were created after the test started - for output_path in output_paths: - assert output_path.exists() - assert output_path.stat().st_mtime > time_at_start.timestamp() +# Commented out test as part of #92 as functions no longer importable +# @pytest.mark.skip( +# reason="This tests old code that is no longer imported since cloud outputs were reworked in #92" +# ) +# def test_country_outputs_sensor(): +# """ "" +# This test checks that the country_outputs_sensor function returns a RunRequest +# with the correct AssetKey and PartitionKey. + +# Three upstream assets are mocked, which different qualities. +# The context object is mocked and this also defines which assets are monitored. Hence the test does not attempt to +# check that the sensor is correctly monitoring the required production assets. + +# Hence this test only checks that the correct config parameters are returned via the RunRequest object(s). +# """ + +# @asset +# def irrelevant_asset(): +# return 1 + +# @asset +# def monitored_asset(): +# return 2 + +# @asset( +# partitions_def=StaticPartitionsDefinition(["A", "B", "C"]), +# ) +# def partitioned_monitored_asset(context): +# return ic(context.partition_key) + +# # instance = DagsterInstance.ephemeral() +# with instance_for_test() as instance: +# ctx = build_multi_asset_sensor_context( +# monitored_assets=[ +# AssetKey("monitored_asset"), +# AssetKey("partitioned_monitored_asset"), +# ], +# instance=instance, +# definitions=defs, +# ) + +# # Nothing is materialized yet +# result = list(country_outputs_sensor(ctx)) +# assert len(result) == 1 +# assert isinstance(result[0], SkipReason) + +# # Only the irrelevant asset is materialized +# materialize_to_memory([irrelevant_asset], instance=instance) +# result = list(country_outputs_sensor(ctx)) +# assert len(result) == 1 +# assert isinstance(result[0], SkipReason) + +# # Only the monitored asset is materialized +# materialize_to_memory([monitored_asset], instance=instance) +# result = list(country_outputs_sensor(ctx)) +# assert len(result) == 1 +# assert isinstance(result[0], RunRequest) + +# # Both non-partitioned assets are materialized +# materialize_to_memory([monitored_asset, irrelevant_asset], instance=instance) +# result = list(country_outputs_sensor(ctx)) +# assert len(result) == 1 +# assert isinstance(result[0], RunRequest) +# assert result[0].partition_key == "monitored-asset" +# assert ( +# result[0].run_config["ops"]["upstream_df"]["config"]["asset_to_load"] +# == "monitored_asset" +# ) +# assert ( +# result[0].run_config["ops"]["upstream_df"]["config"]["partition_to_load"] +# is None +# ) + +# # All three assets are materialized +# materialize_to_memory( +# [monitored_asset, irrelevant_asset, partitioned_monitored_asset], +# instance=instance, +# partition_key="A", +# ) +# result = list(country_outputs_sensor(ctx)) +# 
assert len(result) == 2 + +# # The order of the results does not need to be guaranteed +# # These two functions are used to check the results below +# def assert_for_non_partitioned_assets(r): +# assert isinstance(r, RunRequest) +# assert r.partition_key == "monitored-asset" +# assert ( +# r.run_config["ops"]["upstream_df"]["config"]["asset_to_load"] +# == "monitored_asset" +# ) +# assert ( +# r.run_config["ops"]["upstream_df"]["config"]["partition_to_load"] +# is None +# ) + +# def assert_for_partitioned_assets(r): +# assert r.partition_key == "partitioned-monitored-asset-a" +# assert ( +# r.run_config["ops"]["upstream_df"]["config"]["asset_to_load"] +# == "partitioned_monitored_asset" +# ) +# assert ( +# r.run_config["ops"]["upstream_df"]["config"]["partition_to_load"] == "A" +# ) + +# # Now check that the results include one partitioned and one non-partitioned asset +# try: +# assert_for_non_partitioned_assets(result[0]) +# assert_for_partitioned_assets(result[1]) +# except AssertionError: +# assert_for_non_partitioned_assets(result[1]) +# assert_for_partitioned_assets(result[0]) + +# Commented out test as part of #92 as functions no longer importable +# @pytest.mark.skip( +# reason="This tests old code that is no longer imported since cloud outputs were reworked in #92" +# ) +# # TODO: The no QA comment below is pending moving the fixture to a more +# # universal location +# def test_cartography_in_cloud_formats(tmp_path, demo_sectors): +# # This test is outputs each of the cartography outputs to individual files +# # in the staging directory. The test then checks that the files exist and +# # that they were created after the test started. +# time_at_start = datetime.now() + +# staging_res = StagingDirResource(staging_dir=str(tmp_path)) +# resources_for_test = { +# "staging_res": staging_res, +# "unit_test_key": "test_cartography_in_cloud_formats", +# } + +# with ( +# instance_for_test() as instance, +# build_asset_context( +# resources=resources_for_test, +# instance=instance, +# partition_key="historic-european-region", +# ) as context, +# ): +# # Collect the results +# # Results should be a generator which produces three Output objects +# results = cartography_in_cloud_formats(context, demo_sectors) + +# output_paths = [r.value for r in list(results)] +# # There should be 3 outputs +# assert len(output_paths) == 3 + +# # Check that the output paths exist and were created after the test started +# for output_path in output_paths: +# assert output_path.exists() +# assert output_path.stat().st_mtime > time_at_start.timestamp() @pytest.mark.skip(reason="Test not implemented yet") diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 4614902..bc272b2 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -20,8 +20,7 @@ def test_source_data_release_validation_reference(): url="https://example.com", data_publisher_id="test_publisher_id", description="This is a test data release", - geography_file="test_geography_file", - geography_level="test_geography_level", + geometry_metadata_id="test_geom_id", ) @@ -38,8 +37,7 @@ def test_source_data_release_validation_collection(): url="https://example.com", data_publisher_id="test_publisher_id", description="This is a test data release", - geography_file="test_geography_file", - geography_level="test_geography_level", + geometry_metadata_id="test_geom_id", ) @@ -55,12 +53,11 @@ def test_source_data_release_hash(): url="https://example.com", data_publisher_id="test_publisher_id", description="This is a test data release", - 
geography_file="test_geography_file", - geography_level="test_geography_level", + geometry_metadata_id="test_geom_id", ) assert ( source_data_release.id - == "15e6144c641c637247bde426fba653f209717799e41df6709a589bafbb4014c1" + == "9ec7e234d73664339e4c1f04bfa485dbb17e204dd72dc3ffbb9cab6870475597" ) source_data_release2 = SourceDataRelease( @@ -74,8 +71,7 @@ def test_source_data_release_hash(): url="https://example.com", data_publisher_id="test_publisher_id", description="This is a test data release", - geography_file="test_geography_file", - geography_level="test_geography_level", + geometry_metadata_id="test_geom_id", ) assert source_data_release.id != source_data_release2.id
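
The patches above leave an implicit contract between the new `metrics` asset and `MetricsIOManager.handle_output`: the asset returns a list of `(relative_parquet_path, [MetricMetadata, ...], DataFrame)` tuples, and the IO manager writes each DataFrame to its relative path under the partition prefix while aggregating every `MetricMetadata` into a single `metric_metadata.parquet`. The snippet below is only a minimal, self-contained sketch of a payload with that shape; it uses a plain stand-in class rather than the real `MetricMetadata` model so it runs on its own, and every name, path and value in it is invented for illustration rather than taken from the Belgian census assets.

from dataclasses import dataclass

import pandas as pd


@dataclass
class FakeMetricMetadata:
    """Stand-in for popgetter.metadata.MetricMetadata (illustrative only)."""

    human_readable_name: str
    parquet_column_name: str
    metric_parquet_path: str


# One entry per metrics parquet file: the IO manager writes the DataFrame to the
# relative path in the first element and collects all metadata objects from the
# second element into a single metric_metadata.parquet next to them.
metrics_payload: list[tuple[str, list[FakeMetricMetadata], pd.DataFrame]] = [
    (
        "4689_population.parquet",  # invented relative path
        [
            FakeMetricMetadata(
                "Total population", "pop_total", "4689_population.parquet"
            ),
            FakeMetricMetadata(
                "Population under 18", "pop_under_18", "4689_population.parquet"
            ),
        ],
        pd.DataFrame(
            {
                "GEO_ID": ["BE100", "BE200"],
                "pop_total": [100, 250],
                "pop_under_18": [22, 61],
            }
        ),
    ),
]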