diff --git a/CHANGES.rst b/CHANGES.rst index 52905982..3fe583ef 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -19,6 +19,8 @@ Version 3.X.X (2019-09-XX) defer the validation in case of inconsistencies to the final commit. Exception messages will be less verbose in these cases as before. - Add support for pyarrow 0.15.0 +- Fix an issue where an empty dataframe of a partition in a multi-table dataset + would raise a schema validation exception - Remove support for pyarrow < 0.13.0 diff --git a/kartothek/core/common_metadata.py b/kartothek/core/common_metadata.py index 11c1da6d..b9e16f4f 100644 --- a/kartothek/core/common_metadata.py +++ b/kartothek/core/common_metadata.py @@ -721,6 +721,8 @@ def validate_shared_columns(schemas, ignore_pandas=False): obj = (field, col) if col in seen: ref = seen[col] + if pa.types.is_null(ref[0].type) or pa.types.is_null(field.type): + continue if ref != obj: raise ValueError( 'Found incompatible entries for column "{}"\n{}\n{}'.format( diff --git a/tests/core/test_common_metadata.py b/tests/core/test_common_metadata.py index 8825e791..ac2520e0 100644 --- a/tests/core/test_common_metadata.py +++ b/tests/core/test_common_metadata.py @@ -251,6 +251,21 @@ def test_validate_shared_columns_same(df_all_types): ) +def test_validate_shared_columns_null_value(df_all_types): + schema1 = make_meta(df_all_types, origin="1") + schema2 = make_meta(df_all_types.drop(0), origin="2") + schema3 = make_meta(df_all_types, origin="3").remove_metadata() + validate_shared_columns([]) + validate_shared_columns([schema1]) + validate_shared_columns([schema1, schema2]) + with pytest.raises(ValueError): + validate_shared_columns([schema1, schema2, schema3]) + validate_shared_columns([schema1, schema2, schema3], ignore_pandas=True) + validate_shared_columns( + [schema1.remove_metadata(), schema2.remove_metadata(), schema3] + ) + + def test_validate_shared_columns_no_share(df_all_types): schema1 = make_meta(df_all_types.loc[:, df_all_types.columns[0:2]], origin="1") schema2 = make_meta(df_all_types.loc[:, df_all_types.columns[2:4]], origin="2")