Skip to content

Commit

Permalink
Merge pull request #164 from JDASoftwareGroup/bugfix/schema_validation_null_table
Browse files Browse the repository at this point in the history

Allow empty tables to be part of a partition
  • Loading branch information
fjetter authored Oct 19, 2019
2 parents 43765fa + f66c5bc commit f5e82a6
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ Version 3.X.X (2019-09-XX)
defer the validation in case of inconsistencies to the final commit. Exception
messages will be less verbose in these cases than before.
- Add support for pyarrow 0.15.0
- Fix an issue where an empty dataframe of a partition in a multi-table dataset
would raise a schema validation exception
- Remove support for pyarrow < 0.13.0


Expand Down
2 changes: 2 additions & 0 deletions kartothek/core/common_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,8 @@ def validate_shared_columns(schemas, ignore_pandas=False):
obj = (field, col)
if col in seen:
ref = seen[col]
if pa.types.is_null(ref[0].type) or pa.types.is_null(field.type):
continue
if ref != obj:
raise ValueError(
'Found incompatible entries for column "{}"\n{}\n{}'.format(
Expand Down
15 changes: 15 additions & 0 deletions tests/core/test_common_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,21 @@ def test_validate_shared_columns_same(df_all_types):
)


def test_validate_shared_columns_null_value(df_all_types):
    """Run ``validate_shared_columns`` against a schema from a reduced frame.

    Three schemas are derived from the ``df_all_types`` fixture: one from the
    frame as-is, one from ``df_all_types.drop(0)`` and one from the frame
    as-is with its metadata removed afterwards.
    """
    complete = make_meta(df_all_types, origin="1")
    reduced = make_meta(df_all_types.drop(0), origin="2")
    stripped = make_meta(df_all_types, origin="3").remove_metadata()

    # Zero, one and two metadata-carrying schemas must all pass validation.
    with_metadata = [complete, reduced]
    for count in range(len(with_metadata) + 1):
        validate_shared_columns(with_metadata[:count])

    # Adding the metadata-free schema is expected to be rejected ...
    with pytest.raises(ValueError):
        validate_shared_columns([complete, reduced, stripped])
    # ... unless the caller asks for pandas metadata to be ignored.
    validate_shared_columns([complete, reduced, stripped], ignore_pandas=True)

    # With metadata removed from every schema the default mode passes as well.
    without_metadata = [
        complete.remove_metadata(),
        reduced.remove_metadata(),
        stripped,
    ]
    validate_shared_columns(without_metadata)


def test_validate_shared_columns_no_share(df_all_types):
schema1 = make_meta(df_all_types.loc[:, df_all_types.columns[0:2]], origin="1")
schema2 = make_meta(df_all_types.loc[:, df_all_types.columns[2:4]], origin="2")
Expand Down

0 comments on commit f5e82a6

Please sign in to comment.