diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp index 8189a92b812f2..bb87d7319c2c2 100644 --- a/velox/connectors/hive/HiveConnectorUtil.cpp +++ b/velox/connectors/hive/HiveConnectorUtil.cpp @@ -582,21 +582,27 @@ bool testFilters( for (const auto& child : scanSpec->children()) { if (child->filter()) { const auto& name = child->fieldName(); - if (!rowType->containsChild(name)) { - // If missing column is partition key. - auto iter = partitionKey.find(name); + auto iter = partitionKey.find(name); + // The partition key columns are written in the data file for + // Iceberg tables, so we need to test both cases + if (!rowType->containsChild(name) || iter != partitionKey.end()) { if (iter != partitionKey.end() && iter->second.has_value()) { auto handlesIter = partitionKeysHandle.find(name); VELOX_CHECK(handlesIter != partitionKeysHandle.end()); + // This is a non-null partition key return applyPartitionFilter( handlesIter->second->dataType()->kind(), iter->second.value(), child->filter()); } - // Column is missing. Most likely due to schema evolution. + // Column is missing, most likely due to schema evolution. Or it's a + // partition key but the partition value is NULL. if (child->filter()->isDeterministic() && !child->filter()->testNull()) { + VLOG(1) << "Skipping " << filePath + << " because the filter testNull() failed for column " + << child->fieldName(); return false; } } else { diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index 8182147ce7622..32938c90d366f 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -72,6 +72,16 @@ class TableScanTest : public virtual HiveConnectorTestBase { } std::vector<RowVectorPtr> makeVectors( + int32_t count, + int32_t rowsPerVector, + const RowTypePtr& rowType = nullptr, + std::function<bool(vector_size_t)> isNullAt = nullptr) { + auto inputs = rowType ? 
rowType : rowType_; + return HiveConnectorTestBase::makeVectors( + inputs, count, rowsPerVector, isNullAt); + } + + std::vector<RowVectorPtr> makeNullableVectors( int32_t count, int32_t rowsPerVector, const RowTypePtr& rowType = nullptr) { @@ -174,7 +184,8 @@ class TableScanTest : public virtual HiveConnectorTestBase { void testPartitionedTableImpl( const std::string& filePath, const TypePtr& partitionType, - const std::optional<std::string>& partitionValue) { + const std::optional<std::string>& partitionValue, + bool isPartitionColumnInFile = false) { auto split = HiveConnectorSplitBuilder(filePath) .partitionKey("pkey", partitionValue) .build(); @@ -194,8 +205,11 @@ class TableScanTest : public virtual HiveConnectorTestBase { std::string partitionValueStr = partitionValue.has_value() ? "'" + *partitionValue + "'" : "null"; - assertQuery( - op, split, fmt::format("SELECT {}, * FROM tmp", partitionValueStr)); + + std::string duckdbSql = isPartitionColumnInFile + ? "SELECT * FROM tmp" + : fmt::format("SELECT {}, * FROM tmp", partitionValueStr); + assertQuery(op, split, duckdbSql); outputType = ROW({"c0", "pkey", "c1"}, {BIGINT(), partitionType, DOUBLE()}); op = PlanBuilder() @@ -233,12 +247,60 @@ class TableScanTest : public virtual HiveConnectorTestBase { op, split, fmt::format("SELECT {} FROM tmp", partitionValueStr)); } - void testPartitionedTable( + void testPartitionedTableWithFilterOnPartitionKey( + const RowTypePtr& fileSchema, const std::string& filePath, const TypePtr& partitionType, const std::optional<std::string>& partitionValue) { - testPartitionedTableImpl(filePath, partitionType, partitionValue); - testPartitionedTableImpl(filePath, partitionType, std::nullopt); + // The filter on the partition key cannot eliminate the partition, therefore + // the split should NOT be skipped and all rows in it should be selected. 
+ auto split = HiveConnectorSplitBuilder(filePath) + .partitionKey("pkey", partitionValue) + .build(); + auto outputType = ROW({"c0", "c1"}, {BIGINT(), DOUBLE()}); + ColumnHandleMap assignments = { + {"pkey", partitionKey("pkey", partitionType)}, + {"c0", regularColumn("c0", BIGINT())}, + {"c1", regularColumn("c1", DOUBLE())}}; + std::string filter = partitionValue.has_value() + ? "pkey = " + partitionValue.value() + : "pkey IS NULL"; + auto op = PlanBuilder() + .startTableScan() + .dataColumns(fileSchema) + .outputType(outputType) + .assignments(assignments) + .subfieldFilter(filter) + .endTableScan() + .planNode(); + + assertQuery(op, split, fmt::format("SELECT c0, c1 FROM tmp")); + + // The split should be skipped because the partition key does not pass + // the filter + filter = partitionValue.has_value() ? "pkey <> " + partitionValue.value() : "pkey IS NOT NULL"; + op = PlanBuilder() + .startTableScan() + .dataColumns(fileSchema) + .outputType(outputType) + .assignments(assignments) + .subfieldFilter(filter) + .endTableScan() + .planNode(); + assertQuery(op, split, fmt::format("SELECT c0, c1 FROM tmp WHERE 1 = 0")); + } + + void testPartitionedTable( + const std::string& filePath, + const TypePtr& partitionType, + const std::optional<std::string>& partitionValue, + const RowTypePtr& fileSchema = nullptr, + bool isPartitionColumnInFile = false) { + testPartitionedTableImpl( + filePath, partitionType, partitionValue, isPartitionColumnInFile); + testPartitionedTableImpl( + filePath, partitionType, std::nullopt, isPartitionColumnInFile); } RowTypePtr rowType_{ @@ -1702,6 +1764,27 @@ TEST_F(TableScanTest, partitionedTableDateKey) { testPartitionedTable(filePath->getPath(), DATE(), "2023-10-27"); } +// Partition key was written as a real column in the data file, and the value is +// NULL. 
The column does not have column statistics +TEST_F(TableScanTest, partitionedTablePartitionKeyInFile) { + auto fileSchema = ROW({"c0", "c1", "pkey"}, {BIGINT(), DOUBLE(), BIGINT()}); + auto filePath = TempFilePath::create(); + // Create values of all nulls. + auto vectors = + makeVectors(10, 1'000, fileSchema, [](vector_size_t i) { return true; }); + // auto vectors = makeVectors(10, 1'000, rowType); + writeToFile( + filePath->getPath(), + vectors, + std::make_shared<dwrf::Config>(), + false); + createDuckDbTable(vectors); + testPartitionedTable( + filePath->getPath(), BIGINT(), std::nullopt, fileSchema, true); + testPartitionedTableWithFilterOnPartitionKey( + fileSchema, filePath->getPath(), BIGINT(), std::nullopt); +} + std::vector<StringView> toStringViews(const std::vector<std::string>& values) { std::vector<StringView> views; views.reserve(values.size()); diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.cpp b/velox/exec/tests/utils/HiveConnectorTestBase.cpp index a30c14488410e..68693c389f308 100644 --- a/velox/exec/tests/utils/HiveConnectorTestBase.cpp +++ b/velox/exec/tests/utils/HiveConnectorTestBase.cpp @@ -91,11 +91,13 @@ void HiveConnectorTestBase::writeToFile( std::vector<RowVectorPtr> HiveConnectorTestBase::makeVectors( const RowTypePtr& rowType, int32_t numVectors, - int32_t rowsPerVector) { + int32_t rowsPerVector, + std::function<bool(vector_size_t)> isNullAt) { std::vector<RowVectorPtr> vectors; for (int32_t i = 0; i < numVectors; ++i) { auto vector = std::dynamic_pointer_cast<RowVector>( - velox::test::BatchMaker::createBatch(rowType, rowsPerVector, *pool_)); + velox::test::BatchMaker::createBatch( + rowType, rowsPerVector, *pool_, isNullAt)); vectors.push_back(vector); } return vectors; } diff --git a/velox/exec/tests/utils/HiveConnectorTestBase.h b/velox/exec/tests/utils/HiveConnectorTestBase.h index 044caa8b69cf0..cf76bc118bed9 100644 --- a/velox/exec/tests/utils/HiveConnectorTestBase.h +++ b/velox/exec/tests/utils/HiveConnectorTestBase.h @@ -53,7 +53,8 @@ class HiveConnectorTestBase : public OperatorTestBase { std::vector<RowVectorPtr> makeVectors( const 
RowTypePtr& rowType, int32_t numVectors, - int32_t rowsPerVector); + int32_t rowsPerVector, + std::function<bool(vector_size_t)> isNullAt = nullptr); using OperatorTestBase::assertQuery;