Updating SparkScan to only read Apache DataSketches #11035

Merged · 10 commits · Oct 16, 2024
@@ -26,4 +26,6 @@ private StandardBlobTypes() {}
    * href="https://datasketches.apache.org/">Apache DataSketches</a> library
    */
   public static final String APACHE_DATASKETCHES_THETA_V1 = "apache-datasketches-theta-v1";
+
+  public static final String PRESTO_SUM_DATA_SIZE_BYTES_V1 = "presto-sum-data-size-bytes-v1";
Contributor:
We don't need to store the exact parameter used by Presto as part of Iceberg. We can use it in the test, or even use a dummy identifier to simulate the existence of additional unsupported metadata.

Separately, we should reach agreement on the right way to store the data size in the Puffin file across engines.
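For illustration only, a minimal sketch of the dummy-identifier idea: the test just needs a blob type string that SparkScan does not recognize, so a made-up identifier (hypothetical, not a real Iceberg blob type) would exercise the same "unsupported blob" path without adding a Presto-specific constant. snapshotId and the field id are assumed to come from the surrounding test.

    // Hypothetical blob type string; stands in for PRESTO_SUM_DATA_SIZE_BYTES_V1 in the test.
    GenericBlobMetadata unsupportedBlob =
        new GenericBlobMetadata(
            "some-unsupported-blob-type-v1",
            snapshotId,
            1,
            ImmutableList.of(1),
            ImmutableMap.of("data_size", "4"));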

Contributor:
I think we should remove this since Iceberg doesn't support this yet.

Contributor:
+1, it shouldn't be here. If it is a generic blob type we want to support across engines, we should discuss this on the dev list and vote.

}
@@ -199,28 +199,24 @@ protected Statistics estimateStatistics(Snapshot snapshot) {
       List<BlobMetadata> metadataList = (files.get(0)).blobMetadata();
 
       for (BlobMetadata blobMetadata : metadataList) {
-        int id = blobMetadata.fields().get(0);
-        String colName = table.schema().findColumnName(id);
-        NamedReference ref = FieldReference.column(colName);
-
-        Long ndv = null;
         if (blobMetadata
             .type()
             .equals(org.apache.iceberg.puffin.StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1)) {
+          int id = blobMetadata.fields().get(0);
+          String colName = table.schema().findColumnName(id);
+          NamedReference ref = FieldReference.column(colName);
+          Long ndv = null;
           String ndvStr = blobMetadata.properties().get(NDV_KEY);
           if (!Strings.isNullOrEmpty(ndvStr)) {
             ndv = Long.parseLong(ndvStr);
           } else {
             LOG.debug("ndv is not set in BlobMetadata for column {}", colName);
           }
-        } else {
-          LOG.debug("DataSketch blob is not available for column {}", colName);
-        }
-
-        ColumnStatistics colStats =
Contributor:
Technically, we should group the metadata by field first, then extract all of the relevant metadata and create the SparkColumnStatistics instance for each column.
This is not specific to this PR, since this was the behaviour before, but we might want to address it as well.
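A rough sketch of that grouping, for illustration only: it reuses the names already in estimateStatistics (metadataList, table, colStatsMap, NDV_KEY) and assumes the Maps/Lists helpers available elsewhere in the class, so it is a method-body fragment rather than a drop-in change.

    // Collect all blobs per field id first, then build one statistics object per column.
    Map<Integer, List<BlobMetadata>> blobsByField = Maps.newHashMap();
    for (BlobMetadata blobMetadata : metadataList) {
      blobsByField
          .computeIfAbsent(blobMetadata.fields().get(0), k -> Lists.newArrayList())
          .add(blobMetadata);
    }

    for (Map.Entry<Integer, List<BlobMetadata>> entry : blobsByField.entrySet()) {
      String colName = table.schema().findColumnName(entry.getKey());
      NamedReference ref = FieldReference.column(colName);

      Long ndv = null;
      for (BlobMetadata blobMetadata : entry.getValue()) {
        if (blobMetadata
            .type()
            .equals(org.apache.iceberg.puffin.StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1)) {
          String ndvStr = blobMetadata.properties().get(NDV_KEY);
          if (!Strings.isNullOrEmpty(ndvStr)) {
            ndv = Long.parseLong(ndvStr);
          }
        }
        // Other blob types could populate other SparkColumnStatistics fields here.
      }

      colStatsMap.put(ref, new SparkColumnStatistics(ndv, null, null, null, null, null, null));
    }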

Contributor:
+1

-            new SparkColumnStatistics(ndv, null, null, null, null, null, null);
+          ColumnStatistics colStats =
+              new SparkColumnStatistics(ndv, null, null, null, null, null, null);
 
-        colStatsMap.put(ref, colStats);
+          colStatsMap.put(ref, colStats);
+        }
       }
     }
   }
@@ -19,6 +19,7 @@
package org.apache.iceberg.spark.source;

import static org.apache.iceberg.puffin.StandardBlobTypes.APACHE_DATASKETCHES_THETA_V1;
import static org.apache.iceberg.puffin.StandardBlobTypes.PRESTO_SUM_DATA_SIZE_BYTES_V1;
import static org.apache.iceberg.spark.SystemFunctionPushDownHelper.createPartitionedTable;
import static org.apache.iceberg.spark.SystemFunctionPushDownHelper.createUnpartitionedTable;
import static org.apache.iceberg.spark.SystemFunctionPushDownHelper.timestampStrToDayOrdinal;
@@ -178,6 +179,59 @@ public void testTableWithoutColStats() throws NoSuchTableException {
reportColStatsEnabled, () -> checkColStatisticsReported(scan, 4L, Maps.newHashMap()));
}

@TestTemplate
public void testTableWithoutApacheDatasketchColStat() throws NoSuchTableException {
sql("CREATE TABLE %s (id int, data string) USING iceberg", tableName);

List<SimpleRecord> records =
Lists.newArrayList(
new SimpleRecord(1, "a"),
new SimpleRecord(2, "b"),
new SimpleRecord(3, "a"),
new SimpleRecord(4, "b"));
spark
.createDataset(records, Encoders.bean(SimpleRecord.class))
.coalesce(1)
.writeTo(tableName)
.append();

Table table = validationCatalog.loadTable(tableIdent);
long snapshotId = table.currentSnapshot().snapshotId();

SparkScanBuilder scanBuilder =
new SparkScanBuilder(spark, table, CaseInsensitiveStringMap.empty());
SparkScan scan = (SparkScan) scanBuilder.build();

Map<String, String> reportColStatsDisabled =
ImmutableMap.of(
SQLConf.CBO_ENABLED().key(), "true", SparkSQLProperties.REPORT_COLUMN_STATS, "false");

Map<String, String> reportColStatsEnabled =
ImmutableMap.of(SQLConf.CBO_ENABLED().key(), "true");

GenericStatisticsFile statisticsFile =
new GenericStatisticsFile(
snapshotId,
"/test/statistics/file.puffin",
100,
42,
ImmutableList.of(
new GenericBlobMetadata(
PRESTO_SUM_DATA_SIZE_BYTES_V1,
snapshotId,
1,
ImmutableList.of(1),
ImmutableMap.of("data_size", "4"))));

table.updateStatistics().setStatistics(snapshotId, statisticsFile).commit();

checkColStatisticsNotReported(scan, 4L);
withSQLConf(reportColStatsDisabled, () -> checkColStatisticsNotReported(scan, 4L));
// The expected column NDVs are null
withSQLConf(
reportColStatsEnabled, () -> checkColStatisticsReported(scan, 4L, Maps.newHashMap()));
}

@TestTemplate
public void testTableWithOneColStats() throws NoSuchTableException {
sql("CREATE TABLE %s (id int, data string) USING iceberg", tableName);
@@ -232,6 +286,67 @@ public void testTableWithOneColStats() throws NoSuchTableException {
withSQLConf(reportColStatsEnabled, () -> checkColStatisticsReported(scan, 4L, expectedOneNDV));
}

@TestTemplate
public void testTableWithOneApacheDatasketchColStatAndOneDifferentColStat()
throws NoSuchTableException {
sql("CREATE TABLE %s (id int, data string) USING iceberg", tableName);

List<SimpleRecord> records =
Lists.newArrayList(
new SimpleRecord(1, "a"),
new SimpleRecord(2, "b"),
new SimpleRecord(3, "a"),
new SimpleRecord(4, "b"));
spark
.createDataset(records, Encoders.bean(SimpleRecord.class))
.coalesce(1)
.writeTo(tableName)
.append();

Table table = validationCatalog.loadTable(tableIdent);
long snapshotId = table.currentSnapshot().snapshotId();

SparkScanBuilder scanBuilder =
new SparkScanBuilder(spark, table, CaseInsensitiveStringMap.empty());
SparkScan scan = (SparkScan) scanBuilder.build();

Map<String, String> reportColStatsDisabled =
ImmutableMap.of(
SQLConf.CBO_ENABLED().key(), "true", SparkSQLProperties.REPORT_COLUMN_STATS, "false");

Map<String, String> reportColStatsEnabled =
ImmutableMap.of(SQLConf.CBO_ENABLED().key(), "true");

GenericStatisticsFile statisticsFile =
new GenericStatisticsFile(
snapshotId,
"/test/statistics/file.puffin",
100,
42,
ImmutableList.of(
new GenericBlobMetadata(
APACHE_DATASKETCHES_THETA_V1,
snapshotId,
1,
ImmutableList.of(1),
ImmutableMap.of("ndv", "4")),
new GenericBlobMetadata(
PRESTO_SUM_DATA_SIZE_BYTES_V1,
snapshotId,
1,
ImmutableList.of(1),
ImmutableMap.of("data_size", "2"))));

table.updateStatistics().setStatistics(snapshotId, statisticsFile).commit();

checkColStatisticsNotReported(scan, 4L);
withSQLConf(reportColStatsDisabled, () -> checkColStatisticsNotReported(scan, 4L));

Map<String, Long> expectedOneNDV = Maps.newHashMap();
expectedOneNDV.put("id", 4L);
withSQLConf(reportColStatsEnabled, () -> checkColStatisticsReported(scan, 4L, expectedOneNDV));
}

@TestTemplate
public void testTableWithTwoColStats() throws NoSuchTableException {
sql("CREATE TABLE %s (id int, data string) USING iceberg", tableName);