
API: Support removeUnusedSpecs in ExpireSnapshots #10755

Open · wants to merge 12 commits into base: main
12 changes: 12 additions & 0 deletions api/src/main/java/org/apache/iceberg/ExpireSnapshots.java
@@ -118,4 +118,16 @@ public interface ExpireSnapshots extends PendingUpdate<List<Snapshot>> {
* @return this for method chaining
*/
ExpireSnapshots cleanExpiredFiles(boolean clean);

/**
* Allows removal of unreachable partition specs as part of the expiration operation.
*
* @param removeUnusedSpecs setting this to true will remove partition specs that are no longer
* reachable by any snapshot
* @return this for method chaining
*/
default ExpireSnapshots removeUnusedSpecs(boolean removeUnusedSpecs) {
Contributor:
Should we make this API a more generic removeUnusedTableMetadata? This goes back to the previous discussion: removing schemas and removing partition specs require the same level of work from the implementation, so imo there's not much value in separating them and forcing a user to chain multiple methods. Generally, if users want to remove unused specs, they probably also want to remove unused schemas.

Contributor:

Also, while I think we generally try and avoid boolean arguments in APIs, this may be one case where it makes sense. Down the line, if we want to make this behavior the default and have a path for users to disable cleanup of specs/schemas, they can.


Member:

I'm not sure we want to even allow the option to not do this. Is there a benefit to leaving a spec or schema in place if it is no longer in use?

That said, I would be fine with just having a "cleanMetadata(boolean cleanMetadata: True)"

Contributor @amogh-jahagirdar (Nov 5, 2024):

@RussellSpitzer I was thinking about not exposing the API at all and doing the cleanup by default in the implementation since it's true that there's no real benefit to keeping the spec/schema in place.

The rationale for having the API is more REST + compatibility related.

A while ago, we added the ability to send remove-spec updates to the server. If we change the snapshot expiration implementation to always remove metadata, servers may not be able to handle the new message yet, since it was only recently added as a possible update type; a service could then unnecessarily fail the entire expiration commit because it considers spec removal unsupported, even though it could have removed the snapshots.

In the current model, a client would opt in knowing that the service supports the spec removal.
There may be a different way to handle this, though, so that we can keep it all implicit in the procedure.

Contributor Author:

> That said, I would be fine with just having a "cleanMetadata(boolean cleanMetadata: True)"

I think this is a good candidate. Or, to be more specific and consistent with cleanExpiredFiles, we could call it cleanExpiredMeta(boolean clean). WDYT? @RussellSpitzer @amogh-jahagirdar

> The rationale for having the API is more REST + compatibility related.

This is well thought out. I'm in favor of exposing this as an API. As for the boolean parameter, I think it is consistent with cleanExpiredFiles and makes the method easier to call in a fluent way when whether to expire files and metadata is determined by an external caller.

Contributor Author:

Updated. PTAL again @RussellSpitzer @amogh-jahagirdar

Contributor:

Thanks @advancedxy! I'm in favor of the client-side API option for now, just had a comment on its name.

I think there's an important discussion to be had on how REST servers can indicate the specific supported update types to clients; this could be done either through the config endpoint or through the existing endpoint discovery mechanism, where each update type is attached as a query fragment to the endpoint and clients just use that.

My opinion is that it's not worth blocking this on a REST protocol discussion, since we can always come back later, deprecate this client-side option, and have the cleanup run as part of the default procedure implementation.

throw new UnsupportedOperationException(
this.getClass().getName() + " doesn't implement removeUnusedSpecs");
}
}
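For orientation, here is a minimal usage sketch of the proposed API; the table handle and the seven-day retention window are hypothetical, not part of this diff:

    // Hypothetical usage: expire snapshots older than 7 days and opt in to
    // removing partition specs no longer reachable from any retained snapshot.
    long sevenDaysAgoMillis = System.currentTimeMillis() - 7 * 24 * 60 * 60 * 1000L;
    table
        .expireSnapshots()
        .expireOlderThan(sevenDaysAgoMillis)
        .removeUnusedSpecs(true)
        .commit();

The boolean argument keeps the call fluent, matching the existing cleanExpiredFiles(boolean) style discussed above.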
@@ -20,6 +20,7 @@

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
@@ -256,7 +257,8 @@ public void cleanFiles(TableMetadata beforeExpiration, TableMetadata afterExpiration
});

Set<String> filesToDelete =
-findFilesToDelete(manifestsToScan, manifestsToRevert, validIds, afterExpiration);
+findFilesToDelete(
+    manifestsToScan, manifestsToRevert, validIds, beforeExpiration.specsById());

deleteFiles(filesToDelete, "data");
deleteFiles(manifestsToDelete, "manifest");
@@ -273,7 +275,7 @@ private Set<String> findFilesToDelete(
Set<ManifestFile> manifestsToScan,
Set<ManifestFile> manifestsToRevert,
Set<Long> validIds,
-TableMetadata current) {
+Map<Integer, PartitionSpec> specsById) {
Contributor:

+1 for fixing this. In general we should avoid passing huge metadata objects around to helper methods that don't need everything!

Set<String> filesToDelete = ConcurrentHashMap.newKeySet();
Tasks.foreach(manifestsToScan)
.retry(3)
@@ -285,8 +287,7 @@ private Set<String> findFilesToDelete(
.run(
manifest -> {
// the manifest has deletes, scan it to find files to delete
-try (ManifestReader<?> reader =
-    ManifestFiles.open(manifest, fileIO, current.specsById())) {
+try (ManifestReader<?> reader = ManifestFiles.open(manifest, fileIO, specsById)) {
for (ManifestEntry<?> entry : reader.entries()) {
// if the snapshot ID of the DELETE entry is no longer valid, the data can be
// deleted
@@ -311,8 +312,7 @@ private Set<String> findFilesToDelete(
.run(
manifest -> {
// the manifest has deletes, scan it to find files to delete
-try (ManifestReader<?> reader =
-    ManifestFiles.open(manifest, fileIO, current.specsById())) {
+try (ManifestReader<?> reader = ManifestFiles.open(manifest, fileIO, specsById)) {
for (ManifestEntry<?> entry : reader.entries()) {
// delete any ADDED file from manifests that were reverted
if (entry.status() == ManifestEntry.Status.ADDED) {
17 changes: 17 additions & 0 deletions core/src/main/java/org/apache/iceberg/MetadataUpdate.java
@@ -165,6 +165,23 @@ public void applyTo(TableMetadata.Builder metadataBuilder) {
}
}

class RemovePartitionSpecs implements MetadataUpdate {
private final Set<Integer> specIds;

public RemovePartitionSpecs(Set<Integer> specIds) {
this.specIds = specIds;
}

public Set<Integer> specIds() {
return specIds;
}

@Override
public void applyTo(TableMetadata.Builder metadataBuilder) {
metadataBuilder.removeUnusedSpecs(specIds);
}
}
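As a hedged illustration (not part of this diff): a catalog applying the new update type to a metadata builder, assuming an existing base TableMetadata and an unused spec id 2. TableMetadata.buildFrom and MetadataUpdate.applyTo are the existing entry points:

    MetadataUpdate update = new MetadataUpdate.RemovePartitionSpecs(Sets.newHashSet(2));
    TableMetadata.Builder builder = TableMetadata.buildFrom(base);
    update.applyTo(builder); // delegates to builder.removeUnusedSpecs(specIds)
    TableMetadata updated = builder.build();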

class AddSortOrder implements MetadataUpdate {
private final UnboundSortOrder sortOrder;

20 changes: 20 additions & 0 deletions core/src/main/java/org/apache/iceberg/MetadataUpdateParser.java
@@ -59,6 +59,7 @@ private MetadataUpdateParser() {}
static final String SET_CURRENT_VIEW_VERSION = "set-current-view-version";
static final String SET_PARTITION_STATISTICS = "set-partition-statistics";
static final String REMOVE_PARTITION_STATISTICS = "remove-partition-statistics";
static final String REMOVE_PARTITION_SPECS = "remove-partition-specs";

// AssignUUID
private static final String UUID = "uuid";
@@ -126,6 +127,9 @@ private MetadataUpdateParser() {}
// SetCurrentViewVersion
private static final String VIEW_VERSION_ID = "view-version-id";

// RemovePartitionSpecs
private static final String PARTITION_SPEC_IDS = "partition-spec-ids";
Contributor:

please add tests for this to TestMetadataUpdateParser

Contributor:

also this is called spec-ids in the OpenAPI definition:


private static final Map<Class<? extends MetadataUpdate>, String> ACTIONS =
ImmutableMap.<Class<? extends MetadataUpdate>, String>builder()
.put(MetadataUpdate.AssignUUID.class, ASSIGN_UUID)
@@ -149,6 +153,7 @@ private MetadataUpdateParser() {}
.put(MetadataUpdate.SetLocation.class, SET_LOCATION)
.put(MetadataUpdate.AddViewVersion.class, ADD_VIEW_VERSION)
.put(MetadataUpdate.SetCurrentViewVersion.class, SET_CURRENT_VIEW_VERSION)
.put(MetadataUpdate.RemovePartitionSpecs.class, REMOVE_PARTITION_SPECS)
.buildOrThrow();

public static String toJson(MetadataUpdate metadataUpdate) {
@@ -241,6 +246,9 @@ public static void toJson(MetadataUpdate metadataUpdate, JsonGenerator generator
writeSetCurrentViewVersionId(
(MetadataUpdate.SetCurrentViewVersion) metadataUpdate, generator);
break;
case REMOVE_PARTITION_SPECS:
writeRemovePartitionSpecs((MetadataUpdate.RemovePartitionSpecs) metadataUpdate, generator);
break;
default:
throw new IllegalArgumentException(
String.format(
@@ -312,6 +320,8 @@ public static MetadataUpdate fromJson(JsonNode jsonNode) {
return readAddViewVersion(jsonNode);
case SET_CURRENT_VIEW_VERSION:
return readCurrentViewVersionId(jsonNode);
case REMOVE_PARTITION_SPECS:
return readRemoveUnusedSpecs(jsonNode);
default:
throw new UnsupportedOperationException(
String.format("Cannot convert metadata update action to json: %s", action));
@@ -447,6 +457,11 @@ private static void writeSetCurrentViewVersionId(
gen.writeNumberField(VIEW_VERSION_ID, metadataUpdate.versionId());
}

private static void writeRemovePartitionSpecs(
MetadataUpdate.RemovePartitionSpecs metadataUpdate, JsonGenerator gen) throws IOException {
JsonUtil.writeIntegerArray(PARTITION_SPEC_IDS, metadataUpdate.specIds(), gen);
}

private static MetadataUpdate readAssignUUID(JsonNode node) {
String uuid = JsonUtil.getString(UUID, node);
return new MetadataUpdate.AssignUUID(uuid);
@@ -596,4 +611,9 @@ private static MetadataUpdate readAddViewVersion(JsonNode node) {
private static MetadataUpdate readCurrentViewVersionId(JsonNode node) {
return new MetadataUpdate.SetCurrentViewVersion(JsonUtil.getInt(VIEW_VERSION_ID, node));
}

private static MetadataUpdate readRemoveUnusedSpecs(JsonNode node) {
return new MetadataUpdate.RemovePartitionSpecs(
JsonUtil.getIntegerSet(PARTITION_SPEC_IDS, node));
}
}
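A hedged round-trip sketch through this parser; the serialized field name follows the PARTITION_SPEC_IDS constant above, and note the unresolved review question about spec-ids vs partition-spec-ids:

    MetadataUpdate.RemovePartitionSpecs update =
        new MetadataUpdate.RemovePartitionSpecs(Sets.newHashSet(1, 2));
    String json = MetadataUpdateParser.toJson(update);
    // With the constants above, this serializes roughly as:
    // {"action":"remove-partition-specs","partition-spec-ids":[1,2]}
    MetadataUpdate roundTripped = MetadataUpdateParser.fromJson(json);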
26 changes: 26 additions & 0 deletions core/src/main/java/org/apache/iceberg/RemoveSnapshots.java
@@ -41,6 +41,7 @@
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.apache.iceberg.exceptions.CommitFailedException;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
@@ -85,6 +86,7 @@ public void accept(String file) {
private ExecutorService planExecutorService = ThreadPools.getWorkerPool();
private Boolean incrementalCleanup;
private boolean specifiedSnapshotId = false;
private boolean removeUnusedSpecs = false;

RemoveSnapshots(TableOperations ops) {
this.ops = ops;
@@ -159,6 +161,12 @@ public ExpireSnapshots planWith(ExecutorService executorService) {
return this;
}

@Override
public ExpireSnapshots removeUnusedSpecs(boolean remove) {
this.removeUnusedSpecs = remove;
return this;
}

@Override
public List<Snapshot> apply() {
TableMetadata updated = internalApply();
@@ -209,6 +217,24 @@ private TableMetadata internalApply() {
.forEach(idsToRemove::add);
updatedMetaBuilder.removeSnapshots(idsToRemove);

if (removeUnusedSpecs) {
Set<Integer> reachableSpecs = Sets.newConcurrentHashSet();
reachableSpecs.add(base.defaultSpecId());
Tasks.foreach(idsToRetain)
.executeWith(planExecutorService)
.run(
snapshot ->
base.snapshot(snapshot).allManifests(ops.io()).stream()
.map(ManifestFile::partitionSpecId)
.forEach(reachableSpecs::add));
Set<Integer> specsToRemove =
base.specs().stream()
.map(PartitionSpec::specId)
.filter(specId -> !reachableSpecs.contains(specId))
.collect(Collectors.toSet());
updatedMetaBuilder.removeUnusedSpecs(specsToRemove);
}

return updatedMetaBuilder.build();
}

Contributor:
Nit: newline after a loop.

Contributor Author:
Addressed.

18 changes: 18 additions & 0 deletions core/src/main/java/org/apache/iceberg/TableMetadata.java
@@ -1108,6 +1108,24 @@ public Builder setDefaultPartitionSpec(int specId) {
return this;
}

Builder removeUnusedSpecs(Iterable<Integer> specIds) {
Set<Integer> specIdsToRemove = Sets.newHashSet();
for (Integer specId : specIds) {
Preconditions.checkArgument(
specId != defaultSpecId, "Cannot remove default partition spec");
PartitionSpec toBeRemoved = specsById.remove(specId);
if (toBeRemoved != null) {
specIdsToRemove.add(specId);
}
}
this.specs =
specs.stream()
.filter(s -> !specIdsToRemove.contains(s.specId()))
.collect(Collectors.toList());
changes.add(new MetadataUpdate.RemovePartitionSpecs(specIdsToRemove));
return this;
}

Member:
Should we be validating that all the specs we are trying to remove exist? I think this may be fine, just thinking about commit conflicts.

Contributor Author:
> Should we be validating that all the specs we are trying to remove exist?

I think so, and L1129-L1133 indeed ensure that the specs to be removed exist in the current table metadata.

> I think this may be fine, just thinking about commit conflicts

If you are talking about concurrent RemoveUnusedSpecs calls, I think it's fine and reasonable for only one attempt to succeed at a time. For other concurrent commits, there's no conflict and retrying should do the trick.

Contributor:
In general I'm in favor of succeeding in cases like this. There's no need to fail something if it has an accidental retry after the first attempt succeeded.

Contributor Author:
> There's no need to fail something if it has an accidental retry after the first attempt succeeded.

This is a very good point, let me reconsider this part.

Contributor Author:
Addressed in 190dde6; removing an already removed unused spec should not cause a failure.

Contributor:
In that case, is it really necessary to validate that a spec was removed and filter the list? If it is a no-op to remove an unknown spec ID, then we can simplify the first half of this method:

    Preconditions.checkArgument(!specIds.contains(defaultSpecId), "Cannot remove the default partition spec");

Contributor Author:
Updated, let me know if this is the desired change.
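For reference, a minimal sketch of the simplified shape the last suggestion points toward; this is illustrative only and assumes the surrounding Builder fields (specs, specsById, defaultSpecId, changes) from this file:

    Builder removeUnusedSpecs(Iterable<Integer> specIds) {
      Set<Integer> specIdsToRemove = Sets.newHashSet(specIds);
      Preconditions.checkArgument(
          !specIdsToRemove.contains(defaultSpecId), "Cannot remove the default partition spec");
      // Removing an unknown spec id is a no-op, so an accidental retry of an
      // already-committed removal succeeds instead of failing the commit.
      this.specs =
          specs.stream()
              .filter(s -> !specIdsToRemove.contains(s.specId()))
              .collect(Collectors.toList());
      specIdsToRemove.forEach(specsById::remove);
      changes.add(new MetadataUpdate.RemovePartitionSpecs(specIdsToRemove));
      return this;
    }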

public Builder addPartitionSpec(UnboundPartitionSpec spec) {
addPartitionSpecInternal(spec.bind(schemasById.get(currentSchemaId)));
return this;
22 changes: 22 additions & 0 deletions core/src/main/java/org/apache/iceberg/UpdateRequirements.java
@@ -105,6 +105,8 @@ private Builder update(MetadataUpdate update) {
update((MetadataUpdate.SetDefaultPartitionSpec) update);
} else if (update instanceof MetadataUpdate.SetDefaultSortOrder) {
update((MetadataUpdate.SetDefaultSortOrder) update);
} else if (update instanceof MetadataUpdate.RemovePartitionSpecs) {
update((MetadataUpdate.RemovePartitionSpecs) update);
}

return this;
@@ -173,6 +175,26 @@ private void update(MetadataUpdate.SetDefaultSortOrder unused) {
}
}

private void update(MetadataUpdate.RemovePartitionSpecs unused) {

Contributor:
please add a test to TestUpdateRequirements

// require that the default partition spec has not changed
if (!setSpecId) {
if (base != null && !isReplace) {
require(new UpdateRequirement.AssertDefaultSpecID(base.defaultSpecId()));
}
this.setSpecId = true;
}
// require that no branches have changed, so that old specs won't be written.
if (base != null && !isReplace) {
base.refs()
.forEach(
(name, ref) -> {
if (ref.isBranch() && !name.equals(SnapshotRef.MAIN_BRANCH)) {
require(new UpdateRequirement.AssertRefSnapshotID(name, ref.snapshotId()));
}
});
}
}

Contributor:
we should probably also have a test in TestUpdateRequirements that actually changes the branch and eventually fails

Contributor:
in addition to this, CatalogTests currently only tests the happy path, but not the exceptions thrown by either AssertDefaultSpecID or AssertRefSnapshotID. We need to test both cases in CatalogTests.

Contributor Author:
I updated TestUpdateRequirements. However, it's hard to test the failure cases in CatalogTests: before committing, internalApply also refreshes the base table, so we cannot easily create a concurrent update to the base table, which would have to happen after the refresh and before the commit. Do you have any suggestions?
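A hedged example of what this emits; the table state here (default spec id 0, one non-main branch "audit" at snapshot 42) is hypothetical, and forUpdateTable is the existing entry point:

    List<UpdateRequirement> requirements =
        UpdateRequirements.forUpdateTable(
            base, ImmutableList.of(new MetadataUpdate.RemovePartitionSpecs(Sets.newHashSet(1))));
    // Per the code above, the result should include AssertDefaultSpecID(0) and
    // AssertRefSnapshotID("audit", 42); the commit is rejected if either the
    // default spec or the branch tip changed concurrently.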

private List<UpdateRequirement> build() {
return requirements.build();
}
85 changes: 85 additions & 0 deletions core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java
@@ -37,6 +37,7 @@
import java.util.stream.Collectors;
import org.apache.iceberg.ManifestEntry.Status;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.PositionOutputStream;
import org.apache.iceberg.puffin.Blob;
@@ -1620,6 +1621,90 @@ public void testRetainFilesOnRetainedBranches() {
assertThat(deletedFiles).isEqualTo(expectedDeletes);
}

@TestTemplate
public void testRemoveSpecDuringExpiration() {
DataFile file =
DataFiles.builder(table.spec())
.withPath("/path/to/data-0.parquet")
.withPartitionPath("data_bucket=0")
.withFileSizeInBytes(10)
.withRecordCount(100)
.build();
table.newAppend().appendFile(file).commit();
Snapshot append = table.currentSnapshot();
String appendManifest =
Iterables.getOnlyElement(
table.currentSnapshot().allManifests(table.io()).stream()
.map(ManifestFile::path)
.collect(Collectors.toList()));
table.newDelete().deleteFile(file).commit();
Snapshot delete = table.currentSnapshot();
String deleteManifest =
Iterables.getOnlyElement(
table.currentSnapshot().allManifests(table.io()).stream()
.map(ManifestFile::path)
.collect(Collectors.toList()));

table.updateSpec().addField("id_bucket", Expressions.bucket("id", 16)).commit();
PartitionSpec idAndDataBucketSpec = table.spec();
DataFile bucketFile =
DataFiles.builder(table.spec())
.withPath("/path/to/data-0-id-0.parquet")
.withFileSizeInBytes(10)
.withRecordCount(100)
.withPartitionPath("data_bucket=0/id_bucket=0")
.build();
table.newAppend().appendFile(bucketFile).commit();

Set<String> deletedFiles = Sets.newHashSet();
// Expiring snapshots should remove the data_bucket partition
removeSnapshots(table)
.expireOlderThan(System.currentTimeMillis())
.removeUnusedSpecs(true)
.deleteWith(deletedFiles::add)
.commit();

assertThat(deletedFiles)
.containsExactlyInAnyOrder(
appendManifest,
deleteManifest,
file.location(),
append.manifestListLocation(),
delete.manifestListLocation());
assertThat(Iterables.getOnlyElement(table.specs().keySet()))
.as("Only id_bucket + data_bucket transform should exist")
.isEqualTo(idAndDataBucketSpec.specId());
}

Contributor:
Suggested change:

    - assertThat(Iterables.getOnlyElement(table.specs().keySet()))
    + assertThat(table.specs().keySet()).containsExactly(idAndDataBucketSpec.specId())

no need to use Iterables.getOnlyElement

@TestTemplate
public void testRemoveSpecsDoesntRemoveDefaultSpec() throws IOException {
// The default spec for the table is bucketed on data, but write using the unpartitioned spec
PartitionSpec dataBucketSpec = table.spec();
DataFile file =
DataFiles.builder(PartitionSpec.unpartitioned())
.withPath("/path/to/data-0.parquet")
.withFileSizeInBytes(10)
.withRecordCount(100)
.build();

table.newAppend().appendFile(file).commit();
Snapshot append = table.currentSnapshot();
table.newDelete().deleteFile(file).commit();

Set<String> deletedFiles = Sets.newHashSet();
// Expiring snapshots should remove only the unpartitioned spec
removeSnapshots(table)
.expireOlderThan(System.currentTimeMillis())
.removeUnusedSpecs(true)
.deleteWith(deletedFiles::add)
.commit();

assertThat(deletedFiles).containsExactlyInAnyOrder(append.manifestListLocation());
assertThat(Iterables.getOnlyElement(table.specs().keySet()))
.as("Only data_bucket transform should exist")
.isEqualTo(dataBucketSpec.specId());
}

Contributor:
same as above

private Set<String> manifestPaths(Snapshot snapshot, FileIO io) {
return snapshot.allManifests(io).stream().map(ManifestFile::path).collect(Collectors.toSet());
}