-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Core, Data, Spark 3.5: Support file and partition delete granularity (#…
- Loading branch information
1 parent
ad42314
commit e7999a1
Showing
18 changed files
with
846 additions
and
35 deletions.
There are no files selected for viewing
45 changes: 45 additions & 0 deletions
45
api/src/main/java/org/apache/iceberg/util/CharSequenceUtil.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
package org.apache.iceberg.util; | ||
|
||
/** Utility methods for comparing {@link CharSequence} values such as file paths. */
public class CharSequenceUtil {

  private CharSequenceUtil() {}

  /**
   * Checks whether two paths differ, comparing their contents character by character.
   *
   * <p>The comparison works on any {@link CharSequence} implementation, so a {@code String} and a
   * {@code StringBuilder} holding the same characters are considered equal.
   *
   * <p>The comparison is null-safe: two {@code null} references are considered equal, while a
   * {@code null} and a non-null value are considered unequal.
   *
   * @param s1 the first path, may be {@code null}
   * @param s2 the second path, may be {@code null}
   * @return {@code true} if the paths differ, {@code false} if they hold the same characters
   */
  public static boolean unequalPaths(CharSequence s1, CharSequence s2) {
    // same reference (including both being null) means the paths cannot differ
    if (s1 == s2) {
      return false;
    }

    // the references differ, so at most one of them is null; a missing path never matches
    if (s1 == null || s2 == null) {
      return true;
    }

    int length = s1.length();

    if (length != s2.length()) {
      return true;
    }

    // scan from the end; presumably because paths tend to share long prefixes and differ
    // near the file name, so a mismatch is found sooner this way
    for (int index = length - 1; index >= 0; index--) {
      if (s1.charAt(index) != s2.charAt(index)) {
        return true;
      }
    }

    return false;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
71 changes: 71 additions & 0 deletions
71
core/src/main/java/org/apache/iceberg/deletes/DeleteGranularity.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
package org.apache.iceberg.deletes; | ||
|
||
import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
|
||
/** | ||
* An enum that represents the granularity of deletes. | ||
* | ||
* <p>Under partition granularity, delete writers are directed to group deletes for different data | ||
* files into one delete file. This strategy tends to reduce the total number of delete files in the | ||
* table. However, a scan for a single data file will require reading delete information for | ||
* multiple data files even if those other files are not required for the scan. All irrelevant | ||
* deletes will be discarded by readers but reading this extra information will cause overhead. The | ||
* overhead can potentially be mitigated via delete file caching. | ||
* | ||
* <p>Under file granularity, delete writers always organize deletes by their target data file, | ||
* creating separate delete files for each referenced data file. This strategy ensures the job | ||
* planning does not assign irrelevant deletes to data files and readers only load necessary delete | ||
* information. However, it also increases the total number of delete files in the table and may | ||
* require a more aggressive approach for delete file compaction. | ||
* | ||
* <p>Currently, this configuration is only applicable to position deletes. | ||
* | ||
* <p>Each granularity has its own benefits and drawbacks and should be picked based on a use case. | ||
* Regular delete compaction is still required regardless of which granularity is chosen. It is also | ||
* possible to use one granularity for ingestion and another one for table maintenance. | ||
*/ | ||
public enum DeleteGranularity { | ||
FILE, | ||
PARTITION; | ||
|
||
@Override | ||
public String toString() { | ||
switch (this) { | ||
case FILE: | ||
return "file"; | ||
case PARTITION: | ||
return "partition"; | ||
default: | ||
throw new IllegalArgumentException("Unknown delete granularity: " + this); | ||
} | ||
} | ||
|
||
public static DeleteGranularity fromString(String valueAsString) { | ||
Preconditions.checkArgument(valueAsString != null, "Value is null"); | ||
if (FILE.toString().equalsIgnoreCase(valueAsString)) { | ||
return FILE; | ||
} else if (PARTITION.toString().equalsIgnoreCase(valueAsString)) { | ||
return PARTITION; | ||
} else { | ||
throw new IllegalArgumentException("Unknown delete granularity: " + valueAsString); | ||
} | ||
} | ||
} |
113 changes: 113 additions & 0 deletions
113
core/src/main/java/org/apache/iceberg/deletes/FileScopedPositionDeleteWriter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
package org.apache.iceberg.deletes; | ||
|
||
import java.io.IOException; | ||
import java.io.UncheckedIOException; | ||
import java.util.List; | ||
import java.util.function.Supplier; | ||
import org.apache.iceberg.DeleteFile; | ||
import org.apache.iceberg.io.DeleteWriteResult; | ||
import org.apache.iceberg.io.FileWriter; | ||
import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
import org.apache.iceberg.relocated.com.google.common.collect.Lists; | ||
import org.apache.iceberg.util.CharSequenceSet; | ||
import org.apache.iceberg.util.CharSequenceUtil; | ||
|
||
/** | ||
* A position delete writer that produces a separate delete file for each referenced data file. | ||
* | ||
* <p>This writer does not keep track of seen deletes and assumes all incoming records are ordered | ||
* by file and position as required by the spec. If there is no external process to order the | ||
* records, consider using {@link SortingPositionOnlyDeleteWriter} instead. | ||
*/ | ||
public class FileScopedPositionDeleteWriter<T> | ||
implements FileWriter<PositionDelete<T>, DeleteWriteResult> { | ||
|
||
private final Supplier<FileWriter<PositionDelete<T>, DeleteWriteResult>> writers; | ||
private final List<DeleteFile> deleteFiles; | ||
private final CharSequenceSet referencedDataFiles; | ||
|
||
private FileWriter<PositionDelete<T>, DeleteWriteResult> currentWriter = null; | ||
private CharSequence currentPath = null; | ||
private boolean closed = false; | ||
|
||
public FileScopedPositionDeleteWriter( | ||
Supplier<FileWriter<PositionDelete<T>, DeleteWriteResult>> writers) { | ||
this.writers = writers; | ||
this.deleteFiles = Lists.newArrayList(); | ||
this.referencedDataFiles = CharSequenceSet.empty(); | ||
} | ||
|
||
@Override | ||
public void write(PositionDelete<T> positionDelete) { | ||
writer(positionDelete.path()).write(positionDelete); | ||
} | ||
|
||
private FileWriter<PositionDelete<T>, DeleteWriteResult> writer(CharSequence path) { | ||
if (currentWriter == null) { | ||
openCurrentWriter(path); | ||
} else if (CharSequenceUtil.unequalPaths(currentPath, path)) { | ||
closeCurrentWriter(); | ||
openCurrentWriter(path); | ||
} | ||
|
||
return currentWriter; | ||
} | ||
|
||
@Override | ||
public long length() { | ||
throw new UnsupportedOperationException(getClass().getName() + " does not implement length"); | ||
} | ||
|
||
@Override | ||
public DeleteWriteResult result() { | ||
Preconditions.checkState(closed, "Cannot get result from unclosed writer"); | ||
return new DeleteWriteResult(deleteFiles, referencedDataFiles); | ||
} | ||
|
||
@Override | ||
public void close() throws IOException { | ||
if (!closed) { | ||
closeCurrentWriter(); | ||
this.closed = true; | ||
} | ||
} | ||
|
||
private void openCurrentWriter(CharSequence path) { | ||
Preconditions.checkState(!closed, "Writer has already been closed"); | ||
this.currentWriter = writers.get(); | ||
this.currentPath = path; | ||
} | ||
|
||
private void closeCurrentWriter() { | ||
if (currentWriter != null) { | ||
try { | ||
currentWriter.close(); | ||
DeleteWriteResult result = currentWriter.result(); | ||
deleteFiles.addAll(result.deleteFiles()); | ||
referencedDataFiles.addAll(result.referencedDataFiles()); | ||
this.currentWriter = null; | ||
this.currentPath = null; | ||
} catch (IOException e) { | ||
throw new UncheckedIOException("Failed to close current writer", e); | ||
} | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.