From 5f40738c3b328e8b0c32fd83821c5c41566b135d Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Mon, 25 Sep 2023 13:08:30 +0200 Subject: [PATCH] Core: Add ManifestWrite benchmark --- .../apache/iceberg/ManifestReadBenchmark.java | 7 +- .../iceberg/ManifestWriteBenchmark.java | 153 ++++++++++++++++++ 2 files changed, 155 insertions(+), 5 deletions(-) create mode 100644 core/src/jmh/java/org/apache/iceberg/ManifestWriteBenchmark.java diff --git a/core/src/jmh/java/org/apache/iceberg/ManifestReadBenchmark.java b/core/src/jmh/java/org/apache/iceberg/ManifestReadBenchmark.java index 6677e5d8b651..e4a63beceaa6 100644 --- a/core/src/jmh/java/org/apache/iceberg/ManifestReadBenchmark.java +++ b/core/src/jmh/java/org/apache/iceberg/ManifestReadBenchmark.java @@ -69,10 +69,9 @@ public void before() { manifestListFile = String.format("%s/%s.avro", baseDir, UUID.randomUUID()); Random random = new Random(System.currentTimeMillis()); - ManifestListWriter listWriter = - ManifestLists.write(1, org.apache.iceberg.Files.localOutput(manifestListFile), 0, 1L, 0); - try { + try (ManifestListWriter listWriter = + ManifestLists.write(1, org.apache.iceberg.Files.localOutput(manifestListFile), 0, 1L, 0)) { for (int i = 0; i < NUM_FILES; i++) { OutputFile manifestFile = org.apache.iceberg.Files.localOutput( @@ -98,8 +97,6 @@ public void before() { listWriter.add(writer.toManifestFile()); } - - listWriter.close(); } catch (IOException e) { throw new UncheckedIOException(e); } diff --git a/core/src/jmh/java/org/apache/iceberg/ManifestWriteBenchmark.java b/core/src/jmh/java/org/apache/iceberg/ManifestWriteBenchmark.java new file mode 100644 index 000000000000..b256350a67b5 --- /dev/null +++ b/core/src/jmh/java/org/apache/iceberg/ManifestWriteBenchmark.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.io.File; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.util.Map; +import java.util.Random; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import org.apache.commons.io.FileUtils; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.io.Files; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.TearDown; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Timeout; + +/** + * A benchmark that evaluates the performance of writing manifest files + * + *

To run this benchmark: + * ./gradlew :iceberg-core:jmh -PjmhIncludeRegex=ManifestWriteBenchmark + * + */ +@Fork(1) +@State(Scope.Benchmark) +@Measurement(iterations = 5) +@BenchmarkMode(Mode.SingleShotTime) +@Timeout(time = 1000, timeUnit = TimeUnit.HOURS) +public class ManifestWriteBenchmark { + + private static final int NUM_FILES = 10; + private static final int NUM_ROWS = 100000; + private static final int NUM_COLS = 100; + + private String baseDir; + private String manifestListFile; + + private Metrics metrics; + + @Setup + public void before() { + Random random = new Random(System.currentTimeMillis()); + // Pre-create the metrics to avoid doing this in the benchmark itself + metrics = randomMetrics(random); + } + + @TearDown + public void after() { + if (baseDir != null) { + FileUtils.deleteQuietly(new File(baseDir)); + baseDir = null; + } + + manifestListFile = null; + } + + @Benchmark + @Threads(1) + public void writeManifestFile() throws IOException { + baseDir = Files.createTempDir().getAbsolutePath(); + manifestListFile = String.format("%s/%s.avro", baseDir, UUID.randomUUID()); + + try (ManifestListWriter listWriter = + ManifestLists.write(1, org.apache.iceberg.Files.localOutput(manifestListFile), 0, 1L, 0)) { + for (int i = 0; i < NUM_FILES; i++) { + OutputFile manifestFile = + org.apache.iceberg.Files.localOutput( + String.format("%s/%s.avro", baseDir, UUID.randomUUID())); + + ManifestWriter writer = + ManifestFiles.write(1, PartitionSpec.unpartitioned(), manifestFile, 1L); + try (ManifestWriter finalWriter = writer) { + for (int j = 0; j < NUM_ROWS; j++) { + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withFormat(FileFormat.PARQUET) + .withPath(String.format("/path/to/data-%s-%s.parquet", i, j)) + .withFileSizeInBytes(j) + .withRecordCount(j) + .withMetrics(metrics) + .build(); + finalWriter.add(dataFile); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + + listWriter.add(writer.toManifestFile()); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private Metrics randomMetrics(Random random) { + long rowCount = 100000L + random.nextInt(1000); + Map columnSizes = Maps.newHashMap(); + Map valueCounts = Maps.newHashMap(); + Map nullValueCounts = Maps.newHashMap(); + Map nanValueCounts = Maps.newHashMap(); + Map lowerBounds = Maps.newHashMap(); + Map upperBounds = Maps.newHashMap(); + for (int i = 0; i < NUM_COLS; i++) { + columnSizes.put(i, 1000000L + random.nextInt(100000)); + valueCounts.put(i, 100000L + random.nextInt(100)); + nullValueCounts.put(i, (long) random.nextInt(5)); + nanValueCounts.put(i, (long) random.nextInt(5)); + byte[] lower = new byte[8]; + random.nextBytes(lower); + lowerBounds.put(i, ByteBuffer.wrap(lower)); + byte[] upper = new byte[8]; + random.nextBytes(upper); + upperBounds.put(i, ByteBuffer.wrap(upper)); + } + + return new Metrics( + rowCount, + columnSizes, + valueCounts, + nullValueCounts, + nanValueCounts, + lowerBounds, + upperBounds); + } +}