oap-project · zhixingheyi-tian · Jul 19, 2022 · Jul 22, 2022 · Jul 22, 2022 · Aug 11, 2022
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -48,6 +48,7 @@ if(POLICY CMP0074)
 endif()
 
 set(ARROW_VERSION "4.0.0")
+#add_compile_options(-g -O0)
 
 string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}")
 
@@ -937,3 +938,7 @@ config_summary_message()
 if(${ARROW_BUILD_CONFIG_SUMMARY_JSON})
   config_summary_json()
 endif()
+
+
+
+
diff --git a/cpp/README.md b/cpp/README.md
@@ -32,3 +32,17 @@ to install pre-compiled binary versions of the library.
 Please refer to our latest [C++ Development Documentation][1].
 
 [1]: https://github.com/apache/arrow/blob/master/docs/source/developers/cpp
+
+## Run parquet string scan benchmark
+#### Minimal benchmark build
+```bash
+cd arrow
+mkdir -p cpp/debug
+cd cpp/debug
+cmake -DCMAKE_BUILD_TYPE=Release -DARROW_BUILD_BENCHMARKS=ON -DARROW_WITH_ZLIB=ON -DARROW_JEMALLOC=OFF -DARROW_PARQUET=ON -DARROW_COMPUTE=ON -DARROW_DATASET=ON -DARROW_WITH_SNAPPY=ON -DARROW_FILESYSTEM=ON ..
+cmake --build .
+```
+
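+#### Run benchmark and collect perf data
+Run the benchmark in the background and sample CPU 0 with `perf` while it is running:
+```bash
+cd cpp/debug   # from the arrow checkout root
+./release/parquet-arrow-parquet-scan-string-benchmark --iterations 10  --threads 1  --file {parquet_path} --cpu 0 &
+perf record -e cycles:ppp -C 0 sleep 10
+perf report
+```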
+
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
@@ -399,6 +399,8 @@ add_parquet_benchmark(column_io_benchmark)
 add_parquet_benchmark(encoding_benchmark)
 add_parquet_benchmark(level_conversion_benchmark)
 add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow")
+add_parquet_benchmark(arrow/parquet_scan_benchmark PREFIX "parquet-arrow")
+add_parquet_benchmark(arrow/parquet_scan_string_benchmark PREFIX "parquet-arrow")
 
 if(ARROW_WITH_BROTLI)
   add_definitions(-DARROW_WITH_BROTLI)

diff --git a/cpp/src/parquet/arrow/parquet_scan_benchmark.cc b/cpp/src/parquet/arrow/parquet_scan_benchmark.cc
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arrow/filesystem/filesystem.h>
+#include <arrow/io/interfaces.h>
+#include <arrow/memory_pool.h>
+#include <arrow/record_batch.h>
+#include <arrow/testing/gtest_util.h>
+#include <arrow/type.h>
+#include <arrow/util/io_util.h>
+#include <benchmark/benchmark.h>
+#include <gtest/gtest.h>
+#include <parquet/arrow/reader.h>
+#include <parquet/file_reader.h>
+
+#include <chrono>
+
+#include "arrow/record_batch.h"
+#include "parquet/arrow/utils/macros.h"
+#include "parquet/arrow/test_utils.h"
+
+
+// namespace parquet {
+// namespace benchmark {
+
+// Batch size passed to ArrowReaderProperties::set_batch_size() and reported as
+// the "batch_buffer_size" benchmark counter.
+constexpr int batch_buffer_size = 32 * 1024;
+
+/// Base fixture for the Parquet scan benchmark.
+///
+/// Construction opens the input file and caches its schema together with the
+/// indices of every row group and column. Derived classes override operator()
+/// to perform the actual measurement.
+class GoogleBenchmarkColumnarToRow {
+ public:
+  /// Opens `file_name` by delegating to GetRecordBatchReader().
+  GoogleBenchmarkColumnarToRow(const std::string& file_name) {
+    GetRecordBatchReader(file_name);
+  }
+
+  // The class has a virtual member, so give it a virtual destructor; the copy
+  // operations are defaulted explicitly because the benchmark object is passed
+  // by value-copy to benchmark::RegisterBenchmark() in main().
+  virtual ~GoogleBenchmarkColumnarToRow() = default;
+  GoogleBenchmarkColumnarToRow(const GoogleBenchmarkColumnarToRow&) = default;
+  GoogleBenchmarkColumnarToRow& operator=(const GoogleBenchmarkColumnarToRow&) = default;
+
+  /// Opens `input_file` (URI or path) and fills in the members used by the
+  /// benchmark loop: `file_name`, `file`, `properties`, `schema`,
+  /// `row_group_indices` and `column_indices`.
+  ///
+  /// Error handling is delegated to the ARROW_ASSIGN_OR_THROW / ASSERT_NOT_OK
+  /// macros (presumably they throw on a non-OK status; see
+  /// parquet/arrow/utils/macros.h).
+  void GetRecordBatchReader(const std::string& input_file) {
+    std::shared_ptr<arrow::fs::FileSystem> fs;
+    // The resolved path is written to the `file_name` member (previously a
+    // same-named local shadowed it, leaving the member always empty).
+    ARROW_ASSIGN_OR_THROW(
+        fs, arrow::fs::FileSystemFromUriOrPath(input_file, &file_name));
+    ARROW_ASSIGN_OR_THROW(file, fs->OpenInputFile(file_name));
+
+    properties.set_batch_size(batch_buffer_size);
+    properties.set_pre_buffer(false);
+    properties.set_use_threads(false);
+
+    // Temporary reader, only needed to query the schema and row-group count.
+    std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
+    ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
+        arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file),
+        properties, &parquet_reader));
+    ASSERT_NOT_OK(parquet_reader->GetSchema(&schema));
+
+    // Rebuild the index lists from scratch so that calling this method again
+    // does not append duplicate indices.
+    row_group_indices.clear();
+    const int num_rowgroups = parquet_reader->num_row_groups();
+    for (int i = 0; i < num_rowgroups; ++i) {
+      row_group_indices.push_back(i);
+    }
+
+    column_indices.clear();
+    const int num_columns = schema->num_fields();
+    for (int i = 0; i < num_columns; ++i) {
+      column_indices.push_back(i);
+    }
+  }
+
+  /// Benchmark body. The base implementation is intentionally empty; derived
+  /// classes override it.
+  virtual void operator()(benchmark::State& state) {}
+
+ protected:
+  /// Sets the CPU affinity mask of the caller (pid 0 in sched_setaffinity) to
+  /// the single CPU `cpuindex`.
+  /// @return the return value of sched_setaffinity().
+  long SetCPU(uint32_t cpuindex) {
+    cpu_set_t cs;
+    CPU_ZERO(&cs);
+    CPU_SET(cpuindex, &cs);
+    return sched_setaffinity(0, sizeof(cs), &cs);
+  }
+
+  // Path produced by FileSystemFromUriOrPath() for the input file.
+  std::string file_name;
+  // Open handle on the Parquet file; passed to ParquetFileReader::Open().
+  std::shared_ptr<arrow::io::RandomAccessFile> file;
+  // 0 .. num_row_groups-1.
+  std::vector<int> row_group_indices;
+  // 0 .. num_fields-1.
+  std::vector<int> column_indices;
+  // Arrow schema obtained from the Parquet file.
+  std::shared_ptr<arrow::Schema> schema;
+  // Reader settings: batch_buffer_size batches, no pre-buffering, no threads.
+  parquet::ArrowReaderProperties properties;
+};
+/// Benchmark that, on every iteration, opens a fresh Parquet reader over the
+/// cached file and drains a RecordBatchReader covering all row groups and
+/// columns, accumulating the time spent in ReadNext().
+class GoogleBenchmarkColumnarToRow_CacheScan_Benchmark
+    : public GoogleBenchmarkColumnarToRow {
+ public:
+  GoogleBenchmarkColumnarToRow_CacheScan_Benchmark(const std::string& filename)
+      : GoogleBenchmarkColumnarToRow(filename) {}
+
+  /// Runs the scan loop.
+  ///
+  /// `state.range(0)` selects the CPU to run on; the sentinel 0xffffffff (the
+  /// default used by main() when --cpu is not given) means "use the benchmark
+  /// thread index as the CPU number".
+  void operator()(benchmark::State& state) override {
+    const uint32_t cpu = (state.range(0) == 0xffffffff)
+                             ? static_cast<uint32_t>(state.thread_index())
+                             : static_cast<uint32_t>(state.range(0));
+    // Pinning is best effort: report a failure but keep running.
+    if (SetCPU(cpu) != 0) {
+      std::cerr << "warning: could not pin benchmark thread to cpu " << cpu
+                << std::endl;
+    }
+
+    std::shared_ptr<arrow::RecordBatch> record_batch;
+    int64_t elapse_read = 0;  // accumulated by TIME_NANO_OR_THROW across iterations
+    int64_t num_batches = 0;
+    int64_t num_rows = 0;
+    // Nothing in this benchmark measures these two; they stay zero and are
+    // kept only so the set of reported counters is unchanged.
+    const int64_t init_time = 0;
+    const int64_t write_time = 0;
+
+    // Print the schema once, not once per benchmark thread.
+    if (state.thread_index() == 0) std::cout << schema->ToString() << std::endl;
+
+    for (auto _ : state) {
+      std::unique_ptr<::parquet::arrow::FileReader> parquet_reader;
+      std::shared_ptr<arrow::RecordBatchReader> record_batch_reader;
+      ASSERT_NOT_OK(::parquet::arrow::FileReader::Make(
+          ::arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file),
+          properties, &parquet_reader));
+
+      ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader(
+          row_group_indices, column_indices, &record_batch_reader));
+
+      // ReadNext() leaves record_batch null once the reader is exhausted.
+      do {
+        TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch));
+
+        if (record_batch) {
+          num_batches += 1;
+          num_rows += record_batch->num_rows();
+        }
+      } while (record_batch);
+
+      // elapse_read is in nanoseconds; print it in milliseconds.
+      std::cout << " parquet parse done elapsed time = " << elapse_read / 1000000
+              << " rows = " << num_rows << std::endl;
+    }
+
+    state.counters["rowgroups"] =
+        benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads,
+                           benchmark::Counter::OneK::kIs1000);
+    state.counters["columns"] =
+        benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads,
+                           benchmark::Counter::OneK::kIs1000);
+    state.counters["batches"] = benchmark::Counter(
+        num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
+    state.counters["num_rows"] = benchmark::Counter(
+        num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
+    state.counters["batch_buffer_size"] =
+        benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads,
+                           benchmark::Counter::OneK::kIs1024);
+
+    state.counters["parquet_parse"] = benchmark::Counter(
+        elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
+    state.counters["init_time"] = benchmark::Counter(
+        init_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
+    state.counters["write_time"] = benchmark::Counter(
+        write_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000);
+  }
+};
+
+// }  // namespace benchmark
+// }  // namespace parquet
+
+/// Parses the benchmark's own command-line options, registers the cache-scan
+/// benchmark and runs it.
+///
+/// Options (each takes exactly one value):
+///   --file <path>      Parquet file to scan (required)
+///   --iterations <n>   number of benchmark iterations (default 1)
+///   --threads <n>      number of benchmark threads (default 1)
+///   --cpu <n>          CPU passed to the benchmark as its range argument;
+///                      defaults to the sentinel 0xffffffff
+///
+/// @return 0 on success, 1 when an option is missing its value or --file is
+///         not given.
+int main(int argc, char** argv) {
+  uint32_t iterations = 1;
+  uint32_t threads = 1;
+  std::string datafile;
+  uint32_t cpu = 0xffffffff;
+
+  // argv[0] is the program name, so start at 1.
+  for (int i = 1; i < argc; i++) {
+    const char* option = argv[i];
+
+    // Destination for numeric options; stays null for anything else.
+    uint32_t* number = nullptr;
+    if (strcmp(option, "--iterations") == 0) {
+      number = &iterations;
+    } else if (strcmp(option, "--threads") == 0) {
+      number = &threads;
+    } else if (strcmp(option, "--cpu") == 0) {
+      number = &cpu;
+    }
+    const bool is_file = strcmp(option, "--file") == 0;
+
+    if (number == nullptr && !is_file) {
+      // Not one of our options; benchmark::Initialize() receives argv below.
+      continue;
+    }
+
+    // Never read argv[argc]: it is a null pointer and must not reach atol()
+    // or std::string.
+    if (i + 1 >= argc) {
+      std::cerr << "missing value for option " << option << std::endl;
+      return 1;
+    }
+
+    const char* value = argv[++i];
+    if (is_file) {
+      datafile = value;
+    } else {
+      *number = atol(value);
+    }
+  }
+
+  if (datafile.empty()) {
+    std::cerr << "usage: " << argv[0]
+              << " --file <parquet_path> [--iterations N] [--threads N] [--cpu N]"
+              << std::endl;
+    return 1;
+  }
+
+  std::cout << "iterations = " << iterations << std::endl;
+  std::cout << "threads = " << threads << std::endl;
+  std::cout << "datafile = " << datafile << std::endl;
+  std::cout << "cpu = " << cpu << std::endl;
+
+  GoogleBenchmarkColumnarToRow_CacheScan_Benchmark
+      bck(datafile);
+
+  // The CPU selection travels to operator() as range(0).
+  benchmark::RegisterBenchmark("GoogleBenchmarkColumnarToRow::CacheScan", bck)
+      ->Args({
+          cpu,
+      })
+      ->Iterations(iterations)
+      ->Threads(threads)
+      ->ReportAggregatesOnly(false)
+      ->MeasureProcessCPUTime()
+      ->Unit(benchmark::kSecond);
+
+  benchmark::Initialize(&argc, argv);
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}