From 0b2c8a2b1088ae26aa3d0b0858aa2a3f587d64a7 Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Tue, 26 May 2026 09:37:37 +0800 Subject: [PATCH 01/13] chore: add benchmark for append and pk table --- CMakeLists.txt | 18 + benchmark/CMakeLists.txt | 69 ++ benchmark/benchmark_case_mor_read.cpp | 33 + benchmark/benchmark_case_pk_write.cpp | 27 + benchmark/benchmark_case_read.cpp | 33 + benchmark/benchmark_case_write.cpp | 27 + benchmark/benchmark_helpers.cpp | 92 +++ benchmark/benchmark_helpers.h | 58 ++ benchmark/benchmark_suite.cpp | 849 ++++++++++++++++++++++++ benchmark/benchmark_suite.h | 32 + benchmark/cli_option_parsing.h | 157 +++++ benchmark/cli_option_parsing_test.cpp | 145 ++++ benchmark/read_write_benchmark.cpp | 42 ++ cmake_modules/BuildUtils.cmake | 112 ++++ cmake_modules/DefineOptions.cmake | 10 + cmake_modules/FindbenchmarkAlt.cmake | 60 ++ cmake_modules/ThirdpartyToolchain.cmake | 56 ++ third_party/versions.txt | 4 + 18 files changed, 1824 insertions(+) create mode 100644 benchmark/CMakeLists.txt create mode 100644 benchmark/benchmark_case_mor_read.cpp create mode 100644 benchmark/benchmark_case_pk_write.cpp create mode 100644 benchmark/benchmark_case_read.cpp create mode 100644 benchmark/benchmark_case_write.cpp create mode 100644 benchmark/benchmark_helpers.cpp create mode 100644 benchmark/benchmark_helpers.h create mode 100644 benchmark/benchmark_suite.cpp create mode 100644 benchmark/benchmark_suite.h create mode 100644 benchmark/cli_option_parsing.h create mode 100644 benchmark/cli_option_parsing_test.cpp create mode 100644 benchmark/read_write_benchmark.cpp create mode 100644 cmake_modules/FindbenchmarkAlt.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 124d6c540..30d557b7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) option(PAIMON_BUILD_STATIC "Build static library" ON) option(PAIMON_BUILD_SHARED "Build shared library" ON) option(PAIMON_BUILD_TESTS "Build tests" OFF) 
+option(PAIMON_BUILD_BENCHMARKS "Build benchmarks" OFF) option(PAIMON_USE_ASAN "Use Address Sanitizer" OFF) option(PAIMON_USE_UBSAN "Use Undefined Behavior Sanitizer" OFF) option(PAIMON_USE_CXX11_ABI "Use C++11 ABI" ON) @@ -441,6 +442,22 @@ if(PAIMON_BUILD_TESTS) endif() endif() +if(PAIMON_BUILD_BENCHMARKS) + add_custom_target(paimon-benchmarks) + add_custom_target(benchmark + ctest + -j4 + -L + benchmark + --output-on-failure) + add_dependencies(benchmark paimon-benchmarks) + + set(PAIMON_BENCHMARK_LINK_TOOLCHAIN benchmark::benchmark) + if(TARGET benchmark::benchmark_main) + list(APPEND PAIMON_BENCHMARK_LINK_TOOLCHAIN benchmark::benchmark_main) + endif() +endif() + paimon_print_dependency_resolution_summary() include(CMakePackageConfigHelpers) @@ -472,3 +489,4 @@ add_subdirectory(src/paimon/global_index/lucene) add_subdirectory(src/paimon/testing/mock) add_subdirectory(src/paimon/testing/utils) add_subdirectory(test/inte) +add_subdirectory(benchmark) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 000000000..23d863ef7 --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,69 @@ +# Copyright 2026-present Alibaba Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(NOT PAIMON_BUILD_BENCHMARKS AND NOT PAIMON_BUILD_TESTS) + return() +endif() + +find_package(Threads REQUIRED) + +set(PAIMON_BENCHMARK_STATIC_LINK_LIBS + paimon_shared + "-Wl,--whole-archive" + paimon_local_file_system_shared + "-Wl,--no-whole-archive" + "-Wl,--no-as-needed" + paimon_parquet_file_format_shared + paimon_blob_file_format_shared + "-Wl,--as-needed") + +if(PAIMON_ENABLE_ORC) + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS "-Wl,--no-as-needed") + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS paimon_orc_file_format_shared) + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS "-Wl,--as-needed") +endif() + +if(PAIMON_ENABLE_AVRO) + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS "-Wl,--no-as-needed") + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS paimon_avro_file_format_shared) + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS "-Wl,--as-needed") +endif() + +if(PAIMON_BUILD_BENCHMARKS) + add_paimon_benchmark(read_write_benchmark + SOURCES + benchmark_helpers.cpp + benchmark_suite.cpp + benchmark_case_write.cpp + benchmark_case_read.cpp + benchmark_case_pk_write.cpp + benchmark_case_mor_read.cpp + read_write_benchmark.cpp + STATIC_LINK_LIBS + arrow + parquet + ${PAIMON_BENCHMARK_STATIC_LINK_LIBS} + Threads::Threads + ${CMAKE_DL_LIBS} + rt + ${PAIMON_BENCHMARK_LINK_TOOLCHAIN}) +endif() + +if(PAIMON_BUILD_TESTS) + add_paimon_test(cli_option_parsing_test + SOURCES + cli_option_parsing_test.cpp + STATIC_LINK_LIBS + ${GTEST_LINK_TOOLCHAIN}) +endif() diff --git a/benchmark/benchmark_case_mor_read.cpp b/benchmark/benchmark_case_mor_read.cpp new file mode 100644 index 000000000..f9b938498 --- /dev/null +++ b/benchmark/benchmark_case_mor_read.cpp @@ -0,0 +1,33 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark_suite.h" + +namespace { + +void BM_MOR_Read(::benchmark::State& state) { + paimon::benchmark::RunBMMorRead(state); +} + +} // namespace + +BENCHMARK(BM_MOR_Read) + ->ArgNames({"prefetch_parallel"}) + ->Unit(benchmark::kMillisecond) + ->UseRealTime() + ->Args({1}) + ->Args({2}) + ->Args({4}); diff --git a/benchmark/benchmark_case_pk_write.cpp b/benchmark/benchmark_case_pk_write.cpp new file mode 100644 index 000000000..7c1ac5f4f --- /dev/null +++ b/benchmark/benchmark_case_pk_write.cpp @@ -0,0 +1,27 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "benchmark_suite.h" + +namespace { + +void BM_PK_Write(::benchmark::State& state) { + paimon::benchmark::RunBMPkWrite(state); +} + +} // namespace + +BENCHMARK(BM_PK_Write)->Unit(benchmark::kMillisecond)->UseRealTime(); diff --git a/benchmark/benchmark_case_read.cpp b/benchmark/benchmark_case_read.cpp new file mode 100644 index 000000000..3ae35efd6 --- /dev/null +++ b/benchmark/benchmark_case_read.cpp @@ -0,0 +1,33 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark_suite.h" + +namespace { + +void BM_Read(::benchmark::State& state) { + paimon::benchmark::RunBMRead(state); +} + +} // namespace + +BENCHMARK(BM_Read) + ->ArgNames({"prefetch_parallel"}) + ->Unit(benchmark::kMillisecond) + ->UseRealTime() + ->Args({1}) + ->Args({2}) + ->Args({4}); diff --git a/benchmark/benchmark_case_write.cpp b/benchmark/benchmark_case_write.cpp new file mode 100644 index 000000000..43693e391 --- /dev/null +++ b/benchmark/benchmark_case_write.cpp @@ -0,0 +1,27 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark_suite.h" + +namespace { + +void BM_Write(::benchmark::State& state) { + paimon::benchmark::RunBMWrite(state); +} + +} // namespace + +BENCHMARK(BM_Write)->Unit(benchmark::kMillisecond)->UseRealTime(); diff --git a/benchmark/benchmark_helpers.cpp b/benchmark/benchmark_helpers.cpp new file mode 100644 index 000000000..09d601c0e --- /dev/null +++ b/benchmark/benchmark_helpers.cpp @@ -0,0 +1,92 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "benchmark_helpers.h"
+
+#include <exception>
+#include <iostream>
+
+#include "benchmark/benchmark.h"
+
+namespace paimon::benchmark {
+
+// Skips the benchmark through `skip` when the target file format is not supported.
+// Returns true when the benchmark may proceed, false after `skip` has been called.
+bool BenchmarkHelpers::ValidateFileFormatOrSkip(::benchmark::State& state,
+                                                const std::string& file_format,
+                                                bool is_supported, SkipFn skip) {
+    if (!is_supported) {
+        skip(state, "file format is not supported in this build: " + file_format);
+        return false;
+    }
+    return true;
+}
+
+// Skips the benchmark with the caller-provided `message` when no source path was given.
+// Returns true when `source_path` is non-empty.
+bool BenchmarkHelpers::ValidateSourcePresenceOrSkip(::benchmark::State& state,
+                                                    const std::string& source_path,
+                                                    const std::string& message,
+                                                    SkipFn skip) {
+    if (source_path.empty()) {
+        skip(state, message);
+        return false;
+    }
+    return true;
+}
+
+// Skips the benchmark when the source data format cannot be read.
+// Returns true when `is_supported` is set.
+bool BenchmarkHelpers::ValidateSourceSupportOrSkip(::benchmark::State& state,
+                                                   const std::string& source_format,
+                                                   bool is_supported, SkipFn skip) {
+    if (!is_supported) {
+        skip(state,
+             "source data mode requires reader support in this build for format: " +
+                 source_format);
+        return false;
+    }
+    return true;
+}
+
+// Skips the benchmark when the prefetch parallelism argument is zero or negative.
+// Returns true when `prefetch_parallel_num` is positive.
+bool BenchmarkHelpers::ValidatePrefetchParallelOrSkip(::benchmark::State& state,
+                                                      int32_t prefetch_parallel_num,
+                                                      SkipFn skip) {
+    if (prefetch_parallel_num <= 0) {
+        skip(state, "prefetch_parallel must be greater than 0");
+        return false;
+    }
+    return true;
+}
+
+// Runs `read_once` once per benchmark iteration and returns the row count of the last
+// completed call (0 when none completed).
+//
+// An exception thrown by `read_once` is reported through State::SkipWithError instead of
+// escaping the benchmark function, where it would terminate the whole benchmark binary.
+int64_t BenchmarkHelpers::RunReadIterations(::benchmark::State& state,
+                                            const ReadOnceFn& read_once) {
+    int64_t rows_read = 0;
+    try {
+        for (auto _ : state) {
+            rows_read = read_once();
+        }
+    } catch (const std::exception& e) {
+        // The loop has already been left by the unwinding, so no explicit break is needed.
+        state.SkipWithError(e.what());
+    }
+    return rows_read;
+}
+
+// Reads from an already existing table when `external_table_path` is non-empty.
+// Returns false without touching `state` when the path is empty; otherwise logs the path,
+// runs the timed read loop, records the processed item count and returns true.
+bool BenchmarkHelpers::TryRunExternalReadMode(::benchmark::State& state,
+                                              const std::string& benchmark_name,
+                                              const std::string& external_table_path,
+                                              const ReadOnceFn& read_once) {
+    if (external_table_path.empty()) {
+        return false;
+    }
+
+    std::cout << "[benchmark][" << benchmark_name
+              << "] external_table_path=" << external_table_path << std::endl;
+    const int64_t rows_read = RunReadIterations(state, read_once);
+    state.SetItemsProcessed(state.iterations() * rows_read);
+    return true;
+}
+
+}  // namespace paimon::benchmark
diff --git a/benchmark/benchmark_helpers.h b/benchmark/benchmark_helpers.h
new file mode 100644
index 000000000..30fb0c3a9
--- /dev/null
+++ b/benchmark/benchmark_helpers.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <string>
+
+namespace benchmark {
+class State;
+}
+
+namespace paimon::benchmark {
+
+/// Stateless helpers shared by the benchmark cases: precondition checks that skip a
+/// benchmark through a caller-supplied callback, and drivers for the read loop.
+class BenchmarkHelpers {
+ public:
+    /// Performs one full read pass and returns the number of rows it read.
+    using ReadOnceFn = std::function<int64_t()>;
+    /// Callback invoked with the benchmark state and a message when a check fails.
+    using SkipFn = void (*)(::benchmark::State&, const std::string&);
+
+    /// Returns true when `is_supported` is set; otherwise calls `skip` with a message
+    /// naming `file_format` and returns false.
+    static bool ValidateFileFormatOrSkip(::benchmark::State& state,
+                                         const std::string& file_format,
+                                         bool is_supported, SkipFn skip);
+
+    /// Returns true when `source_path` is non-empty; otherwise calls `skip` with
+    /// `message` and returns false.
+    static bool ValidateSourcePresenceOrSkip(::benchmark::State& state,
+                                             const std::string& source_path,
+                                             const std::string& message, SkipFn skip);
+
+    /// Returns true when `is_supported` is set; otherwise calls `skip` with a message
+    /// naming `source_format` and returns false.
+    static bool ValidateSourceSupportOrSkip(::benchmark::State& state,
+                                            const std::string& source_format,
+                                            bool is_supported, SkipFn skip);
+
+    /// Returns true when `prefetch_parallel_num` is greater than zero; otherwise calls
+    /// `skip` and returns false.
+    static bool ValidatePrefetchParallelOrSkip(::benchmark::State& state,
+                                               int32_t prefetch_parallel_num,
+                                               SkipFn skip);
+
+    /// Calls `read_once` once per benchmark iteration and returns the row count
+    /// reported by the last completed call (0 when none completed).
+    static int64_t RunReadIterations(::benchmark::State& state, const ReadOnceFn& read_once);
+
+    /// Returns false without running anything when `external_table_path` is empty.
+    /// Otherwise logs the path, runs the read iterations, records the processed item
+    /// count on `state`, and returns true.
+    static bool TryRunExternalReadMode(::benchmark::State& state,
+                                       const std::string& benchmark_name,
+                                       const std::string& external_table_path,
+                                       const ReadOnceFn& read_once);
+};
+
+}  // namespace paimon::benchmark
diff --git a/benchmark/benchmark_suite.cpp
b/benchmark/benchmark_suite.cpp new file mode 100644 index 000000000..15dfc335d --- /dev/null +++ b/benchmark/benchmark_suite.cpp @@ -0,0 +1,849 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/c/bridge.h" +#include "arrow/io/api.h" +#include "benchmark_helpers.h" +#include "benchmark_suite.h" +#include "cli_option_parsing.h" +#include "paimon/api.h" +#include "paimon/catalog/catalog.h" + +#if __has_include("parquet/arrow/reader.h") +#include "parquet/arrow/reader.h" +#define PAIMON_BENCHMARK_HAS_PARQUET_READER 1 +#else +#define PAIMON_BENCHMARK_HAS_PARQUET_READER 0 +#endif + +namespace paimon::benchmark { + +namespace { + +using DataBatches = std::vector>; + +struct BenchmarkCliOptions { + std::string source_parquet; + std::string external_table_path; + std::string file_format = "parquet"; + std::vector pk_columns; + std::vector> extra_options; +}; + +struct SourceDataSpec { + std::string format; + std::string path; +}; + +BenchmarkCliOptions& MutableBenchmarkCliOptions() { + static BenchmarkCliOptions options; + return options; +} + +const BenchmarkCliOptions& GetBenchmarkCliOptions() { + return MutableBenchmarkCliOptions(); +} + +void ParsePaimonBenchmarkCliArgsImpl(int* argc, 
char** argv) { + auto& options = MutableBenchmarkCliOptions(); + int write_idx = 1; + for (int i = 1; i < *argc; ++i) { + const std::string arg(argv[i]); + + if (paimon::benchmark::ParseStringOptionArg(&i, *argc, argv, arg, "--paimon_source_parquet", + &options.source_parquet)) { + continue; + } + if (paimon::benchmark::ParseStringOptionArg(&i, *argc, argv, arg, + "--paimon_external_table_path", + &options.external_table_path)) { + continue; + } + if (paimon::benchmark::ParseStringOptionArg(&i, *argc, argv, arg, "--paimon_file_format", + &options.file_format)) { + continue; + } + if (paimon::benchmark::ParseCsvOptionArg(&i, *argc, argv, arg, "--paimon_pk_columns", + &options.pk_columns)) { + continue; + } + if (paimon::benchmark::ParseDelimitedRepeatableOptionArg( + &i, *argc, argv, arg, "--paimon_option", &options.extra_options)) { + continue; + } + + argv[write_idx++] = argv[i]; + } + + *argc = write_idx; + argv[write_idx] = nullptr; +} + +bool HasHelpFlagImpl(int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + const std::string arg(argv[i]); + if (arg == "-h" || arg == "--help" || arg == "--help=true") { + return true; + } + } + return false; +} + +void PrintPaimonBenchmarkCliHelpImpl() { + std::cout + << "Paimon benchmark custom options:\n" + << " --paimon_source_parquet=\n" + << " Required. External Parquet source file used to build benchmark data.\n" + << " Also supports: --paimon_source_parquet \n" + << " --paimon_external_table_path=\n" + << " Optional for BM_Read and BM_MOR_Read. If set, read directly from existing\n" + << " table path and\n" + << " skip source file loading and pre-write stage.\n" + << " Also supports: --paimon_external_table_path \n" + << " --paimon_file_format=\n" + << " Optional. Target table file format. 
Default: parquet.\n"
+        << " Also supports: --paimon_file_format \n"
+        << " --paimon_pk_columns=\n"
+        << " Required by BM_PK_Write and BM_MOR_Read.\n"
+        << " Also supports: --paimon_pk_columns \n"
+        << " --paimon_option=:;:\n"
+        << " Optional and repeatable. Pass through table options as-is.\n"
+        << " Also supports: --paimon_option :;:\n"
+        << " Note: use quotes in shell, e.g. \"--paimon_option k1:v1;k2:v2\".\n"
+        << "\n"
+        << "Example:\n"
+        << " paimon-read-write-benchmark --paimon_source_parquet /path/data.parquet \\\n"
+        << " --paimon_file_format parquet --paimon_pk_columns=id \\\n"
+        << " --paimon_option \"read.batch-size:8192;bucket:4\" --benchmark_filter=BM_Read\n"
+        << std::endl;
+}
+
+// Scratch directory under /tmp that is removed again when the object is destroyed.
+//
+// The directory is created with mkdtemp(), so every workspace gets a unique path. This
+// holds across benchmark processes running at the same time and across leftovers of a
+// crashed run, so one run can never delete the tables of another.
+struct BenchmarkWorkspace {
+    // Creates "/tmp/<prefix>_XXXXXX" with the X's replaced by mkdtemp().
+    // Throws std::runtime_error when the directory cannot be created.
+    explicit BenchmarkWorkspace(const std::string& prefix) {
+        std::string path_template = "/tmp/" + prefix + "_XXXXXX";
+        if (::mkdtemp(path_template.data()) == nullptr) {
+            // Capture errno before building the message; allocation may overwrite it.
+            const int saved_errno = errno;
+            throw std::runtime_error("failed to create benchmark workspace: " + path_template +
+                                     ", errno=" + std::to_string(saved_errno));
+        }
+        root_path = std::move(path_template);
+    }
+
+    // The destructor deletes the directory, so a copy would delete it twice.
+    BenchmarkWorkspace(const BenchmarkWorkspace&) = delete;
+    BenchmarkWorkspace& operator=(const BenchmarkWorkspace&) = delete;
+
+    ~BenchmarkWorkspace() {
+        const std::string cleanup_cmd = "rm -rf '" + root_path + "'";
+        if (std::system(cleanup_cmd.c_str()) != 0) {
+            // Best effort: a destructor must not throw, so only report the leftover.
+            std::cerr << "[benchmark][workspace] failed to remove " << root_path << std::endl;
+        }
+    }
+
+    std::string root_path;
+};
+
+// Returns a process-wide increasing id (starting at 1) used to build unique table names.
+uint64_t NextTableId() {
+    static std::atomic<uint64_t> id{0};
+    return ++id;
+}
+
+// Builds the table directory "<root_path>/<db_name>.db/<table_name>".
+std::string RequirePath(const std::string& root_path, const std::string& db_name,
+                        const std::string& table_name) {
+    return root_path + "/" + db_name + ".db/" + table_name;
+}
+
+// Unwraps a successful result; throws std::runtime_error("<context>: <status>") otherwise.
+template <typename T>
+T ValueOrThrow(paimon::Result<T>&& result, const std::string& context) {
+    if (!result.ok()) {
+        throw std::runtime_error(context + ": " + result.status().ToString());
+    }
+    return std::move(result).value();
+}
+
+void CheckStatus(const paimon::Status& status, const std::string& context) {
+    if (!status.ok()) {
+        throw std::runtime_error(context + ": 
" + status.ToString()); + } +} + +void SkipWithMessage(::benchmark::State& state, const std::string& message) { + static thread_local std::string owned_message; + owned_message = message; + state.SkipWithError(owned_message.c_str()); +} + +std::string GetFileFormatFromEnv() { + std::string file_format = GetBenchmarkCliOptions().file_format; + for (const auto& kv : GetBenchmarkCliOptions().extra_options) { + if (kv.first == paimon::Options::FILE_FORMAT) { + file_format = kv.second; + } + } + return file_format; +} + +bool IsFileFormatSupported(const std::string& format) { + if (format == "parquet") { + return true; + } + if (format == "orc") { +#ifdef PAIMON_ENABLE_ORC + return true; +#else + return false; +#endif + } + return false; +} + +void ApplyExtraOptions(std::map* options) { + for (const auto& kv : GetBenchmarkCliOptions().extra_options) { + (*options)[kv.first] = kv.second; + } +} + +std::map BuildOptions(const std::string& file_format) { + std::map options = { + {paimon::Options::FILE_FORMAT, file_format}, + }; + ApplyExtraOptions(&options); + return options; +} + +std::map BuildPkOptions(const std::string& file_format) { + auto options = BuildOptions(file_format); + options.emplace(paimon::Options::BUCKET, "4"); + options.emplace(paimon::Options::MERGE_ENGINE, "deduplicate"); + return options; +} + +std::string GetParquetDataSourcePath() { + return GetBenchmarkCliOptions().source_parquet; +} + +std::string GetExternalTablePath() { + return GetBenchmarkCliOptions().external_table_path; +} + +const std::vector& GetPkColumns() { + return GetBenchmarkCliOptions().pk_columns; +} + +SourceDataSpec GetSourceDataSpec() { + const std::string parquet_source_path = GetParquetDataSourcePath(); + if (!parquet_source_path.empty()) { + return {"parquet", parquet_source_path}; + } + return {"", ""}; +} + +int64_t GetParquetSourceBatchMaxRows() { + const char* value = std::getenv("PAIMON_BENCHMARK_SOURCE_BATCH_MAX_ROWS"); + if (value == nullptr || std::strlen(value) == 0) 
{ + return 4096; + } + + char* end = nullptr; + const auto parsed = std::strtoll(value, &end, 10); + if (end == value || *end != '\0' || parsed <= 0) { + throw std::runtime_error( + "invalid PAIMON_BENCHMARK_SOURCE_BATCH_MAX_ROWS, expected positive integer"); + } + return static_cast(parsed); +} + +bool SupportsParquetSourceDataMode() { +#if PAIMON_BENCHMARK_HAS_PARQUET_READER + return true; +#else + return false; +#endif +} + +bool SupportsSourceDataMode(const std::string& source_format) { + if (source_format == "parquet") { + return SupportsParquetSourceDataMode(); + } + return false; +} + +struct ParquetSourceCache { + std::string path; + int64_t batch_max_rows = 0; + std::shared_ptr schema; + DataBatches batches; + int64_t total_rows = 0; +}; + +struct SourceDataCache { + std::shared_ptr schema; + const DataBatches* batches = nullptr; + int64_t total_rows = 0; + std::string format; + std::string path; +}; + +std::shared_ptr BuildStructArrayFromRecordBatch( + const std::shared_ptr& batch) { + return std::make_shared(arrow::struct_(batch->schema()->fields()), + batch->num_rows(), batch->columns()); +} + +const ParquetSourceCache& LoadParquetSource(const std::string& path) { + const int64_t batch_max_rows = GetParquetSourceBatchMaxRows(); + static ParquetSourceCache cache; + if (cache.path == path && cache.batch_max_rows == batch_max_rows) { + return cache; + } + +#if !PAIMON_BENCHMARK_HAS_PARQUET_READER + throw std::runtime_error( + "Parquet source data mode requires parquet::arrow reader support in this build"); +#else + auto input = arrow::io::ReadableFile::Open(path); + if (!input.ok()) { + throw std::runtime_error("open Parquet source failed: " + path + ", " + + input.status().ToString()); + } + + std::unique_ptr parquet_reader; + const auto open_status = parquet::arrow::OpenFile( + input.ValueUnsafe(), arrow::default_memory_pool(), &parquet_reader); + if (!open_status.ok()) { + throw std::runtime_error("create Parquet reader failed: " + 
open_status.ToString()); + } + + std::shared_ptr table; + const auto read_status = parquet_reader->ReadTable(&table); + if (!read_status.ok()) { + throw std::runtime_error("read Parquet source failed: " + read_status.ToString()); + } + + if (table->num_rows() <= 0) { + throw std::runtime_error("Parquet source is empty: " + path); + } + + DataBatches batches; + int64_t total_rows = 0; + arrow::TableBatchReader batch_reader(*table); + batch_reader.set_chunksize(batch_max_rows); + std::shared_ptr record_batch; + while (true) { + const auto read_batch_status = batch_reader.ReadNext(&record_batch); + if (!read_batch_status.ok()) { + throw std::runtime_error("split Parquet table into batches failed: " + + read_batch_status.ToString()); + } + if (record_batch == nullptr) { + break; + } + if (record_batch->num_rows() <= 0) { + continue; + } + batches.push_back(BuildStructArrayFromRecordBatch(record_batch)); + total_rows += record_batch->num_rows(); + } + + if (batches.empty() || total_rows <= 0) { + throw std::runtime_error("Parquet source has no non-empty batches: " + path); + } + + cache.path = path; + cache.batch_max_rows = batch_max_rows; + cache.schema = table->schema(); + cache.batches = std::move(batches); + cache.total_rows = total_rows; + return cache; +#endif +} + +SourceDataCache LoadSourceData(const SourceDataSpec& source_spec) { + if (source_spec.format == "parquet") { + const auto& source = LoadParquetSource(source_spec.path); + return {source.schema, &source.batches, source.total_rows, source_spec.format, + source_spec.path}; + } + throw std::runtime_error("unknown source format: " + source_spec.format); +} + +std::shared_ptr BuildSchema(const SourceDataSpec& source_spec) { + return LoadSourceData(source_spec).schema; +} +DataBatches BuildDataBatches(const SourceDataSpec& source_spec) { + const auto source = LoadSourceData(source_spec); + if (source.batches == nullptr || source.batches->empty() || source.total_rows <= 0) { + throw std::runtime_error("source 
file has no non-empty data batches"); + } + + std::cout << "[benchmark][source] format=" << source.format << ", path=" << source.path + << ", source_rows=" << source.total_rows + << ", generated_data_batches=" << source.batches->size() << std::endl; + return *source.batches; +} + +std::unique_ptr MakeRecordBatch( + const std::shared_ptr& arr) { + ArrowArray c_array; + if (!arrow::ExportArray(*arr, &c_array).ok()) { + throw std::runtime_error("failed to export arrow array"); + } + paimon::RecordBatchBuilder builder(&c_array); + builder.SetBucket(0); + return ValueOrThrow(builder.Finish(), "build paimon record batch"); +} + +void EnsureTable(const std::string& root_path, const std::string& db_name, + const std::string& table_name, const std::map& options, + const std::shared_ptr& schema, + const std::vector& primary_keys = {}) { + auto catalog = ValueOrThrow(paimon::Catalog::Create(root_path, options), "create catalog"); + CheckStatus(catalog->CreateDatabase(db_name, options, true), "create database"); + + ArrowSchema c_schema; + if (!arrow::ExportSchema(*schema, &c_schema).ok()) { + throw std::runtime_error("failed to export table schema"); + } + CheckStatus(catalog->CreateTable(paimon::Identifier(db_name, table_name), &c_schema, + /*partition_keys=*/{}, primary_keys, options, + /*ignore_if_exists=*/false), + "create table"); +} + +void WriteAndCommit(const std::string& table_path, + const std::map& options, + const DataBatches& data_batches) { + paimon::WriteContextBuilder write_builder(table_path, "benchmark-writer"); + auto write_ctx = + ValueOrThrow(write_builder.SetOptions(options).Finish(), "create write context"); + auto writer = ValueOrThrow(paimon::FileStoreWrite::Create(std::move(write_ctx)), + "create file store writer"); + + for (const auto& data : data_batches) { + auto batch = MakeRecordBatch(data); + CheckStatus(writer->Write(std::move(batch)), "write batch"); + } + auto messages = ValueOrThrow(writer->PrepareCommit(), "prepare commit"); + + 
paimon::CommitContextBuilder commit_builder(table_path, "benchmark-writer"); + auto commit_ctx = + ValueOrThrow(commit_builder.SetOptions(options).Finish(), "create commit context"); + auto committer = + ValueOrThrow(paimon::FileStoreCommit::Create(std::move(commit_ctx)), "create committer"); + CheckStatus(committer->Commit(messages), "commit write"); +} + +struct SharedReadTableCache { + std::string key; + std::unique_ptr workspace; + std::string table_path; + int64_t total_rows = 0; +}; + +struct SharedMorReadTableCache { + std::string key; + std::unique_ptr workspace; + std::string table_path; + int64_t total_rows = 0; +}; + +std::string BuildReadTableCacheKey(const std::string& file_format, + const SourceDataSpec& source_spec) { + return file_format + "|" + source_spec.format + "|" + source_spec.path + "|" + + std::to_string(GetParquetSourceBatchMaxRows()); +} + +std::string JoinColumns(const std::vector& columns) { + std::string joined; + for (size_t i = 0; i < columns.size(); ++i) { + if (i > 0) { + joined.append(","); + } + joined.append(columns[i]); + } + return joined; +} + +const SharedMorReadTableCache& GetOrCreateSharedMorReadTable(const std::string& file_format, + const SourceDataSpec& source_spec) { + static SharedMorReadTableCache cache; + static std::mutex cache_mutex; + + const std::vector& pk_columns = GetPkColumns(); + const std::string cache_key = + BuildReadTableCacheKey(file_format, source_spec) + "|pk=" + JoinColumns(pk_columns); + std::lock_guard lock(cache_mutex); + if (cache.workspace != nullptr && cache.key == cache_key) { + std::cout << "[benchmark][mor-read] reuse_output_table_path=" << cache.table_path + << std::endl; + return cache; + } + + auto options = BuildPkOptions(file_format); + const auto source = LoadSourceData(source_spec); + auto schema = BuildSchema(source_spec); + auto data_batches = BuildDataBatches(source_spec); + + auto workspace = std::make_unique("paimon_mor_read_bench_shared"); + const std::string db_name = 
"bench_db"; + const std::string table_name = "mor_read_shared_" + std::to_string(NextTableId()); + EnsureTable(workspace->root_path, db_name, table_name, options, schema, + /*primary_keys=*/pk_columns); + const std::string table_path = RequirePath(workspace->root_path, db_name, table_name); + std::cout << "[benchmark][mor-read] create_shared_output_table_path=" << table_path + << std::endl; + WriteAndCommit(table_path, options, data_batches); + + cache.key = cache_key; + cache.workspace = std::move(workspace); + cache.table_path = table_path; + cache.total_rows = source.total_rows; + return cache; +} + +const SharedReadTableCache& GetOrCreateSharedReadTable(const std::string& file_format, + const SourceDataSpec& source_spec) { + static SharedReadTableCache cache; + static std::mutex cache_mutex; + + const std::string cache_key = BuildReadTableCacheKey(file_format, source_spec); + std::lock_guard lock(cache_mutex); + if (cache.workspace != nullptr && cache.key == cache_key) { + std::cout << "[benchmark][read] reuse_output_table_path=" << cache.table_path << std::endl; + return cache; + } + + auto options = BuildOptions(file_format); + const auto source = LoadSourceData(source_spec); + auto schema = BuildSchema(source_spec); + auto data_batches = BuildDataBatches(source_spec); + + auto workspace = std::make_unique("paimon_read_bench_shared"); + const std::string db_name = "bench_db"; + const std::string table_name = "read_shared_" + std::to_string(NextTableId()); + EnsureTable(workspace->root_path, db_name, table_name, options, schema); + const std::string table_path = RequirePath(workspace->root_path, db_name, table_name); + std::cout << "[benchmark][read] create_shared_output_table_path=" << table_path << std::endl; + WriteAndCommit(table_path, options, data_batches); + + cache.key = cache_key; + cache.workspace = std::move(workspace); + cache.table_path = table_path; + cache.total_rows = source.total_rows; + return cache; +} + +int64_t ReadRows(const std::string& 
table_path, const std::map& options, + int32_t prefetch_parallel_num) { + paimon::ScanContextBuilder scan_builder(table_path); + auto scan_ctx = ValueOrThrow(scan_builder.SetOptions(options).Finish(), "create scan context"); + auto scanner = ValueOrThrow(paimon::TableScan::Create(std::move(scan_ctx)), "create scanner"); + auto plan = ValueOrThrow(scanner->CreatePlan(), "create plan"); + + paimon::ReadContextBuilder read_builder(table_path); + constexpr int32_t kPrefetchBatchCount = 600; + read_builder.SetOptions(options) + .EnablePrefetch(true) + .SetPrefetchBatchCount(kPrefetchBatchCount) + .SetPrefetchMaxParallelNum(prefetch_parallel_num) + .EnableMultiThreadRowToBatch(false) + .SetRowToBatchThreadNumber(1); + auto read_ctx = ValueOrThrow(read_builder.Finish(), "create read context"); + auto reader = + ValueOrThrow(paimon::TableRead::Create(std::move(read_ctx)), "create table reader"); + auto batch_reader = ValueOrThrow(reader->CreateReader(plan->Splits()), "create batch reader"); + + int64_t total_rows = 0; + while (true) { + auto batch = ValueOrThrow(batch_reader->NextBatch(), "read next batch"); + if (paimon::BatchReader::IsEofBatch(batch)) { + break; + } + auto& [array, schema] = batch; + auto imported = arrow::ImportArray(array.get(), schema.get()); + if (!imported.ok()) { + throw std::runtime_error("import c data array failed: " + imported.status().ToString()); + } + total_rows += imported.ValueUnsafe()->length(); + } + + return total_rows; +} + +struct PreparedSourceData { + std::shared_ptr schema; + DataBatches data_batches; + int64_t total_rows = 0; +}; + +bool TryGetSourceSpec(::benchmark::State& state, SourceDataSpec* source_spec) { + try { + *source_spec = GetSourceDataSpec(); + return true; + } catch (const std::exception& e) { + SkipWithMessage(state, e.what()); + return false; + } +} + +bool TryPrepareSourceData(::benchmark::State& state, const SourceDataSpec& source_spec, + PreparedSourceData* prepared) { + try { + prepared->total_rows = 
LoadSourceData(source_spec).total_rows; + prepared->schema = BuildSchema(source_spec); + prepared->data_batches = BuildDataBatches(source_spec); + return true; + } catch (const std::exception& e) { + SkipWithMessage(state, e.what()); + return false; + } +} + +} // namespace + +void ParsePaimonBenchmarkCliArgs(int* argc, char** argv) { + ParsePaimonBenchmarkCliArgsImpl(argc, argv); +} + +bool HasHelpFlag(int argc, char** argv) { + return HasHelpFlagImpl(argc, argv); +} + +void PrintPaimonBenchmarkCliHelp() { + PrintPaimonBenchmarkCliHelpImpl(); +} + +void RunBMWrite(::benchmark::State& state) { + const std::string file_format = GetFileFormatFromEnv(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, "--paimon_source_parquet is required", &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip( + state, source_spec.format, SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip(state, file_format, + IsFileFormatSupported(file_format), + &SkipWithMessage)) { + return; + } + + auto options = BuildOptions(file_format); + PreparedSourceData prepared; + if (!TryPrepareSourceData(state, source_spec, &prepared)) { + return; + } + BenchmarkWorkspace workspace("paimon_write_bench"); + + for (auto _ : state) { + const std::string db_name = "bench_db"; + const std::string table_name = "write_" + std::to_string(NextTableId()); + EnsureTable(workspace.root_path, db_name, table_name, options, prepared.schema); + const std::string table_path = RequirePath(workspace.root_path, db_name, table_name); + std::cout << "[benchmark][write] output_table_path=" << table_path << std::endl; + WriteAndCommit(table_path, options, prepared.data_batches); + } + + state.SetItemsProcessed(state.iterations() * prepared.total_rows); +} + +void RunBMRead(::benchmark::State& 
state) { + const auto prefetch_parallel_num = static_cast(state.range(0)); + const std::string file_format = GetFileFormatFromEnv(); + const std::string external_table_path = GetExternalTablePath(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip(state, file_format, + IsFileFormatSupported(file_format), + &SkipWithMessage)) { + return; + } + + if (!BenchmarkHelpers::ValidatePrefetchParallelOrSkip(state, prefetch_parallel_num, + &SkipWithMessage)) { + return; + } + + auto options = BuildOptions(file_format); + + if (BenchmarkHelpers::TryRunExternalReadMode( + state, "read", external_table_path, + [&]() { return ReadRows(external_table_path, options, prefetch_parallel_num); })) { + return; + } + + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, + "--paimon_source_parquet is required when --paimon_external_table_path is not set", + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip( + state, source_spec.format, SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + } + + const SharedReadTableCache* shared_table = nullptr; + try { + shared_table = &GetOrCreateSharedReadTable(file_format, source_spec); + } catch (const std::exception& e) { + SkipWithMessage(state, e.what()); + return; + } + + const int64_t rows_read = BenchmarkHelpers::RunReadIterations( + state, [&]() { return ReadRows(shared_table->table_path, options, prefetch_parallel_num); }); + + state.SetItemsProcessed(state.iterations() * rows_read); +} + +void RunBMPkWrite(::benchmark::State& state) { + const std::string file_format = GetFileFormatFromEnv(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, "--paimon_source_parquet is required", &SkipWithMessage)) { + return; + } + if 
(!BenchmarkHelpers::ValidateSourceSupportOrSkip( + state, source_spec.format, SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip(state, file_format, + IsFileFormatSupported(file_format), + &SkipWithMessage)) { + return; + } + const std::vector& pk_columns = GetPkColumns(); + if (pk_columns.empty()) { + SkipWithMessage(state, "--paimon_pk_columns is required for BM_PK_Write"); + return; + } + + auto options = BuildPkOptions(file_format); + PreparedSourceData prepared; + if (!TryPrepareSourceData(state, source_spec, &prepared)) { + return; + } + BenchmarkWorkspace workspace("paimon_pk_write_bench"); + + for (auto _ : state) { + const std::string db_name = "bench_db"; + const std::string table_name = "pk_write_" + std::to_string(NextTableId()); + EnsureTable(workspace.root_path, db_name, table_name, options, prepared.schema, + /*primary_keys=*/pk_columns); + const std::string table_path = RequirePath(workspace.root_path, db_name, table_name); + std::cout << "[benchmark][pk-write] output_table_path=" << table_path << std::endl; + WriteAndCommit(table_path, options, prepared.data_batches); + } + + state.SetItemsProcessed(state.iterations() * prepared.total_rows); +} + +void RunBMMorRead(::benchmark::State& state) { + const auto prefetch_parallel_num = static_cast(state.range(0)); + const std::string file_format = GetFileFormatFromEnv(); + const std::string external_table_path = GetExternalTablePath(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip(state, file_format, + IsFileFormatSupported(file_format), + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidatePrefetchParallelOrSkip(state, prefetch_parallel_num, + &SkipWithMessage)) { + return; + } + + const auto external_read_options = BuildOptions(file_format); + if (BenchmarkHelpers::TryRunExternalReadMode( + state, "mor-read", 
external_table_path, + [&]() { + return ReadRows(external_table_path, external_read_options, + prefetch_parallel_num); + })) { + return; + } + + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, + "--paimon_source_parquet is required when --paimon_external_table_path is not set", + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip( + state, source_spec.format, SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + } + if (GetPkColumns().empty()) { + SkipWithMessage(state, "--paimon_pk_columns is required for BM_MOR_Read"); + return; + } + + auto options = BuildPkOptions(file_format); + const SharedMorReadTableCache* shared_table = nullptr; + try { + shared_table = &GetOrCreateSharedMorReadTable(file_format, source_spec); + } catch (const std::exception& e) { + SkipWithMessage(state, e.what()); + return; + } + + const int64_t rows_read = BenchmarkHelpers::RunReadIterations( + state, [&]() { return ReadRows(shared_table->table_path, options, prefetch_parallel_num); }); + state.SetItemsProcessed(state.iterations() * rows_read); +} + +} // namespace paimon::benchmark diff --git a/benchmark/benchmark_suite.h b/benchmark/benchmark_suite.h new file mode 100644 index 000000000..31ba6c8bb --- /dev/null +++ b/benchmark/benchmark_suite.h @@ -0,0 +1,32 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "benchmark/benchmark.h" + +namespace paimon::benchmark { + +void ParsePaimonBenchmarkCliArgs(int* argc, char** argv); +bool HasHelpFlag(int argc, char** argv); +void PrintPaimonBenchmarkCliHelp(); + +void RunBMWrite(::benchmark::State& state); +void RunBMRead(::benchmark::State& state); +void RunBMPkWrite(::benchmark::State& state); +void RunBMMorRead(::benchmark::State& state); + +} // namespace paimon::benchmark diff --git a/benchmark/cli_option_parsing.h b/benchmark/cli_option_parsing.h new file mode 100644 index 000000000..1ef47ca2b --- /dev/null +++ b/benchmark/cli_option_parsing.h @@ -0,0 +1,157 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
#pragma once

#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

namespace paimon::benchmark {

/// @brief Extracts the value of an `--option=value` style argument.
///
/// @param arg          The raw command-line argument.
/// @param option_name  The option name including leading dashes (e.g. "--foo").
/// @param value_out    Receives everything after "option_name=" on a match;
///                     left untouched otherwise.
/// @return true only when `arg` starts with `option_name` followed by '='.
///         A bare `option_name` (no '=') is NOT a match.
inline bool ConsumeCliOption(const std::string& arg, const std::string& option_name,
                             std::string* value_out) {
    const std::string prefix = option_name + "=";
    // rfind(prefix, 0) can only match at position 0, i.e. it is a "starts with" test.
    if (arg.rfind(prefix, 0) != 0) {
        return false;
    }
    *value_out = arg.substr(prefix.size());
    return true;
}

/// @brief Splits a column list into individual column names.
///
/// Commas, blanks and tabs all terminate the current name, so "id, name\tage"
/// yields {"id", "name", "age"}. Whitespace next to a comma is tolerated on
/// either side ("id , name"). Every comma must be preceded and followed by a
/// name, which rejects leading/trailing commas and empty fields ("a,,b").
///
/// @param csv          The list to split.
/// @param option_name  Option name used in error messages.
/// @return The column names in input order (never empty).
/// @throws std::runtime_error if `csv` is empty or contains an empty column.
inline std::vector<std::string> ParseCsvColumns(const std::string& csv,
                                                const std::string& option_name) {
    if (csv.empty()) {
        throw std::runtime_error("missing value for " + option_name);
    }

    std::vector<std::string> columns;
    std::string current;
    // True at the start of input and right after a comma: a name is still owed.
    bool expecting_name = true;

    // Moves the accumulated name (if any) into the result.
    const auto flush_current = [&]() {
        if (current.empty()) {
            return;
        }
        columns.push_back(std::move(current));
        current.clear();  // moved-from string: reset to a known state
        expecting_name = false;
    };

    for (const char c : csv) {
        if (c == ',') {
            flush_current();
            if (expecting_name) {
                throw std::runtime_error("invalid " + option_name + ": empty column name");
            }
            expecting_name = true;
        } else if (c == ' ' || c == '\t') {
            // Whitespace separates names but, unlike a comma, does not demand another one.
            flush_current();
        } else {
            current.push_back(c);
        }
    }

    flush_current();
    if (expecting_name) {
        // Whitespace-only input or a trailing comma.
        throw std::runtime_error("invalid " + option_name + ": empty column name");
    }
    return columns;
}

/// @brief Parses "key:value" pairs separated by ';'.
///
/// The key ends at the first ':' of a segment, so values may themselves
/// contain ':'. Empty segments (including one produced by a trailing ';'),
/// empty keys and empty values are rejected.
///
/// @param input        Text such as "k1:v1;k2:v2".
/// @param option_name  Option name used in error messages.
/// @return The parsed pairs in input order.
/// @throws std::runtime_error if `input` is empty or any segment is malformed.
inline std::vector<std::pair<std::string, std::string>> ParseDelimitedOptions(
    const std::string& input, const std::string& option_name) {
    if (input.empty()) {
        throw std::runtime_error("missing value for " + option_name);
    }

    std::vector<std::pair<std::string, std::string>> parsed;
    std::string token;
    // Iterate one position past the end so the final segment is handled like the others.
    for (size_t i = 0; i <= input.size(); ++i) {
        const bool at_end = (i == input.size());
        if (!at_end && input[i] != ';') {
            token.push_back(input[i]);
            continue;
        }

        if (token.empty()) {
            throw std::runtime_error("invalid " + option_name + ": empty option segment");
        }

        const auto sep = token.find(':');
        if (sep == std::string::npos || sep == 0 || sep + 1 >= token.size()) {
            throw std::runtime_error("invalid " + option_name + ": expected key:value");
        }

        parsed.emplace_back(token.substr(0, sep), token.substr(sep + 1));
        token.clear();
    }
    return parsed;
}

namespace detail {

/// @brief Shared front end of the `*OptionArg` parsers: obtains the raw value
///        of `option_name` from either "--opt=value" or "--opt value".
///
/// @param i        Index of `arg` in `argv`; advanced by one when the value is
///                 taken from the following argument.
/// @param raw_out  Receives the raw value on a match; untouched otherwise.
/// @return false if `arg` is not `option_name` in either form.
/// @throws std::runtime_error if the separated form has no following argument.
inline bool TakeCliOptionValue(int* i, int argc, char** argv, const std::string& arg,
                               const std::string& option_name, std::string* raw_out) {
    if (ConsumeCliOption(arg, option_name, raw_out)) {
        return true;
    }
    if (arg != option_name) {
        return false;
    }
    if (*i + 1 >= argc) {
        throw std::runtime_error("missing value for " + option_name);
    }
    *raw_out = argv[++(*i)];
    return true;
}

}  // namespace detail

/// @brief Parses a string-valued option in "--opt=value" or "--opt value" form.
///
/// @param i          Index of `arg` in `argv`; incremented when the value is
///                   consumed from the next argument.
/// @param value_out  Receives the value on a match; untouched otherwise.
/// @return true if `arg` was this option.
/// @throws std::runtime_error if the separated form is missing its value.
inline bool ParseStringOptionArg(int* i, int argc, char** argv, const std::string& arg,
                                 const std::string& option_name, std::string* value_out) {
    return detail::TakeCliOptionValue(i, argc, argv, arg, option_name, value_out);
}

/// @brief Parses a column-list option; the value is split with ParseCsvColumns.
///
/// @param i            Index of `arg` in `argv`; incremented when the value is
///                     consumed from the next argument.
/// @param columns_out  Replaced with the parsed columns on success; untouched
///                     if `arg` is not this option or parsing throws.
/// @return true if `arg` was this option.
/// @throws std::runtime_error on a missing value or an invalid column list.
inline bool ParseCsvOptionArg(int* i, int argc, char** argv, const std::string& arg,
                              const std::string& option_name,
                              std::vector<std::string>* columns_out) {
    std::string raw_value;
    if (!detail::TakeCliOptionValue(i, argc, argv, arg, option_name, &raw_value)) {
        return false;
    }
    *columns_out = ParseCsvColumns(raw_value, option_name);
    return true;
}

/// @brief Parses a repeatable "key:value;key:value" option.
///
/// Parsed pairs are appended to `options_out`, so passing the option several
/// times accumulates entries rather than overwriting earlier ones.
///
/// @param i            Index of `arg` in `argv`; incremented when the value is
///                     consumed from the next argument.
/// @param options_out  Receives the appended pairs; untouched if `arg` is not
///                     this option or parsing throws.
/// @return true if `arg` was this option.
/// @throws std::runtime_error on a missing value or a malformed segment.
inline bool ParseDelimitedRepeatableOptionArg(
    int* i, int argc, char** argv, const std::string& arg, const std::string& option_name,
    std::vector<std::pair<std::string, std::string>>* options_out) {
    std::string raw_value;
    if (!detail::TakeCliOptionValue(i, argc, argv, arg, option_name, &raw_value)) {
        return false;
    }
    // Parse fully before touching the output so a throw leaves it unchanged.
    const auto parsed_options = ParseDelimitedOptions(raw_value, option_name);
    options_out->insert(options_out->end(), parsed_options.begin(), parsed_options.end());
    return true;
}

}  // namespace paimon::benchmark
Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "cli_option_parsing.h" + +namespace paimon::testing { +namespace { + +struct ArgvHolder { + std::vector args; + std::vector argv; + + explicit ArgvHolder(std::vector in_args) : args(std::move(in_args)) { + argv.reserve(args.size()); + for (auto& arg : args) { + argv.push_back(arg.data()); + } + } + + int argc() const { + return static_cast(argv.size()); + } +}; + +TEST(CliOptionParsingTest, ConsumeCliOptionWorks) { + std::string value; + EXPECT_TRUE(paimon::benchmark::ConsumeCliOption("--foo=bar", "--foo", &value)); + EXPECT_EQ(value, "bar"); + + value.clear(); + EXPECT_FALSE(paimon::benchmark::ConsumeCliOption("--foo", "--foo", &value)); +} + +TEST(CliOptionParsingTest, ParseCsvColumnsWorks) { + const auto parsed = paimon::benchmark::ParseCsvColumns("id, name\tage", "--cols"); + ASSERT_EQ(parsed.size(), 3U); + EXPECT_EQ(parsed[0], "id"); + EXPECT_EQ(parsed[1], "name"); + EXPECT_EQ(parsed[2], "age"); +} + +TEST(CliOptionParsingTest, ParseCsvColumnsRejectsInvalidInput) { + EXPECT_THROW((void)paimon::benchmark::ParseCsvColumns("", "--cols"), std::runtime_error); + EXPECT_THROW((void)paimon::benchmark::ParseCsvColumns("id,", "--cols"), + std::runtime_error); +} + +TEST(CliOptionParsingTest, ParseDelimitedOptionsWorks) { + const auto parsed = + paimon::benchmark::ParseDelimitedOptions("k1:v1;k2:v2", 
"--paimon_option"); + ASSERT_EQ(parsed.size(), 2U); + EXPECT_EQ(parsed[0], std::make_pair(std::string("k1"), std::string("v1"))); + EXPECT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); +} + +TEST(CliOptionParsingTest, ParseDelimitedOptionsRejectsInvalidInput) { + EXPECT_THROW((void)paimon::benchmark::ParseDelimitedOptions("", "--paimon_option"), + std::runtime_error); + EXPECT_THROW((void)paimon::benchmark::ParseDelimitedOptions("k1:v1;", "--paimon_option"), + std::runtime_error); +} + +TEST(CliOptionParsingTest, ParseStringOptionArgWorksForEqualsAndSeparatedForms) { + { + ArgvHolder argv_holder({"prog", "--foo=bar"}); + int i = 1; + std::string value; + EXPECT_TRUE(paimon::benchmark::ParseStringOptionArg( + &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--foo", + &value)); + EXPECT_EQ(i, 1); + EXPECT_EQ(value, "bar"); + } + + { + ArgvHolder argv_holder({"prog", "--foo", "bar"}); + int i = 1; + std::string value; + EXPECT_TRUE(paimon::benchmark::ParseStringOptionArg( + &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--foo", + &value)); + EXPECT_EQ(i, 2); + EXPECT_EQ(value, "bar"); + } +} + +TEST(CliOptionParsingTest, ParseStringOptionArgRejectsMissingValue) { + ArgvHolder argv_holder({"prog", "--foo"}); + int i = 1; + std::string value; + EXPECT_THROW((void)paimon::benchmark::ParseStringOptionArg( + &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], + "--foo", &value), + std::runtime_error); +} + +TEST(CliOptionParsingTest, ParseCsvOptionArgAndDelimitedRepeatableOptionArgWorks) { + { + ArgvHolder argv_holder({"prog", "--cols", "id,name"}); + int i = 1; + std::vector columns; + EXPECT_TRUE(paimon::benchmark::ParseCsvOptionArg(&i, argv_holder.argc(), + argv_holder.argv.data(), + argv_holder.args[i], "--cols", + &columns)); + EXPECT_EQ(i, 2); + ASSERT_EQ(columns.size(), 2U); + EXPECT_EQ(columns[0], "id"); + EXPECT_EQ(columns[1], "name"); + } + + { + ArgvHolder argv_holder({"prog", 
"--paimon_option", "k1:v1;k2:v2"}); + int i = 1; + std::vector> options; + EXPECT_TRUE(paimon::benchmark::ParseDelimitedRepeatableOptionArg( + &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], + "--paimon_option", &options)); + EXPECT_EQ(i, 2); + ASSERT_EQ(options.size(), 2U); + EXPECT_EQ(options[0], std::make_pair(std::string("k1"), std::string("v1"))); + EXPECT_EQ(options[1], std::make_pair(std::string("k2"), std::string("v2"))); + } +} + +} // namespace +} // namespace paimon::testing diff --git a/benchmark/read_write_benchmark.cpp b/benchmark/read_write_benchmark.cpp new file mode 100644 index 000000000..9130b27c3 --- /dev/null +++ b/benchmark/read_write_benchmark.cpp @@ -0,0 +1,42 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "benchmark/benchmark.h" +#include "benchmark_suite.h" + +int main(int argc, char** argv) { + try { + paimon::benchmark::ParsePaimonBenchmarkCliArgs(&argc, argv); + } catch (const std::exception& e) { + std::cerr << "[benchmark][cli] " << e.what() << std::endl; + return 1; + } + + if (paimon::benchmark::HasHelpFlag(argc, argv)) { + paimon::benchmark::PrintPaimonBenchmarkCliHelp(); + } + + benchmark::Initialize(&argc, argv); + if (benchmark::ReportUnrecognizedArguments(argc, argv)) { + return 1; + } + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git a/cmake_modules/BuildUtils.cmake b/cmake_modules/BuildUtils.cmake index ed27ed786..632d9b7e2 100644 --- a/cmake_modules/BuildUtils.cmake +++ b/cmake_modules/BuildUtils.cmake @@ -404,3 +404,115 @@ function(add_paimon_test REL_TEST_NAME) ${PCH_ARGS} ${ARG_UNPARSED_ARGUMENTS}) endfunction() + +function(add_benchmark_case REL_BENCHMARK_NAME) + set(options ENABLED) + set(one_value_args) + set(multi_value_args SOURCES STATIC_LINK_LIBS EXTRA_LINK_LIBS EXTRA_INCLUDES LABELS PREFIX) + cmake_parse_arguments(ARG + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + if(NOT PAIMON_BUILD_BENCHMARKS AND NOT ARG_ENABLED) + return() + endif() + + get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) + + if(ARG_PREFIX) + set(BENCHMARK_NAME "${ARG_PREFIX}-${BENCHMARK_NAME}") + endif() + + if(ARG_SOURCES) + set(SOURCES ${ARG_SOURCES}) + else() + set(SOURCES "${REL_BENCHMARK_NAME}.cpp") + endif() + + string(REPLACE "_" "-" BENCHMARK_NAME ${BENCHMARK_NAME}) + set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") + message(STATUS ${BENCHMARK_NAME}) + add_executable(${BENCHMARK_NAME} ${SOURCES}) + + if(ARG_STATIC_LINK_LIBS) + target_link_libraries(${BENCHMARK_NAME} PRIVATE ${ARG_STATIC_LINK_LIBS}) + 
endif() + + if(ARG_EXTRA_LINK_LIBS) + target_link_libraries(${BENCHMARK_NAME} PRIVATE ${ARG_EXTRA_LINK_LIBS}) + endif() + + if(ARG_EXTRA_INCLUDES) + target_include_directories(${BENCHMARK_NAME} SYSTEM PUBLIC ${ARG_EXTRA_INCLUDES}) + endif() + + if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + target_compile_options(${BENCHMARK_NAME} PRIVATE -Wno-global-constructors) + endif() + target_compile_options(${BENCHMARK_NAME} PRIVATE -fno-access-control) + + add_test(${BENCHMARK_NAME} + ${BUILD_SUPPORT_DIR}/run-test.sh + ${CMAKE_BINARY_DIR} + benchmark + ${BENCHMARK_PATH}) + + foreach(TARGET ${ARG_LABELS}) + add_dependencies(${TARGET} ${BENCHMARK_NAME}) + endforeach() + + set(LABELS) + list(APPEND LABELS "benchmark") + if(ARG_LABELS) + list(APPEND LABELS ${ARG_LABELS}) + endif() + + foreach(LABEL ${ARG_LABELS}) + set(LABEL_BENCHMARK_NAME "benchmark-${LABEL}") + if(NOT TARGET ${LABEL_BENCHMARK_NAME}) + add_custom_target(${LABEL_BENCHMARK_NAME} + ctest -L "${LABEL}" --output-on-failure + USES_TERMINAL) + endif() + add_dependencies(${LABEL_BENCHMARK_NAME} ${BENCHMARK_NAME}) + endforeach() + + set_property(TEST ${BENCHMARK_NAME} + APPEND + PROPERTY LABELS ${LABELS}) +endfunction() + +function(add_paimon_benchmark REL_BENCHMARK_NAME) + set(options) + set(one_value_args PREFIX) + set(multi_value_args LABELS) + cmake_parse_arguments(ARG + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + + if(ARG_PREFIX) + set(PREFIX ${ARG_PREFIX}) + else() + set(PREFIX "paimon") + endif() + + if(ARG_LABELS) + set(LABELS ${ARG_LABELS}) + else() + set(LABELS "paimon-benchmarks") + endif() + + add_benchmark_case(${REL_BENCHMARK_NAME} + PREFIX + ${PREFIX} + LABELS + ${LABELS} + ${ARG_UNPARSED_ARGUMENTS}) +endfunction() diff --git a/cmake_modules/DefineOptions.cmake b/cmake_modules/DefineOptions.cmake index 41ff252c5..6d965cd1d 100644 --- a/cmake_modules/DefineOptions.cmake +++ b/cmake_modules/DefineOptions.cmake @@ -107,6 +107,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL 
"${CMAKE_CURRENT_SOURCE_DIR}") define_option(PAIMON_BUILD_TESTS "Build the Paimon googletest unit tests" OFF) + define_option(PAIMON_BUILD_BENCHMARKS + "Build the Paimon Google Benchmark performance benchmarks" OFF) + if(PAIMON_BUILD_SHARED) set(PAIMON_TEST_LINKAGE_DEFAULT "shared") else() @@ -241,6 +244,13 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") AUTO BUNDLED SYSTEM) + + define_option_string(Benchmark_SOURCE + "Dependency source for Google Benchmark" + "" + AUTO + BUNDLED + SYSTEM) endif() macro(validate_config) diff --git a/cmake_modules/FindbenchmarkAlt.cmake b/cmake_modules/FindbenchmarkAlt.cmake new file mode 100644 index 000000000..4c29e3a34 --- /dev/null +++ b/cmake_modules/FindbenchmarkAlt.cmake @@ -0,0 +1,60 @@ +# Copyright 2026-present Alibaba Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set(_PAIMON_BENCHMARK_ROOTS ${Benchmark_ROOT} ${benchmark_ROOT} ${PAIMON_PACKAGE_PREFIX}) +list(REMOVE_ITEM _PAIMON_BENCHMARK_ROOTS "") +if(_PAIMON_BENCHMARK_ROOTS) + set(_PAIMON_BENCHMARK_FIND_ARGS HINTS ${_PAIMON_BENCHMARK_ROOTS} NO_DEFAULT_PATH) +endif() + +find_package(benchmark CONFIG QUIET ${_PAIMON_BENCHMARK_FIND_ARGS}) + +if(NOT TARGET benchmark::benchmark) + find_path(BENCHMARK_INCLUDE_DIR + NAMES benchmark/benchmark.h ${_PAIMON_BENCHMARK_FIND_ARGS} + PATH_SUFFIXES include) + find_library(BENCHMARK_LIBRARY + NAMES benchmark ${_PAIMON_BENCHMARK_FIND_ARGS} + PATH_SUFFIXES lib lib64) + find_library(BENCHMARK_MAIN_LIBRARY + NAMES benchmark_main ${_PAIMON_BENCHMARK_FIND_ARGS} + PATH_SUFFIXES lib lib64) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(benchmarkAlt REQUIRED_VARS BENCHMARK_INCLUDE_DIR + BENCHMARK_LIBRARY) + + if(benchmarkAlt_FOUND) + if(NOT TARGET benchmark::benchmark) + add_library(benchmark::benchmark UNKNOWN IMPORTED) + set_target_properties(benchmark::benchmark + PROPERTIES IMPORTED_LOCATION "${BENCHMARK_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES + "${BENCHMARK_INCLUDE_DIR}") + endif() + + if(BENCHMARK_MAIN_LIBRARY AND NOT TARGET benchmark::benchmark_main) + add_library(benchmark::benchmark_main UNKNOWN IMPORTED) + set_target_properties(benchmark::benchmark_main + PROPERTIES IMPORTED_LOCATION "${BENCHMARK_MAIN_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES + "${BENCHMARK_INCLUDE_DIR}") + endif() + endif() +else() + set(benchmarkAlt_FOUND TRUE) +endif() + +unset(_PAIMON_BENCHMARK_ROOTS) +unset(_PAIMON_BENCHMARK_FIND_ARGS) diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index 068cf7de7..70f5954b3 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -245,6 +245,18 @@ else() endif() endif() +if(DEFINED ENV{PAIMON_BENCHMARK_URL}) + set(BENCHMARK_SOURCE_URL "$ENV{PAIMON_BENCHMARK_URL}") +else() + if(EXISTS 
"${THIRDPARTY_DIR}/${PAIMON_BENCHMARK_PKG_NAME}") + set_urls(BENCHMARK_SOURCE_URL "${THIRDPARTY_DIR}/${PAIMON_BENCHMARK_PKG_NAME}") + else() + set_urls(BENCHMARK_SOURCE_URL + "${THIRDPARTY_MIRROR_URL}https://github.com/google/benchmark/archive/refs/tags/v${PAIMON_BENCHMARK_BUILD_VERSION}.tar.gz" + ) + endif() +endif() + if(DEFINED ENV{PAIMON_TBB_URL}) set(TBB_SOURCE_URL "$ENV{PAIMON_TBB_URL}") else() @@ -500,6 +512,8 @@ function(paimon_get_dependency_compat_target DEPENDENCY_NAME OUT_VAR) set(_target libprotobuf) elseif("${DEPENDENCY_NAME}" STREQUAL "GTest") set(_target GTest::gtest) + elseif("${DEPENDENCY_NAME}" STREQUAL "Benchmark") + set(_target benchmark::benchmark) elseif("${DEPENDENCY_NAME}" STREQUAL "RE2") set(_target re2::re2) elseif("${DEPENDENCY_NAME}" STREQUAL "Snappy") @@ -586,6 +600,8 @@ macro(paimon_build_dependency DEPENDENCY_NAME) build_avro() elseif("${DEPENDENCY_NAME}" STREQUAL "GTest") build_gtest() + elseif("${DEPENDENCY_NAME}" STREQUAL "Benchmark") + build_benchmark() else() message(FATAL_ERROR "No bundled build rule for ${DEPENDENCY_NAME}") endif() @@ -1743,6 +1759,43 @@ macro(build_tbb) endmacro(build_tbb) +macro(build_benchmark) + message(STATUS "Building benchmark from source") + + set(BENCHMARK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/benchmark_ep-install") + set(BENCHMARK_INCLUDE_DIR "${BENCHMARK_PREFIX}/include") + set(BENCHMARK_STATIC_LIB "${BENCHMARK_PREFIX}/lib/libbenchmark.a") + set(BENCHMARK_MAIN_STATIC_LIB "${BENCHMARK_PREFIX}/lib/libbenchmark_main.a") + + set(BENCHMARK_CMAKE_ARGS + ${EP_COMMON_CMAKE_ARGS} + "-DCMAKE_INSTALL_PREFIX=${BENCHMARK_PREFIX}" + -DBENCHMARK_ENABLE_TESTING=OFF + -DBENCHMARK_ENABLE_GTEST_TESTS=OFF + -DBENCHMARK_DOWNLOAD_DEPENDENCIES=OFF) + + externalproject_add(benchmark_ep + URL ${BENCHMARK_SOURCE_URL} + CMAKE_ARGS ${BENCHMARK_CMAKE_ARGS} + BUILD_BYPRODUCTS "${BENCHMARK_STATIC_LIB}" "${BENCHMARK_MAIN_STATIC_LIB}") + + file(MAKE_DIRECTORY "${BENCHMARK_INCLUDE_DIR}") + + add_library(benchmark::benchmark STATIC 
IMPORTED) + set_target_properties(benchmark::benchmark + PROPERTIES IMPORTED_LOCATION "${BENCHMARK_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES + "${BENCHMARK_INCLUDE_DIR}") + add_dependencies(benchmark::benchmark benchmark_ep) + + add_library(benchmark::benchmark_main STATIC IMPORTED) + set_target_properties(benchmark::benchmark_main + PROPERTIES IMPORTED_LOCATION "${BENCHMARK_MAIN_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES + "${BENCHMARK_INCLUDE_DIR}") + add_dependencies(benchmark::benchmark_main benchmark_ep) +endmacro() + macro(build_glog) message(STATUS "Building glog from source") set(GLOG_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/glog_ep-install") @@ -1810,6 +1863,9 @@ if(PAIMON_ENABLE_ORC) resolve_dependency(Protobuf) resolve_dependency(ORC) endif() +if(PAIMON_BUILD_BENCHMARKS) + resolve_dependency(Benchmark FIND_PACKAGE_NAME benchmark) +endif() if(PAIMON_ENABLE_JINDO) build_jindosdk_c() build_jindosdk_nextarch() diff --git a/third_party/versions.txt b/third_party/versions.txt index c27599997..235e8ba4e 100644 --- a/third_party/versions.txt +++ b/third_party/versions.txt @@ -60,6 +60,9 @@ PAIMON_GTEST_BUILD_VERSION=1.11.0 PAIMON_GTEST_BUILD_SHA256_CHECKSUM=b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5 PAIMON_GTEST_PKG_NAME=gtest-${PAIMON_GTEST_BUILD_VERSION}.tar.gz +PAIMON_BENCHMARK_BUILD_VERSION=1.9.1 +PAIMON_BENCHMARK_PKG_NAME=benchmark-${PAIMON_BENCHMARK_BUILD_VERSION}.tar.gz + PAIMON_ARROW_BUILD_VERSION=17.0.0 PAIMON_ARROW_BUILD_SHA256_CHECKSUM=9d280d8042e7cf526f8c28d170d93bfab65e50f94569f6a790982a878d8d898d PAIMON_ARROW_PKG_NAME=apache-arrow-${PAIMON_ARROW_BUILD_VERSION}.tar.gz @@ -124,6 +127,7 @@ DEPENDENCIES=( "PAIMON_TBB_URL ${PAIMON_TBB_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/uxlfoundation/oneTBB/archive/refs/tags/${PAIMON_TBB_BUILD_VERSION}.tar.gz" "PAIMON_ORC_URL ${PAIMON_ORC_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/apache/orc/archive/refs/tags/${PAIMON_ORC_BUILD_VERSION}.tar.gz" "PAIMON_GTEST_URL 
${PAIMON_GTEST_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/google/googletest/archive/release-${PAIMON_GTEST_BUILD_VERSION}.tar.gz" + "PAIMON_BENCHMARK_URL ${PAIMON_BENCHMARK_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/google/benchmark/archive/refs/tags/v${PAIMON_BENCHMARK_BUILD_VERSION}.tar.gz" "PAIMON_ARROW_URL ${PAIMON_ARROW_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/apache/arrow/releases/download/apache-arrow-${PAIMON_ARROW_BUILD_VERSION}/apache-arrow-${PAIMON_ARROW_BUILD_VERSION}.tar.gz" "PAIMON_AVRO_URL ${PAIMON_AVRO_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/apache/avro/archive/${PAIMON_AVRO_BUILD_VERSION}.tar.gz" "PAIMON_FMT_URL ${PAIMON_FMT_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/fmtlib/fmt/archive/refs/tags/${PAIMON_FMT_BUILD_VERSION}.tar.gz" From 0338cc194502816af88b8d98deb0d471888d1ddb Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Tue, 26 May 2026 10:41:16 +0800 Subject: [PATCH 02/13] fix --- benchmark/cli_option_parsing.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/benchmark/cli_option_parsing.h b/benchmark/cli_option_parsing.h index 1ef47ca2b..4ef91fd8e 100644 --- a/benchmark/cli_option_parsing.h +++ b/benchmark/cli_option_parsing.h @@ -41,6 +41,7 @@ inline std::vector ParseCsvColumns(const std::string& csv, std::vector columns; std::string current; + bool last_delimiter_was_comma = false; for (char c : csv) { if (c == ',') { if (current.empty()) { @@ -48,15 +49,28 @@ inline std::vector ParseCsvColumns(const std::string& csv, } columns.push_back(current); current.clear(); + last_delimiter_was_comma = true; continue; } - if (c != ' ' && c != '\t') { - current.push_back(c); + if (c == ' ' || c == '\t') { + if (!current.empty()) { + columns.push_back(current); + current.clear(); + } + continue; } + + current.push_back(c); + last_delimiter_was_comma = false; } + if (current.empty()) { + if (!columns.empty() && !last_delimiter_was_comma) { + return columns; + } 
throw std::runtime_error("invalid " + option_name + ": empty column name"); } + columns.push_back(current); return columns; } From ecedf76b243fdad62e1f0cf505b3d5311025733d Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Tue, 26 May 2026 11:26:19 +0800 Subject: [PATCH 03/13] fix --- benchmark/benchmark_suite.cpp | 32 +++++++++++++++---------- benchmark/read_write_benchmark.cpp | 1 + cmake_modules/FindbenchmarkAlt.cmake | 2 +- cmake_modules/ThirdpartyToolchain.cmake | 9 +++++-- third_party/versions.txt | 1 + 5 files changed, 29 insertions(+), 16 deletions(-) diff --git a/benchmark/benchmark_suite.cpp b/benchmark/benchmark_suite.cpp index 15dfc335d..d32059828 100644 --- a/benchmark/benchmark_suite.cpp +++ b/benchmark/benchmark_suite.cpp @@ -14,15 +14,11 @@ * limitations under the License. */ -#include -#include -#include - #include -#include #include #include #include +#include #include #include #include @@ -154,22 +150,32 @@ void PrintPaimonBenchmarkCliHelpImpl() { struct BenchmarkWorkspace { explicit BenchmarkWorkspace(const std::string& prefix) { - root_path = "/tmp/" + prefix + "_" + std::to_string(NextId()); - EnsureDirectory(root_path); + std::error_code ec; + const std::filesystem::path temp_dir = std::filesystem::temp_directory_path(ec); + if (ec) { + throw std::runtime_error("failed to get system temp directory: " + ec.message()); + } + + const std::filesystem::path workspace_dir = + temp_dir / (prefix + "_" + std::to_string(NextId())); + root_path = workspace_dir.string(); + EnsureDirectory(workspace_dir); } ~BenchmarkWorkspace() { - const std::string cleanup_cmd = "rm -rf '" + root_path + "'"; - std::system(cleanup_cmd.c_str()); + std::error_code ec; + std::filesystem::remove_all(std::filesystem::path(root_path), ec); } std::string root_path; private: - static void EnsureDirectory(const std::string& path) { - if (mkdir(path.c_str(), 0755) != 0 && errno != EEXIST) { - throw std::runtime_error("failed to create benchmark workspace: " + path + - ", errno=" 
+ std::to_string(errno)); + static void EnsureDirectory(const std::filesystem::path& path) { + std::error_code ec; + std::filesystem::create_directories(path, ec); + if (ec) { + throw std::runtime_error("failed to create benchmark workspace: " + path.string() + + ", error=" + ec.message()); } } diff --git a/benchmark/read_write_benchmark.cpp b/benchmark/read_write_benchmark.cpp index 9130b27c3..d06fc1fc8 100644 --- a/benchmark/read_write_benchmark.cpp +++ b/benchmark/read_write_benchmark.cpp @@ -30,6 +30,7 @@ int main(int argc, char** argv) { if (paimon::benchmark::HasHelpFlag(argc, argv)) { paimon::benchmark::PrintPaimonBenchmarkCliHelp(); + return 0; } benchmark::Initialize(&argc, argv); diff --git a/cmake_modules/FindbenchmarkAlt.cmake b/cmake_modules/FindbenchmarkAlt.cmake index 4c29e3a34..20c290d7f 100644 --- a/cmake_modules/FindbenchmarkAlt.cmake +++ b/cmake_modules/FindbenchmarkAlt.cmake @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-set(_PAIMON_BENCHMARK_ROOTS ${Benchmark_ROOT} ${benchmark_ROOT} ${PAIMON_PACKAGE_PREFIX}) +set(_PAIMON_BENCHMARK_ROOTS ${benchmark_ROOT} ${BENCHMARK_ROOT} ${PAIMON_PACKAGE_PREFIX}) list(REMOVE_ITEM _PAIMON_BENCHMARK_ROOTS "") if(_PAIMON_BENCHMARK_ROOTS) set(_PAIMON_BENCHMARK_FIND_ARGS HINTS ${_PAIMON_BENCHMARK_ROOTS} NO_DEFAULT_PATH) diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index 70f5954b3..e091bbbf8 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -1764,8 +1764,12 @@ macro(build_benchmark) set(BENCHMARK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/benchmark_ep-install") set(BENCHMARK_INCLUDE_DIR "${BENCHMARK_PREFIX}/include") - set(BENCHMARK_STATIC_LIB "${BENCHMARK_PREFIX}/lib/libbenchmark.a") - set(BENCHMARK_MAIN_STATIC_LIB "${BENCHMARK_PREFIX}/lib/libbenchmark_main.a") + set(BENCHMARK_STATIC_LIB + "${BENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) + set(BENCHMARK_MAIN_STATIC_LIB + "${BENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark_main${CMAKE_STATIC_LIBRARY_SUFFIX}" + ) set(BENCHMARK_CMAKE_ARGS ${EP_COMMON_CMAKE_ARGS} @@ -1776,6 +1780,7 @@ macro(build_benchmark) externalproject_add(benchmark_ep URL ${BENCHMARK_SOURCE_URL} + URL_HASH "SHA256=${PAIMON_BENCHMARK_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${BENCHMARK_CMAKE_ARGS} BUILD_BYPRODUCTS "${BENCHMARK_STATIC_LIB}" "${BENCHMARK_MAIN_STATIC_LIB}") diff --git a/third_party/versions.txt b/third_party/versions.txt index 235e8ba4e..89624f5c0 100644 --- a/third_party/versions.txt +++ b/third_party/versions.txt @@ -61,6 +61,7 @@ PAIMON_GTEST_BUILD_SHA256_CHECKSUM=b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab2 PAIMON_GTEST_PKG_NAME=gtest-${PAIMON_GTEST_BUILD_VERSION}.tar.gz PAIMON_BENCHMARK_BUILD_VERSION=1.9.1 +PAIMON_BENCHMARK_BUILD_SHA256_CHECKSUM=32131c08ee31eeff2c8968d7e874f3cb648034377dfc32a4c377fa8796d84981 
PAIMON_BENCHMARK_PKG_NAME=benchmark-${PAIMON_BENCHMARK_BUILD_VERSION}.tar.gz PAIMON_ARROW_BUILD_VERSION=17.0.0 From a9b23169886f5d03bd38ad5c61a39045880a72d7 Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Wed, 27 May 2026 09:32:24 +0800 Subject: [PATCH 04/13] fix --- benchmark/CMakeLists.txt | 6 +- benchmark/benchmark_case_mor_read.cpp | 2 +- benchmark/benchmark_case_pk_write.cpp | 2 +- benchmark/benchmark_case_read.cpp | 2 +- benchmark/benchmark_case_write.cpp | 2 +- benchmark/benchmark_helpers.cpp | 19 +++---- benchmark/benchmark_helpers.h | 13 ++--- benchmark/benchmark_suite.cpp | 76 ++++++++++++------------- benchmark/cli_option_parsing_test.cpp | 30 +++++----- benchmark/read_write_benchmark.cpp | 2 +- cmake_modules/BuildUtils.cmake | 8 ++- cmake_modules/FindbenchmarkAlt.cmake | 2 +- cmake_modules/ThirdpartyToolchain.cmake | 3 +- 13 files changed, 82 insertions(+), 85 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 23d863ef7..b530c4a4c 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -57,13 +57,17 @@ if(PAIMON_BUILD_BENCHMARKS) Threads::Threads ${CMAKE_DL_LIBS} rt - ${PAIMON_BENCHMARK_LINK_TOOLCHAIN}) + ${PAIMON_BENCHMARK_LINK_TOOLCHAIN} + EXTRA_INCLUDES + ${CMAKE_SOURCE_DIR}) endif() if(PAIMON_BUILD_TESTS) add_paimon_test(cli_option_parsing_test SOURCES cli_option_parsing_test.cpp + EXTRA_INCLUDES + ${CMAKE_SOURCE_DIR} STATIC_LINK_LIBS ${GTEST_LINK_TOOLCHAIN}) endif() diff --git a/benchmark/benchmark_case_mor_read.cpp b/benchmark/benchmark_case_mor_read.cpp index f9b938498..d96c7e3e0 100644 --- a/benchmark/benchmark_case_mor_read.cpp +++ b/benchmark/benchmark_case_mor_read.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "benchmark_suite.h" +#include "benchmark/benchmark_suite.h" namespace { diff --git a/benchmark/benchmark_case_pk_write.cpp b/benchmark/benchmark_case_pk_write.cpp index 7c1ac5f4f..d18a71d39 100644 --- a/benchmark/benchmark_case_pk_write.cpp +++ b/benchmark/benchmark_case_pk_write.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "benchmark_suite.h" +#include "benchmark/benchmark_suite.h" namespace { diff --git a/benchmark/benchmark_case_read.cpp b/benchmark/benchmark_case_read.cpp index 3ae35efd6..71b528d18 100644 --- a/benchmark/benchmark_case_read.cpp +++ b/benchmark/benchmark_case_read.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "benchmark_suite.h" +#include "benchmark/benchmark_suite.h" namespace { diff --git a/benchmark/benchmark_case_write.cpp b/benchmark/benchmark_case_write.cpp index 43693e391..4bf34695a 100644 --- a/benchmark/benchmark_case_write.cpp +++ b/benchmark/benchmark_case_write.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "benchmark_suite.h" +#include "benchmark/benchmark_suite.h" namespace { diff --git a/benchmark/benchmark_helpers.cpp b/benchmark/benchmark_helpers.cpp index 09d601c0e..f4c409d0b 100644 --- a/benchmark/benchmark_helpers.cpp +++ b/benchmark/benchmark_helpers.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "benchmark_helpers.h" +#include "benchmark/benchmark_helpers.h" #include @@ -23,8 +23,8 @@ namespace paimon::benchmark { bool BenchmarkHelpers::ValidateFileFormatOrSkip(::benchmark::State& state, - const std::string& file_format, - bool is_supported, SkipFn skip) { + const std::string& file_format, bool is_supported, + SkipFn skip) { if (!is_supported) { skip(state, "file format is not supported in this build: " + file_format); return false; @@ -34,8 +34,7 @@ bool BenchmarkHelpers::ValidateFileFormatOrSkip(::benchmark::State& state, bool BenchmarkHelpers::ValidateSourcePresenceOrSkip(::benchmark::State& state, const std::string& source_path, - const std::string& message, - SkipFn skip) { + const std::string& message, SkipFn skip) { if (source_path.empty()) { skip(state, message); return false; @@ -48,16 +47,14 @@ bool BenchmarkHelpers::ValidateSourceSupportOrSkip(::benchmark::State& state, bool is_supported, SkipFn skip) { if (!is_supported) { skip(state, - "source data mode requires reader support in this build for format: " + - source_format); + "source data mode requires reader support in this build for format: " + source_format); return false; } return true; } bool BenchmarkHelpers::ValidatePrefetchParallelOrSkip(::benchmark::State& state, - int32_t prefetch_parallel_num, - SkipFn skip) { + int32_t prefetch_parallel_num, SkipFn skip) { if (prefetch_parallel_num <= 0) { skip(state, "prefetch_parallel must be greater than 0"); return false; @@ -82,8 +79,8 @@ bool BenchmarkHelpers::TryRunExternalReadMode(::benchmark::State& state, return false; } - std::cout << "[benchmark][" << benchmark_name - << "] external_table_path=" << external_table_path << std::endl; + std::cout << "[benchmark][" << benchmark_name << "] external_table_path=" << external_table_path + << std::endl; const int64_t rows_read = RunReadIterations(state, read_once); state.SetItemsProcessed(state.iterations() * rows_read); return true; diff --git a/benchmark/benchmark_helpers.h 
b/benchmark/benchmark_helpers.h index 30fb0c3a9..39f6eadf9 100644 --- a/benchmark/benchmark_helpers.h +++ b/benchmark/benchmark_helpers.h @@ -31,8 +31,7 @@ class BenchmarkHelpers { using ReadOnceFn = std::function; using SkipFn = void (*)(::benchmark::State&, const std::string&); - static bool ValidateFileFormatOrSkip(::benchmark::State& state, - const std::string& file_format, + static bool ValidateFileFormatOrSkip(::benchmark::State& state, const std::string& file_format, bool is_supported, SkipFn skip); static bool ValidateSourcePresenceOrSkip(::benchmark::State& state, @@ -40,17 +39,15 @@ class BenchmarkHelpers { const std::string& message, SkipFn skip); static bool ValidateSourceSupportOrSkip(::benchmark::State& state, - const std::string& source_format, - bool is_supported, SkipFn skip); + const std::string& source_format, bool is_supported, + SkipFn skip); static bool ValidatePrefetchParallelOrSkip(::benchmark::State& state, - int32_t prefetch_parallel_num, - SkipFn skip); + int32_t prefetch_parallel_num, SkipFn skip); static int64_t RunReadIterations(::benchmark::State& state, const ReadOnceFn& read_once); - static bool TryRunExternalReadMode(::benchmark::State& state, - const std::string& benchmark_name, + static bool TryRunExternalReadMode(::benchmark::State& state, const std::string& benchmark_name, const std::string& external_table_path, const ReadOnceFn& read_once); }; diff --git a/benchmark/benchmark_suite.cpp b/benchmark/benchmark_suite.cpp index d32059828..2b8d04e7d 100644 --- a/benchmark/benchmark_suite.cpp +++ b/benchmark/benchmark_suite.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "benchmark/benchmark_suite.h" + #include #include #include @@ -33,9 +35,8 @@ #include "arrow/api.h" #include "arrow/c/bridge.h" #include "arrow/io/api.h" -#include "benchmark_helpers.h" -#include "benchmark_suite.h" -#include "cli_option_parsing.h" +#include "benchmark/benchmark_helpers.h" +#include "benchmark/cli_option_parsing.h" #include "paimon/api.h" #include "paimon/catalog/catalog.h" @@ -665,14 +666,13 @@ void RunBMWrite(::benchmark::State& state) { state, source_spec.path, "--paimon_source_parquet is required", &SkipWithMessage)) { return; } - if (!BenchmarkHelpers::ValidateSourceSupportOrSkip( - state, source_spec.format, SupportsSourceDataMode(source_spec.format), - &SkipWithMessage)) { + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { return; } - if (!BenchmarkHelpers::ValidateFileFormatOrSkip(state, file_format, - IsFileFormatSupported(file_format), - &SkipWithMessage)) { + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, IsFileFormatSupported(file_format), &SkipWithMessage)) { return; } @@ -703,9 +703,8 @@ void RunBMRead(::benchmark::State& state) { if (!TryGetSourceSpec(state, &source_spec)) { return; } - if (!BenchmarkHelpers::ValidateFileFormatOrSkip(state, file_format, - IsFileFormatSupported(file_format), - &SkipWithMessage)) { + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, IsFileFormatSupported(file_format), &SkipWithMessage)) { return; } @@ -716,9 +715,9 @@ void RunBMRead(::benchmark::State& state) { auto options = BuildOptions(file_format); - if (BenchmarkHelpers::TryRunExternalReadMode( - state, "read", external_table_path, - [&]() { return ReadRows(external_table_path, options, prefetch_parallel_num); })) { + if (BenchmarkHelpers::TryRunExternalReadMode(state, "read", external_table_path, [&]() { + return ReadRows(external_table_path, options, prefetch_parallel_num); + })) { return; } 
@@ -728,9 +727,9 @@ void RunBMRead(::benchmark::State& state) { &SkipWithMessage)) { return; } - if (!BenchmarkHelpers::ValidateSourceSupportOrSkip( - state, source_spec.format, SupportsSourceDataMode(source_spec.format), - &SkipWithMessage)) { + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { return; } @@ -742,8 +741,9 @@ void RunBMRead(::benchmark::State& state) { return; } - const int64_t rows_read = BenchmarkHelpers::RunReadIterations( - state, [&]() { return ReadRows(shared_table->table_path, options, prefetch_parallel_num); }); + const int64_t rows_read = BenchmarkHelpers::RunReadIterations(state, [&]() { + return ReadRows(shared_table->table_path, options, prefetch_parallel_num); + }); state.SetItemsProcessed(state.iterations() * rows_read); } @@ -758,14 +758,13 @@ void RunBMPkWrite(::benchmark::State& state) { state, source_spec.path, "--paimon_source_parquet is required", &SkipWithMessage)) { return; } - if (!BenchmarkHelpers::ValidateSourceSupportOrSkip( - state, source_spec.format, SupportsSourceDataMode(source_spec.format), - &SkipWithMessage)) { + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { return; } - if (!BenchmarkHelpers::ValidateFileFormatOrSkip(state, file_format, - IsFileFormatSupported(file_format), - &SkipWithMessage)) { + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, IsFileFormatSupported(file_format), &SkipWithMessage)) { return; } const std::vector& pk_columns = GetPkColumns(); @@ -802,9 +801,8 @@ void RunBMMorRead(::benchmark::State& state) { if (!TryGetSourceSpec(state, &source_spec)) { return; } - if (!BenchmarkHelpers::ValidateFileFormatOrSkip(state, file_format, - IsFileFormatSupported(file_format), - &SkipWithMessage)) { + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, 
IsFileFormatSupported(file_format), &SkipWithMessage)) { return; } if (!BenchmarkHelpers::ValidatePrefetchParallelOrSkip(state, prefetch_parallel_num, @@ -813,12 +811,9 @@ void RunBMMorRead(::benchmark::State& state) { } const auto external_read_options = BuildOptions(file_format); - if (BenchmarkHelpers::TryRunExternalReadMode( - state, "mor-read", external_table_path, - [&]() { - return ReadRows(external_table_path, external_read_options, - prefetch_parallel_num); - })) { + if (BenchmarkHelpers::TryRunExternalReadMode(state, "mor-read", external_table_path, [&]() { + return ReadRows(external_table_path, external_read_options, prefetch_parallel_num); + })) { return; } @@ -828,9 +823,9 @@ void RunBMMorRead(::benchmark::State& state) { &SkipWithMessage)) { return; } - if (!BenchmarkHelpers::ValidateSourceSupportOrSkip( - state, source_spec.format, SupportsSourceDataMode(source_spec.format), - &SkipWithMessage)) { + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { return; } if (GetPkColumns().empty()) { @@ -847,8 +842,9 @@ void RunBMMorRead(::benchmark::State& state) { return; } - const int64_t rows_read = BenchmarkHelpers::RunReadIterations( - state, [&]() { return ReadRows(shared_table->table_path, options, prefetch_parallel_num); }); + const int64_t rows_read = BenchmarkHelpers::RunReadIterations(state, [&]() { + return ReadRows(shared_table->table_path, options, prefetch_parallel_num); + }); state.SetItemsProcessed(state.iterations() * rows_read); } diff --git a/benchmark/cli_option_parsing_test.cpp b/benchmark/cli_option_parsing_test.cpp index b89328ea3..98488792a 100644 --- a/benchmark/cli_option_parsing_test.cpp +++ b/benchmark/cli_option_parsing_test.cpp @@ -14,12 +14,13 @@ * limitations under the License. 
*/ +#include "benchmark/cli_option_parsing.h" + #include #include #include #include "gtest/gtest.h" -#include "cli_option_parsing.h" namespace paimon::testing { namespace { @@ -59,13 +60,11 @@ TEST(CliOptionParsingTest, ParseCsvColumnsWorks) { TEST(CliOptionParsingTest, ParseCsvColumnsRejectsInvalidInput) { EXPECT_THROW((void)paimon::benchmark::ParseCsvColumns("", "--cols"), std::runtime_error); - EXPECT_THROW((void)paimon::benchmark::ParseCsvColumns("id,", "--cols"), - std::runtime_error); + EXPECT_THROW((void)paimon::benchmark::ParseCsvColumns("id,", "--cols"), std::runtime_error); } TEST(CliOptionParsingTest, ParseDelimitedOptionsWorks) { - const auto parsed = - paimon::benchmark::ParseDelimitedOptions("k1:v1;k2:v2", "--paimon_option"); + const auto parsed = paimon::benchmark::ParseDelimitedOptions("k1:v1;k2:v2", "--paimon_option"); ASSERT_EQ(parsed.size(), 2U); EXPECT_EQ(parsed[0], std::make_pair(std::string("k1"), std::string("v1"))); EXPECT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); @@ -84,8 +83,7 @@ TEST(CliOptionParsingTest, ParseStringOptionArgWorksForEqualsAndSeparatedForms) int i = 1; std::string value; EXPECT_TRUE(paimon::benchmark::ParseStringOptionArg( - &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--foo", - &value)); + &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--foo", &value)); EXPECT_EQ(i, 1); EXPECT_EQ(value, "bar"); } @@ -95,8 +93,7 @@ TEST(CliOptionParsingTest, ParseStringOptionArgWorksForEqualsAndSeparatedForms) int i = 1; std::string value; EXPECT_TRUE(paimon::benchmark::ParseStringOptionArg( - &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--foo", - &value)); + &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--foo", &value)); EXPECT_EQ(i, 2); EXPECT_EQ(value, "bar"); } @@ -106,10 +103,10 @@ TEST(CliOptionParsingTest, ParseStringOptionArgRejectsMissingValue) { ArgvHolder argv_holder({"prog", "--foo"}); int i = 1; 
std::string value; - EXPECT_THROW((void)paimon::benchmark::ParseStringOptionArg( - &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], - "--foo", &value), - std::runtime_error); + EXPECT_THROW( + (void)paimon::benchmark::ParseStringOptionArg( + &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--foo", &value), + std::runtime_error); } TEST(CliOptionParsingTest, ParseCsvOptionArgAndDelimitedRepeatableOptionArgWorks) { @@ -119,8 +116,7 @@ TEST(CliOptionParsingTest, ParseCsvOptionArgAndDelimitedRepeatableOptionArgWorks std::vector columns; EXPECT_TRUE(paimon::benchmark::ParseCsvOptionArg(&i, argv_holder.argc(), argv_holder.argv.data(), - argv_holder.args[i], "--cols", - &columns)); + argv_holder.args[i], "--cols", &columns)); EXPECT_EQ(i, 2); ASSERT_EQ(columns.size(), 2U); EXPECT_EQ(columns[0], "id"); @@ -132,8 +128,8 @@ TEST(CliOptionParsingTest, ParseCsvOptionArgAndDelimitedRepeatableOptionArgWorks int i = 1; std::vector> options; EXPECT_TRUE(paimon::benchmark::ParseDelimitedRepeatableOptionArg( - &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], - "--paimon_option", &options)); + &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--paimon_option", + &options)); EXPECT_EQ(i, 2); ASSERT_EQ(options.size(), 2U); EXPECT_EQ(options[0], std::make_pair(std::string("k1"), std::string("v1"))); diff --git a/benchmark/read_write_benchmark.cpp b/benchmark/read_write_benchmark.cpp index d06fc1fc8..10f78a9bc 100644 --- a/benchmark/read_write_benchmark.cpp +++ b/benchmark/read_write_benchmark.cpp @@ -18,7 +18,7 @@ #include #include "benchmark/benchmark.h" -#include "benchmark_suite.h" +#include "benchmark/benchmark_suite.h" int main(int argc, char** argv) { try { diff --git a/cmake_modules/BuildUtils.cmake b/cmake_modules/BuildUtils.cmake index 632d9b7e2..d6d3b4a58 100644 --- a/cmake_modules/BuildUtils.cmake +++ b/cmake_modules/BuildUtils.cmake @@ -408,7 +408,13 @@ endfunction() 
function(add_benchmark_case REL_BENCHMARK_NAME) set(options ENABLED) set(one_value_args) - set(multi_value_args SOURCES STATIC_LINK_LIBS EXTRA_LINK_LIBS EXTRA_INCLUDES LABELS PREFIX) + set(multi_value_args + SOURCES + STATIC_LINK_LIBS + EXTRA_LINK_LIBS + EXTRA_INCLUDES + LABELS + PREFIX) cmake_parse_arguments(ARG "${options}" "${one_value_args}" diff --git a/cmake_modules/FindbenchmarkAlt.cmake b/cmake_modules/FindbenchmarkAlt.cmake index 20c290d7f..a731f3359 100644 --- a/cmake_modules/FindbenchmarkAlt.cmake +++ b/cmake_modules/FindbenchmarkAlt.cmake @@ -33,7 +33,7 @@ if(NOT TARGET benchmark::benchmark) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(benchmarkAlt REQUIRED_VARS BENCHMARK_INCLUDE_DIR - BENCHMARK_LIBRARY) + BENCHMARK_LIBRARY) if(benchmarkAlt_FOUND) if(NOT TARGET benchmark::benchmark) diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index e091bbbf8..4a1bc084a 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -1782,7 +1782,8 @@ macro(build_benchmark) URL ${BENCHMARK_SOURCE_URL} URL_HASH "SHA256=${PAIMON_BENCHMARK_BUILD_SHA256_CHECKSUM}" CMAKE_ARGS ${BENCHMARK_CMAKE_ARGS} - BUILD_BYPRODUCTS "${BENCHMARK_STATIC_LIB}" "${BENCHMARK_MAIN_STATIC_LIB}") + BUILD_BYPRODUCTS "${BENCHMARK_STATIC_LIB}" + "${BENCHMARK_MAIN_STATIC_LIB}") file(MAKE_DIRECTORY "${BENCHMARK_INCLUDE_DIR}") From 0534022404441f30b591a2dadc3a9230721d7c1c Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Thu, 28 May 2026 17:13:01 +0800 Subject: [PATCH 05/13] fix --- benchmark/read_write_benchmark.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmark/read_write_benchmark.cpp b/benchmark/read_write_benchmark.cpp index 10f78a9bc..06c7da523 100644 --- a/benchmark/read_write_benchmark.cpp +++ b/benchmark/read_write_benchmark.cpp @@ -24,7 +24,9 @@ int main(int argc, char** argv) { try { paimon::benchmark::ParsePaimonBenchmarkCliArgs(&argc, argv); } 
catch (const std::exception& e) { - std::cerr << "[benchmark][cli] " << e.what() << std::endl; + std::cerr << "paimon-read-write-benchmark: " << e.what() << std::endl; + std::cerr << "Try 'paimon-read-write-benchmark --help' for more information." + << std::endl; return 1; } From 2b393db55db526f714ac6cdb2507e1b8ca9b6374 Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Wed, 3 Jun 2026 11:12:00 +0800 Subject: [PATCH 06/13] fix --- CMakeLists.txt | 36 ++--- benchmark/CMakeLists.txt | 28 ++-- benchmark/benchmark_suite.cpp | 181 ++++++++++++++---------- benchmark/cli_option_parsing.h | 85 +++++------ benchmark/cli_option_parsing_test.cpp | 93 ++++++------ cmake_modules/DefineOptions.cmake | 2 +- cmake_modules/ThirdpartyToolchain.cmake | 6 +- docs/source/examples/benchmark.rst | 90 ++++++++++++ docs/source/examples/index.rst | 1 + 9 files changed, 326 insertions(+), 196 deletions(-) create mode 100644 docs/source/examples/benchmark.rst diff --git a/CMakeLists.txt b/CMakeLists.txt index 30d557b7e..026e584fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -355,6 +355,26 @@ endif() set(ENV{PAIMON_TEST_DATA} "${CMAKE_SOURCE_DIR}/test/test_data") +if(PAIMON_BUILD_TESTS OR PAIMON_BUILD_BENCHMARKS) + paimon_link_libraries_whole_archive(PAIMON_LOCAL_FILE_SYSTEM_STATIC_LINK_LIBS + paimon_local_file_system_static) + paimon_link_libraries_no_as_needed(PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS + paimon_local_file_system_shared) + paimon_link_libraries_whole_archive(PAIMON_BLOB_FILE_FORMAT_STATIC_LINK_LIBS + paimon_blob_file_format_static) + paimon_link_libraries_whole_archive(PAIMON_PARQUET_FILE_FORMAT_STATIC_LINK_LIBS + paimon_parquet_file_format_static) + + if(PAIMON_ENABLE_ORC) + paimon_link_libraries_whole_archive(PAIMON_ORC_FILE_FORMAT_STATIC_LINK_LIBS + paimon_orc_file_format_static) + endif() + if(PAIMON_ENABLE_AVRO) + paimon_link_libraries_whole_archive(PAIMON_AVRO_FILE_FORMAT_STATIC_LINK_LIBS + paimon_avro_file_format_static) + endif() +endif() + 
if(PAIMON_BUILD_TESTS) if(NOT PAIMON_ENABLE_ORC) message(FATAL_ERROR "PAIMON_ENABLE_ORC must be enabled if PAIMON_BUILD_TESTS is enable" @@ -389,15 +409,6 @@ if(PAIMON_BUILD_TESTS) TEST_PLUGIN_LINK_LIBS paimon_parquet_file_format_shared paimon_blob_file_format_shared) set(TEST_STATIC_LINK_LIBS ${TEST_WHOLE_ARCHIVE_LINK_LIBS} ${TEST_PLUGIN_LINK_LIBS}) - paimon_link_libraries_whole_archive(PAIMON_LOCAL_FILE_SYSTEM_STATIC_LINK_LIBS - paimon_local_file_system_static) - paimon_link_libraries_no_as_needed(PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS - paimon_local_file_system_shared) - paimon_link_libraries_whole_archive(PAIMON_BLOB_FILE_FORMAT_STATIC_LINK_LIBS - paimon_blob_file_format_static) - paimon_link_libraries_whole_archive(PAIMON_PARQUET_FILE_FORMAT_STATIC_LINK_LIBS - paimon_parquet_file_format_static) - if(PAIMON_ENABLE_LANCE) paimon_link_libraries_whole_archive(PAIMON_LANCE_FILE_FORMAT_STATIC_LINK_LIBS paimon_lance_file_format_static) @@ -406,15 +417,11 @@ if(PAIMON_BUILD_TESTS) list(APPEND TEST_STATIC_LINK_LIBS ${TEST_PLUGIN_LINK_LIBS}) endif() if(PAIMON_ENABLE_ORC) - paimon_link_libraries_whole_archive(PAIMON_ORC_FILE_FORMAT_STATIC_LINK_LIBS - paimon_orc_file_format_static) paimon_link_libraries_no_as_needed(TEST_PLUGIN_LINK_LIBS paimon_orc_file_format_shared) list(APPEND TEST_STATIC_LINK_LIBS ${TEST_PLUGIN_LINK_LIBS}) endif() if(PAIMON_ENABLE_AVRO) - paimon_link_libraries_whole_archive(PAIMON_AVRO_FILE_FORMAT_STATIC_LINK_LIBS - paimon_avro_file_format_static) paimon_link_libraries_no_as_needed(TEST_PLUGIN_LINK_LIBS paimon_avro_file_format_shared) list(APPEND TEST_STATIC_LINK_LIBS ${TEST_PLUGIN_LINK_LIBS}) @@ -453,9 +460,6 @@ if(PAIMON_BUILD_BENCHMARKS) add_dependencies(benchmark paimon-benchmarks) set(PAIMON_BENCHMARK_LINK_TOOLCHAIN benchmark::benchmark) - if(TARGET benchmark::benchmark_main) - list(APPEND PAIMON_BENCHMARK_LINK_TOOLCHAIN benchmark::benchmark_main) - endif() endif() paimon_print_dependency_resolution_summary() diff --git 
a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index b530c4a4c..3c0a04f5c 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -20,24 +20,24 @@ find_package(Threads REQUIRED) set(PAIMON_BENCHMARK_STATIC_LINK_LIBS paimon_shared - "-Wl,--whole-archive" - paimon_local_file_system_shared - "-Wl,--no-whole-archive" - "-Wl,--no-as-needed" - paimon_parquet_file_format_shared - paimon_blob_file_format_shared - "-Wl,--as-needed") + ${PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS} + ${PAIMON_PARQUET_FILE_FORMAT_STATIC_LINK_LIBS} + ${PAIMON_BLOB_FILE_FORMAT_STATIC_LINK_LIBS}) if(PAIMON_ENABLE_ORC) - list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS "-Wl,--no-as-needed") - list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS paimon_orc_file_format_shared) - list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS "-Wl,--as-needed") + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS ${PAIMON_ORC_FILE_FORMAT_STATIC_LINK_LIBS}) endif() if(PAIMON_ENABLE_AVRO) - list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS "-Wl,--no-as-needed") - list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS paimon_avro_file_format_shared) - list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS "-Wl,--as-needed") + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS ${PAIMON_AVRO_FILE_FORMAT_STATIC_LINK_LIBS}) +endif() + +set(PAIMON_BENCHMARK_PLATFORM_LINK_LIBS) +if(UNIX AND NOT APPLE) + find_library(PAIMON_BENCHMARK_RT_LIBRARY rt) + if(PAIMON_BENCHMARK_RT_LIBRARY) + list(APPEND PAIMON_BENCHMARK_PLATFORM_LINK_LIBS ${PAIMON_BENCHMARK_RT_LIBRARY}) + endif() endif() if(PAIMON_BUILD_BENCHMARKS) @@ -56,7 +56,7 @@ if(PAIMON_BUILD_BENCHMARKS) ${PAIMON_BENCHMARK_STATIC_LINK_LIBS} Threads::Threads ${CMAKE_DL_LIBS} - rt + ${PAIMON_BENCHMARK_PLATFORM_LINK_LIBS} ${PAIMON_BENCHMARK_LINK_TOOLCHAIN} EXTRA_INCLUDES ${CMAKE_SOURCE_DIR}) diff --git a/benchmark/benchmark_suite.cpp b/benchmark/benchmark_suite.cpp index 2b8d04e7d..18b023e51 100644 --- a/benchmark/benchmark_suite.cpp +++ b/benchmark/benchmark_suite.cpp @@ -19,7 +19,6 @@ #include 
#include #include -#include #include #include #include @@ -54,9 +53,11 @@ namespace { using DataBatches = std::vector>; struct BenchmarkCliOptions { - std::string source_parquet; + std::string source_data_file; std::string external_table_path; std::string file_format = "parquet"; + int64_t source_batch_max_rows = 4096; + int32_t row_to_batch_thread_number = 3; std::vector pk_columns; std::vector> extra_options; }; @@ -75,44 +76,83 @@ const BenchmarkCliOptions& GetBenchmarkCliOptions() { return MutableBenchmarkCliOptions(); } +int64_t ParsePositiveInt64(const std::string& value, const std::string& option_name) { + char* end = nullptr; + const auto parsed = std::strtoll(value.c_str(), &end, 10); + if (end == value.c_str() || *end != '\0' || parsed <= 0) { + throw std::runtime_error("invalid " + option_name + ", expected positive integer"); + } + return static_cast(parsed); +} + +int32_t ParsePositiveInt32(const std::string& value, const std::string& option_name) { + const int64_t parsed = ParsePositiveInt64(value, option_name); + if (parsed > std::numeric_limits::max()) { + throw std::runtime_error("invalid " + option_name + ", value is too large"); + } + return static_cast(parsed); +} + void ParsePaimonBenchmarkCliArgsImpl(int* argc, char** argv) { auto& options = MutableBenchmarkCliOptions(); - int write_idx = 1; - for (int i = 1; i < *argc; ++i) { - const std::string arg(argv[i]); - - if (paimon::benchmark::ParseStringOptionArg(&i, *argc, argv, arg, "--paimon_source_parquet", - &options.source_parquet)) { + auto parsed_argc = static_cast(*argc); + int32_t write_index = 1; + for (int32_t arg_index = 1; arg_index < parsed_argc; ++arg_index) { + const std::string arg(argv[arg_index]); + std::string parsed_value; + + if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, + "--paimon_source_data_file", &arg_index, + &options.source_data_file)) { + continue; + } + if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, + 
"--paimon_source_parquet", &arg_index, + &options.source_data_file)) { continue; } - if (paimon::benchmark::ParseStringOptionArg(&i, *argc, argv, arg, - "--paimon_external_table_path", + if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, + "--paimon_external_table_path", &arg_index, &options.external_table_path)) { continue; } - if (paimon::benchmark::ParseStringOptionArg(&i, *argc, argv, arg, "--paimon_file_format", - &options.file_format)) { + if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, "--paimon_file_format", + &arg_index, &options.file_format)) { continue; } - if (paimon::benchmark::ParseCsvOptionArg(&i, *argc, argv, arg, "--paimon_pk_columns", - &options.pk_columns)) { + if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, + "--paimon_source_batch_max_rows", &arg_index, + &parsed_value)) { + options.source_batch_max_rows = + ParsePositiveInt64(parsed_value, "--paimon_source_batch_max_rows"); + continue; + } + if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, + "--paimon_row_to_batch_thread_number", + &arg_index, &parsed_value)) { + options.row_to_batch_thread_number = + ParsePositiveInt32(parsed_value, "--paimon_row_to_batch_thread_number"); + continue; + } + if (paimon::benchmark::ParseCsvOptionArg(parsed_argc, argv, arg, "--paimon_pk_columns", + &arg_index, &options.pk_columns)) { continue; } if (paimon::benchmark::ParseDelimitedRepeatableOptionArg( - &i, *argc, argv, arg, "--paimon_option", &options.extra_options)) { + parsed_argc, argv, arg, "--paimon_option", &arg_index, &options.extra_options)) { continue; } - argv[write_idx++] = argv[i]; + argv[write_index++] = argv[arg_index]; } - *argc = write_idx; - argv[write_idx] = nullptr; + *argc = write_index; + argv[write_index] = nullptr; } bool HasHelpFlagImpl(int argc, char** argv) { - for (int i = 1; i < argc; ++i) { - const std::string arg(argv[i]); + for (int32_t arg_index = 1; arg_index < argc; ++arg_index) { + const std::string 
arg(argv[arg_index]); if (arg == "-h" || arg == "--help" || arg == "--help=true") { return true; } @@ -121,32 +161,36 @@ bool HasHelpFlagImpl(int argc, char** argv) { } void PrintPaimonBenchmarkCliHelpImpl() { - std::cout - << "Paimon benchmark custom options:\n" - << " --paimon_source_parquet=\n" - << " Required. External Parquet source file used to build benchmark data.\n" - << " Also supports: --paimon_source_parquet \n" - << " --paimon_external_table_path=\n" - << " Optional for BM_Read and BM_MOR_Read. If set, read directly from existing\n" - << " table path and\n" - << " skip source file loading and pre-write stage.\n" - << " Also supports: --paimon_external_table_path \n" - << " --paimon_file_format=\n" - << " Optional. Target table file format. Default: parquet.\n" - << " Also supports: --paimon_file_format \n" - << " --paimon_pk_columns=\n" - << " Required by BM_PK_Write and BM_MOR_Read.\n" - << " Also supports: --paimon_pk_columns \n" - << " --paimon_option=:;:\n" - << " Optional and repeatable. Pass through table options as-is.\n" - << " Also supports: --paimon_option :;:\n" - << " Note: use quotes in shell, e.g. \"--paimon_option k1:v1;k2:v2\".\n" - << "\n" - << "Example:\n" - << " paimon-read-write-benchmark --paimon_source_parquet /path/data.parquet \\\n" - << " --paimon_file_format parquet --paimon_pk_columns=id \\\n" - << " --paimon_option \"read.batch-size:8192;bucket:4\" --benchmark_filter=BM_Read\n" - << std::endl; + std::cout << "Paimon benchmark custom options:\n" + << " --paimon_source_data_file=\n" + << " Required. External source data file used to build benchmark data.\n" + << " Currently supports Parquet source files.\n" + << " Also supports: --paimon_source_data_file \n" + << " Deprecated alias: --paimon_source_parquet\n" + << " --paimon_external_table_path=\n" + << " Optional for BM_Read and BM_MOR_Read. 
If set, read directly from existing\n" + << " table path and skip source file loading and pre-write stage.\n" + << " Also supports: --paimon_external_table_path \n" + << " --paimon_file_format=\n" + << " Optional. Target table file format. Default: parquet.\n" + << " Also supports: --paimon_file_format \n" + << " --paimon_source_batch_max_rows=\n" + << " Optional. Max rows per source batch. Default: 4096.\n" + << " --paimon_row_to_batch_thread_number=\n" + << " Optional. Row-to-batch thread number for reads. Default: 3.\n" + << " --paimon_pk_columns=\n" + << " Required by BM_PK_Write and BM_MOR_Read.\n" + << " Also supports: --paimon_pk_columns \n" + << " --paimon_option=:;:\n" + << " Optional and repeatable. Pass through table options as-is.\n" + << " Also supports: --paimon_option :;:\n" + << " Note: use quotes in shell, e.g. \"--paimon_option k1:v1;k2:v2\".\n" + << "\n" + << "Example:\n" + << " paimon-read-write-benchmark --paimon_source_data_file /path/data.parquet \\\n" + << " --paimon_file_format parquet --paimon_pk_columns=id \\\n" + << " --paimon_option \"read.batch-size:8192\" --benchmark_filter=BM_Read\n" + << std::endl; } struct BenchmarkWorkspace { @@ -256,13 +300,13 @@ std::map BuildOptions(const std::string& file_format) std::map BuildPkOptions(const std::string& file_format) { auto options = BuildOptions(file_format); - options.emplace(paimon::Options::BUCKET, "4"); + options.emplace(paimon::Options::BUCKET, "1"); options.emplace(paimon::Options::MERGE_ENGINE, "deduplicate"); return options; } -std::string GetParquetDataSourcePath() { - return GetBenchmarkCliOptions().source_parquet; +std::string GetSourceDataFilePath() { + return GetBenchmarkCliOptions().source_data_file; } std::string GetExternalTablePath() { @@ -274,26 +318,19 @@ const std::vector& GetPkColumns() { } SourceDataSpec GetSourceDataSpec() { - const std::string parquet_source_path = GetParquetDataSourcePath(); - if (!parquet_source_path.empty()) { - return {"parquet", 
parquet_source_path}; + const std::string source_data_file_path = GetSourceDataFilePath(); + if (!source_data_file_path.empty()) { + return {"parquet", source_data_file_path}; } return {"", ""}; } -int64_t GetParquetSourceBatchMaxRows() { - const char* value = std::getenv("PAIMON_BENCHMARK_SOURCE_BATCH_MAX_ROWS"); - if (value == nullptr || std::strlen(value) == 0) { - return 4096; - } +int64_t GetSourceBatchMaxRows() { + return GetBenchmarkCliOptions().source_batch_max_rows; +} - char* end = nullptr; - const auto parsed = std::strtoll(value, &end, 10); - if (end == value || *end != '\0' || parsed <= 0) { - throw std::runtime_error( - "invalid PAIMON_BENCHMARK_SOURCE_BATCH_MAX_ROWS, expected positive integer"); - } - return static_cast(parsed); +int32_t GetRowToBatchThreadNumber() { + return GetBenchmarkCliOptions().row_to_batch_thread_number; } bool SupportsParquetSourceDataMode() { @@ -334,7 +371,7 @@ std::shared_ptr BuildStructArrayFromRecordBatch( } const ParquetSourceCache& LoadParquetSource(const std::string& path) { - const int64_t batch_max_rows = GetParquetSourceBatchMaxRows(); + const int64_t batch_max_rows = GetSourceBatchMaxRows(); static ParquetSourceCache cache; if (cache.path == path && cache.batch_max_rows == batch_max_rows) { return cache; @@ -493,7 +530,7 @@ struct SharedMorReadTableCache { std::string BuildReadTableCacheKey(const std::string& file_format, const SourceDataSpec& source_spec) { return file_format + "|" + source_spec.format + "|" + source_spec.path + "|" + - std::to_string(GetParquetSourceBatchMaxRows()); + std::to_string(GetSourceBatchMaxRows()); } std::string JoinColumns(const std::vector& columns) { @@ -589,8 +626,8 @@ int64_t ReadRows(const std::string& table_path, const std::map 1) + .SetRowToBatchThreadNumber(GetRowToBatchThreadNumber()); auto read_ctx = ValueOrThrow(read_builder.Finish(), "create read context"); auto reader = ValueOrThrow(paimon::TableRead::Create(std::move(read_ctx)), "create table reader"); @@ -663,7 +700,7 
@@ void RunBMWrite(::benchmark::State& state) { return; } if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( - state, source_spec.path, "--paimon_source_parquet is required", &SkipWithMessage)) { + state, source_spec.path, "--paimon_source_data_file is required", &SkipWithMessage)) { return; } if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, @@ -723,7 +760,7 @@ void RunBMRead(::benchmark::State& state) { if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( state, source_spec.path, - "--paimon_source_parquet is required when --paimon_external_table_path is not set", + "--paimon_source_data_file is required when --paimon_external_table_path is not set", &SkipWithMessage)) { return; } @@ -755,7 +792,7 @@ void RunBMPkWrite(::benchmark::State& state) { return; } if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( - state, source_spec.path, "--paimon_source_parquet is required", &SkipWithMessage)) { + state, source_spec.path, "--paimon_source_data_file is required", &SkipWithMessage)) { return; } if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, @@ -819,7 +856,7 @@ void RunBMMorRead(::benchmark::State& state) { if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( state, source_spec.path, - "--paimon_source_parquet is required when --paimon_external_table_path is not set", + "--paimon_source_data_file is required when --paimon_external_table_path is not set", &SkipWithMessage)) { return; } diff --git a/benchmark/cli_option_parsing.h b/benchmark/cli_option_parsing.h index 4ef91fd8e..adc72288b 100644 --- a/benchmark/cli_option_parsing.h +++ b/benchmark/cli_option_parsing.h @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -33,6 +34,15 @@ inline bool ConsumeCliOption(const std::string& arg, const std::string& option_n return true; } +inline std::string TrimAsciiWhitespace(const std::string& value) { + const auto first = value.find_first_not_of(" \t\n\r"); + if (first == std::string::npos) { + 
return ""; + } + const auto last = value.find_last_not_of(" \t\n\r"); + return value.substr(first, last - first + 1); +} + inline std::vector ParseCsvColumns(const std::string& csv, const std::string& option_name) { if (csv.empty()) { @@ -40,38 +50,20 @@ inline std::vector ParseCsvColumns(const std::string& csv, } std::vector columns; - std::string current; - bool last_delimiter_was_comma = false; - for (char c : csv) { - if (c == ',') { - if (current.empty()) { - throw std::runtime_error("invalid " + option_name + ": empty column name"); - } - columns.push_back(current); - current.clear(); - last_delimiter_was_comma = true; - continue; - } - if (c == ' ' || c == '\t') { - if (!current.empty()) { - columns.push_back(current); - current.clear(); - } + size_t segment_start = 0; + for (size_t index = 0; index <= csv.size(); ++index) { + if (index != csv.size() && csv[index] != ',') { continue; } - current.push_back(c); - last_delimiter_was_comma = false; - } - - if (current.empty()) { - if (!columns.empty() && !last_delimiter_was_comma) { - return columns; + const std::string column = + TrimAsciiWhitespace(csv.substr(segment_start, index - segment_start)); + if (column.empty()) { + throw std::runtime_error("invalid " + option_name + ": empty column name"); } - throw std::runtime_error("invalid " + option_name + ": empty column name"); + columns.push_back(column); + segment_start = index + 1; } - - columns.push_back(current); return columns; } @@ -83,10 +75,10 @@ inline std::vector> ParseDelimitedOptions( std::vector> parsed; std::string token; - for (size_t i = 0; i <= input.size(); ++i) { - const bool at_end = (i == input.size()); - if (!at_end && input[i] != ';') { - token.push_back(input[i]); + for (size_t index = 0; index <= input.size(); ++index) { + const bool at_end = (index == input.size()); + if (!at_end && input[index] != ';') { + token.push_back(input[index]); continue; } @@ -94,19 +86,20 @@ inline std::vector> ParseDelimitedOptions( throw 
std::runtime_error("invalid " + option_name + ": empty option segment"); } - const auto sep = token.find(':'); - if (sep == std::string::npos || sep == 0 || sep + 1 >= token.size()) { + const auto separator = token.find(':'); + if (separator == std::string::npos || separator == 0 || separator + 1 >= token.size()) { throw std::runtime_error("invalid " + option_name + ": expected key:value"); } - parsed.emplace_back(token.substr(0, sep), token.substr(sep + 1)); + parsed.emplace_back(token.substr(0, separator), token.substr(separator + 1)); token.clear(); } return parsed; } -inline bool ParseStringOptionArg(int* i, int argc, char** argv, const std::string& arg, - const std::string& option_name, std::string* value_out) { +inline bool ParseStringOptionArg(int32_t argc, char** argv, const std::string& arg, + const std::string& option_name, int32_t* arg_index, + std::string* value_out) { std::string parsed_value; if (ConsumeCliOption(arg, option_name, &parsed_value)) { *value_out = std::move(parsed_value); @@ -117,15 +110,15 @@ inline bool ParseStringOptionArg(int* i, int argc, char** argv, const std::strin return false; } - if (*i + 1 >= argc) { + if (*arg_index + 1 >= argc) { throw std::runtime_error("missing value for " + option_name); } - *value_out = argv[++(*i)]; + *value_out = argv[++(*arg_index)]; return true; } -inline bool ParseCsvOptionArg(int* i, int argc, char** argv, const std::string& arg, - const std::string& option_name, +inline bool ParseCsvOptionArg(int32_t argc, char** argv, const std::string& arg, + const std::string& option_name, int32_t* arg_index, std::vector* columns_out) { std::string parsed_value; if (ConsumeCliOption(arg, option_name, &parsed_value)) { @@ -137,16 +130,16 @@ inline bool ParseCsvOptionArg(int* i, int argc, char** argv, const std::string& return false; } - if (*i + 1 >= argc) { + if (*arg_index + 1 >= argc) { throw std::runtime_error("missing value for " + option_name); } - *columns_out = ParseCsvColumns(std::string(argv[++(*i)]), 
option_name); + *columns_out = ParseCsvColumns(std::string(argv[++(*arg_index)]), option_name); return true; } inline bool ParseDelimitedRepeatableOptionArg( - int* i, int argc, char** argv, const std::string& arg, const std::string& option_name, - std::vector>* options_out) { + int32_t argc, char** argv, const std::string& arg, const std::string& option_name, + int32_t* arg_index, std::vector>* options_out) { std::string parsed_value; if (ConsumeCliOption(arg, option_name, &parsed_value)) { const auto parsed_options = ParseDelimitedOptions(parsed_value, option_name); @@ -158,11 +151,11 @@ inline bool ParseDelimitedRepeatableOptionArg( return false; } - if (*i + 1 >= argc) { + if (*arg_index + 1 >= argc) { throw std::runtime_error("missing value for " + option_name); } - const std::string option_arg = argv[++(*i)]; + const std::string option_arg = argv[++(*arg_index)]; const auto parsed_options = ParseDelimitedOptions(option_arg, option_name); options_out->insert(options_out->end(), parsed_options.begin(), parsed_options.end()); return true; diff --git a/benchmark/cli_option_parsing_test.cpp b/benchmark/cli_option_parsing_test.cpp index 98488792a..9d308c6b7 100644 --- a/benchmark/cli_option_parsing_test.cpp +++ b/benchmark/cli_option_parsing_test.cpp @@ -16,6 +16,7 @@ #include "benchmark/cli_option_parsing.h" +#include #include #include #include @@ -36,104 +37,108 @@ struct ArgvHolder { } } - int argc() const { - return static_cast(argv.size()); + int32_t argc() const { + return static_cast(argv.size()); } }; TEST(CliOptionParsingTest, ConsumeCliOptionWorks) { std::string value; - EXPECT_TRUE(paimon::benchmark::ConsumeCliOption("--foo=bar", "--foo", &value)); - EXPECT_EQ(value, "bar"); + ASSERT_TRUE(paimon::benchmark::ConsumeCliOption("--foo=bar", "--foo", &value)); + ASSERT_EQ(value, "bar"); value.clear(); - EXPECT_FALSE(paimon::benchmark::ConsumeCliOption("--foo", "--foo", &value)); + ASSERT_FALSE(paimon::benchmark::ConsumeCliOption("--foo", "--foo", &value)); } 
TEST(CliOptionParsingTest, ParseCsvColumnsWorks) { - const auto parsed = paimon::benchmark::ParseCsvColumns("id, name\tage", "--cols"); + const auto parsed = paimon::benchmark::ParseCsvColumns("id, name,age", "--cols"); ASSERT_EQ(parsed.size(), 3U); - EXPECT_EQ(parsed[0], "id"); - EXPECT_EQ(parsed[1], "name"); - EXPECT_EQ(parsed[2], "age"); + ASSERT_EQ(parsed[0], "id"); + ASSERT_EQ(parsed[1], "name"); + ASSERT_EQ(parsed[2], "age"); } TEST(CliOptionParsingTest, ParseCsvColumnsRejectsInvalidInput) { - EXPECT_THROW((void)paimon::benchmark::ParseCsvColumns("", "--cols"), std::runtime_error); - EXPECT_THROW((void)paimon::benchmark::ParseCsvColumns("id,", "--cols"), std::runtime_error); + ASSERT_THROW((void)paimon::benchmark::ParseCsvColumns("", "--cols"), std::runtime_error); + ASSERT_THROW((void)paimon::benchmark::ParseCsvColumns("id,", "--cols"), std::runtime_error); + ASSERT_THROW((void)paimon::benchmark::ParseCsvColumns("id,,name", "--cols"), + std::runtime_error); } TEST(CliOptionParsingTest, ParseDelimitedOptionsWorks) { const auto parsed = paimon::benchmark::ParseDelimitedOptions("k1:v1;k2:v2", "--paimon_option"); ASSERT_EQ(parsed.size(), 2U); - EXPECT_EQ(parsed[0], std::make_pair(std::string("k1"), std::string("v1"))); - EXPECT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); + ASSERT_EQ(parsed[0], std::make_pair(std::string("k1"), std::string("v1"))); + ASSERT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); } TEST(CliOptionParsingTest, ParseDelimitedOptionsRejectsInvalidInput) { - EXPECT_THROW((void)paimon::benchmark::ParseDelimitedOptions("", "--paimon_option"), + ASSERT_THROW((void)paimon::benchmark::ParseDelimitedOptions("", "--paimon_option"), std::runtime_error); - EXPECT_THROW((void)paimon::benchmark::ParseDelimitedOptions("k1:v1;", "--paimon_option"), + ASSERT_THROW((void)paimon::benchmark::ParseDelimitedOptions("k1:v1;", "--paimon_option"), std::runtime_error); } TEST(CliOptionParsingTest, 
ParseStringOptionArgWorksForEqualsAndSeparatedForms) { { ArgvHolder argv_holder({"prog", "--foo=bar"}); - int i = 1; + int32_t arg_index = 1; std::string value; - EXPECT_TRUE(paimon::benchmark::ParseStringOptionArg( - &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--foo", &value)); - EXPECT_EQ(i, 1); - EXPECT_EQ(value, "bar"); + ASSERT_TRUE(paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--foo", + &arg_index, &value)); + ASSERT_EQ(arg_index, 1); + ASSERT_EQ(value, "bar"); } { ArgvHolder argv_holder({"prog", "--foo", "bar"}); - int i = 1; + int32_t arg_index = 1; std::string value; - EXPECT_TRUE(paimon::benchmark::ParseStringOptionArg( - &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--foo", &value)); - EXPECT_EQ(i, 2); - EXPECT_EQ(value, "bar"); + ASSERT_TRUE(paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--foo", + &arg_index, &value)); + ASSERT_EQ(arg_index, 2); + ASSERT_EQ(value, "bar"); } } TEST(CliOptionParsingTest, ParseStringOptionArgRejectsMissingValue) { ArgvHolder argv_holder({"prog", "--foo"}); - int i = 1; + int32_t arg_index = 1; std::string value; - EXPECT_THROW( - (void)paimon::benchmark::ParseStringOptionArg( - &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--foo", &value), - std::runtime_error); + ASSERT_THROW((void)paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], + "--foo", &arg_index, &value), + std::runtime_error); } TEST(CliOptionParsingTest, ParseCsvOptionArgAndDelimitedRepeatableOptionArgWorks) { { ArgvHolder argv_holder({"prog", "--cols", "id,name"}); - int i = 1; + int32_t arg_index = 1; std::vector columns; - EXPECT_TRUE(paimon::benchmark::ParseCsvOptionArg(&i, argv_holder.argc(), - argv_holder.argv.data(), - argv_holder.args[i], "--cols", &columns)); - 
EXPECT_EQ(i, 2); + ASSERT_TRUE(paimon::benchmark::ParseCsvOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--cols", + &arg_index, &columns)); + ASSERT_EQ(arg_index, 2); ASSERT_EQ(columns.size(), 2U); - EXPECT_EQ(columns[0], "id"); - EXPECT_EQ(columns[1], "name"); + ASSERT_EQ(columns[0], "id"); + ASSERT_EQ(columns[1], "name"); } { ArgvHolder argv_holder({"prog", "--paimon_option", "k1:v1;k2:v2"}); - int i = 1; + int32_t arg_index = 1; std::vector> options; - EXPECT_TRUE(paimon::benchmark::ParseDelimitedRepeatableOptionArg( - &i, argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[i], "--paimon_option", - &options)); - EXPECT_EQ(i, 2); + ASSERT_TRUE(paimon::benchmark::ParseDelimitedRepeatableOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], + "--paimon_option", &arg_index, &options)); + ASSERT_EQ(arg_index, 2); ASSERT_EQ(options.size(), 2U); - EXPECT_EQ(options[0], std::make_pair(std::string("k1"), std::string("v1"))); - EXPECT_EQ(options[1], std::make_pair(std::string("k2"), std::string("v2"))); + ASSERT_EQ(options[0], std::make_pair(std::string("k1"), std::string("v1"))); + ASSERT_EQ(options[1], std::make_pair(std::string("k2"), std::string("v2"))); } } diff --git a/cmake_modules/DefineOptions.cmake b/cmake_modules/DefineOptions.cmake index 6d965cd1d..cf7f964be 100644 --- a/cmake_modules/DefineOptions.cmake +++ b/cmake_modules/DefineOptions.cmake @@ -245,7 +245,7 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") BUNDLED SYSTEM) - define_option_string(Benchmark_SOURCE + define_option_string(benchmark_SOURCE "Dependency source for Google Benchmark" "" AUTO diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index 4a1bc084a..271011a0d 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -512,7 +512,7 @@ function(paimon_get_dependency_compat_target DEPENDENCY_NAME OUT_VAR) 
set(_target libprotobuf) elseif("${DEPENDENCY_NAME}" STREQUAL "GTest") set(_target GTest::gtest) - elseif("${DEPENDENCY_NAME}" STREQUAL "Benchmark") + elseif("${DEPENDENCY_NAME}" STREQUAL "benchmark") set(_target benchmark::benchmark) elseif("${DEPENDENCY_NAME}" STREQUAL "RE2") set(_target re2::re2) @@ -600,7 +600,7 @@ macro(paimon_build_dependency DEPENDENCY_NAME) build_avro() elseif("${DEPENDENCY_NAME}" STREQUAL "GTest") build_gtest() - elseif("${DEPENDENCY_NAME}" STREQUAL "Benchmark") + elseif("${DEPENDENCY_NAME}" STREQUAL "benchmark") build_benchmark() else() message(FATAL_ERROR "No bundled build rule for ${DEPENDENCY_NAME}") @@ -1870,7 +1870,7 @@ if(PAIMON_ENABLE_ORC) resolve_dependency(ORC) endif() if(PAIMON_BUILD_BENCHMARKS) - resolve_dependency(Benchmark FIND_PACKAGE_NAME benchmark) + resolve_dependency(benchmark) endif() if(PAIMON_ENABLE_JINDO) build_jindosdk_c() diff --git a/docs/source/examples/benchmark.rst b/docs/source/examples/benchmark.rst new file mode 100644 index 000000000..1b5f5896b --- /dev/null +++ b/docs/source/examples/benchmark.rst @@ -0,0 +1,90 @@ +.. Copyright 2026-present Alibaba Inc. + +.. Licensed under the Apache License, Version 2.0 (the "License"); +.. you may not use this file except in compliance with the License. +.. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, software +.. distributed under the License is distributed on an "AS IS" BASIS, +.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +.. See the License for the specific language governing permissions and +.. limitations under the License. + +================ +Benchmark Usage +================ + +Paimon C++ provides Google Benchmark based cases for append-table write/read and +primary-key table write/MOR read paths. Benchmarks are disabled by default. + +Build +===== + +Enable benchmarks when configuring CMake:: + + cmake -S . 
-B build -DPAIMON_BUILD_BENCHMARKS=ON + cmake --build build --target paimon-read-write-benchmark + +Run all benchmark cases through CTest:: + + cmake --build build --target benchmark + +Custom Options +============== + +``paimon-read-write-benchmark`` accepts Google Benchmark options plus the Paimon +specific options below: + +``--paimon_source_data_file=`` + Source data file used to build benchmark data. Currently Parquet source files + are supported. ``--paimon_source_parquet`` is kept as a deprecated alias. + +``--paimon_external_table_path=`` + Read directly from an existing table path for ``BM_Read`` and ``BM_MOR_Read``. + When set, the source loading and pre-write stage are skipped. + +``--paimon_file_format=`` + Target table file format. The default value is ``parquet``. + +``--paimon_source_batch_max_rows=`` + Max rows per source batch. The default value is ``4096``. + +``--paimon_row_to_batch_thread_number=`` + Row-to-batch thread number for reads. The default value is ``3``. + +``--paimon_pk_columns=`` + Primary key columns for ``BM_PK_Write`` and ``BM_MOR_Read``. + +``--paimon_option=:;:`` + Repeatable table options passed through to Paimon. 
+ +Examples +======== + +Append table write:: + + paimon-read-write-benchmark \ + --paimon_source_data_file /path/data.parquet \ + --benchmark_filter=BM_Write + +Append table read with four prefetch workers:: + + paimon-read-write-benchmark \ + --paimon_source_data_file /path/data.parquet \ + --benchmark_filter=BM_Read/4 + +Primary-key table write:: + + paimon-read-write-benchmark \ + --paimon_source_data_file /path/data.parquet \ + --paimon_pk_columns=id \ + --benchmark_filter=BM_PK_Write + +MOR read from an existing table:: + + paimon-read-write-benchmark \ + --paimon_external_table_path /path/table \ + --paimon_pk_columns=id \ + --benchmark_filter=BM_MOR_Read/4 diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index 0ba3b318a..b3ec8c536 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -20,3 +20,4 @@ Examples write_commit_scan_read clean + benchmark From a1e03ec20da66f062d18212fd8830719024f90cc Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Wed, 3 Jun 2026 14:55:30 +0800 Subject: [PATCH 07/13] fix --- CMakeLists.txt | 5 +- benchmark/CMakeLists.txt | 1 + benchmark/benchmark_suite.cpp | 296 ++++++++++-------------- benchmark/benchmark_suite.h | 4 +- benchmark/cli_option_parsing.h | 22 +- benchmark/cli_option_parsing_test.cpp | 18 +- benchmark/read_write_benchmark.cpp | 6 +- docs/source/examples/benchmark.rst | 8 +- src/paimon/testing/utils/CMakeLists.txt | 5 +- 9 files changed, 166 insertions(+), 199 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 026e584fd..0e0548c52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -356,6 +356,9 @@ endif() set(ENV{PAIMON_TEST_DATA} "${CMAKE_SOURCE_DIR}/test/test_data") if(PAIMON_BUILD_TESTS OR PAIMON_BUILD_BENCHMARKS) + resolve_dependency(GTest) + include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) + paimon_link_libraries_whole_archive(PAIMON_LOCAL_FILE_SYSTEM_STATIC_LINK_LIBS paimon_local_file_system_static) 
paimon_link_libraries_no_as_needed(PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS @@ -386,7 +389,6 @@ if(PAIMON_BUILD_TESTS) endif() # Adding unit tests part of the "paimon" portion of the test suite add_custom_target(paimon-tests) - resolve_dependency(GTest) add_custom_target(unittest ctest @@ -396,7 +398,6 @@ if(PAIMON_BUILD_TESTS) --output-on-failure) add_dependencies(unittest paimon-tests) - include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) include_directories("${CMAKE_SOURCE_DIR}/test/") paimon_link_libraries_whole_archive( diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 3c0a04f5c..0a836b149 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -54,6 +54,7 @@ if(PAIMON_BUILD_BENCHMARKS) arrow parquet ${PAIMON_BENCHMARK_STATIC_LINK_LIBS} + test_utils_static Threads::Threads ${CMAKE_DL_LIBS} ${PAIMON_BENCHMARK_PLATFORM_LINK_LIBS} diff --git a/benchmark/benchmark_suite.cpp b/benchmark/benchmark_suite.cpp index 18b023e51..8c924ba3e 100644 --- a/benchmark/benchmark_suite.cpp +++ b/benchmark/benchmark_suite.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -38,9 +37,11 @@ #include "benchmark/cli_option_parsing.h" #include "paimon/api.h" #include "paimon/catalog/catalog.h" +#include "paimon/testing/utils/testharness.h" #if __has_include("parquet/arrow/reader.h") #include "parquet/arrow/reader.h" +#include "parquet/file_reader.h" #define PAIMON_BENCHMARK_HAS_PARQUET_READER 1 #else #define PAIMON_BENCHMARK_HAS_PARQUET_READER 0 @@ -50,8 +51,6 @@ namespace paimon::benchmark { namespace { -using DataBatches = std::vector>; - struct BenchmarkCliOptions { std::string source_data_file; std::string external_table_path; @@ -93,9 +92,9 @@ int32_t ParsePositiveInt32(const std::string& value, const std::string& option_n return static_cast(parsed); } -void ParsePaimonBenchmarkCliArgsImpl(int* argc, char** argv) { +void ParsePaimonBenchmarkCliArgsImpl(int32_t* argc, char** argv) { auto& options = 
MutableBenchmarkCliOptions(); - auto parsed_argc = static_cast(*argc); + const int32_t parsed_argc = *argc; int32_t write_index = 1; for (int32_t arg_index = 1; arg_index < parsed_argc; ++arg_index) { const std::string arg(argv[arg_index]); @@ -106,11 +105,6 @@ void ParsePaimonBenchmarkCliArgsImpl(int* argc, char** argv) { &options.source_data_file)) { continue; } - if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, - "--paimon_source_parquet", &arg_index, - &options.source_data_file)) { - continue; - } if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, "--paimon_external_table_path", &arg_index, &options.external_table_path)) { @@ -134,8 +128,8 @@ void ParsePaimonBenchmarkCliArgsImpl(int* argc, char** argv) { ParsePositiveInt32(parsed_value, "--paimon_row_to_batch_thread_number"); continue; } - if (paimon::benchmark::ParseCsvOptionArg(parsed_argc, argv, arg, "--paimon_pk_columns", - &arg_index, &options.pk_columns)) { + if (paimon::benchmark::ParseCommaSeparatedOptionArg( + parsed_argc, argv, arg, "--paimon_pk_columns", &arg_index, &options.pk_columns)) { continue; } if (paimon::benchmark::ParseDelimitedRepeatableOptionArg( @@ -150,7 +144,7 @@ void ParsePaimonBenchmarkCliArgsImpl(int* argc, char** argv) { argv[write_index] = nullptr; } -bool HasHelpFlagImpl(int argc, char** argv) { +bool HasHelpFlagImpl(int32_t argc, char** argv) { for (int32_t arg_index = 1; arg_index < argc; ++arg_index) { const std::string arg(argv[arg_index]); if (arg == "-h" || arg == "--help" || arg == "--help=true") { @@ -166,7 +160,6 @@ void PrintPaimonBenchmarkCliHelpImpl() { << " Required. External source data file used to build benchmark data.\n" << " Currently supports Parquet source files.\n" << " Also supports: --paimon_source_data_file \n" - << " Deprecated alias: --paimon_source_parquet\n" << " --paimon_external_table_path=\n" << " Optional for BM_Read and BM_MOR_Read. 
If set, read directly from existing\n" << " table path and skip source file loading and pre-write stage.\n" @@ -193,42 +186,13 @@ void PrintPaimonBenchmarkCliHelpImpl() { << std::endl; } -struct BenchmarkWorkspace { - explicit BenchmarkWorkspace(const std::string& prefix) { - std::error_code ec; - const std::filesystem::path temp_dir = std::filesystem::temp_directory_path(ec); - if (ec) { - throw std::runtime_error("failed to get system temp directory: " + ec.message()); - } - - const std::filesystem::path workspace_dir = - temp_dir / (prefix + "_" + std::to_string(NextId())); - root_path = workspace_dir.string(); - EnsureDirectory(workspace_dir); - } - - ~BenchmarkWorkspace() { - std::error_code ec; - std::filesystem::remove_all(std::filesystem::path(root_path), ec); - } - - std::string root_path; - - private: - static void EnsureDirectory(const std::filesystem::path& path) { - std::error_code ec; - std::filesystem::create_directories(path, ec); - if (ec) { - throw std::runtime_error("failed to create benchmark workspace: " + path.string() + - ", error=" + ec.message()); - } - } - - static uint64_t NextId() { - static std::atomic id{0}; - return ++id; +std::unique_ptr CreateBenchmarkWorkspace() { + auto workspace = paimon::test::UniqueTestDirectory::Create(); + if (workspace == nullptr) { + throw std::runtime_error("failed to create benchmark workspace"); } -}; + return workspace; +} uint64_t NextTableId() { static std::atomic id{0}; @@ -260,7 +224,7 @@ void SkipWithMessage(::benchmark::State& state, const std::string& message) { state.SkipWithError(owned_message.c_str()); } -std::string GetFileFormatFromEnv() { +std::string GetConfiguredFileFormat() { std::string file_format = GetBenchmarkCliOptions().file_format; for (const auto& kv : GetBenchmarkCliOptions().extra_options) { if (kv.first == paimon::Options::FILE_FORMAT) { @@ -300,8 +264,8 @@ std::map BuildOptions(const std::string& file_format) std::map BuildPkOptions(const std::string& file_format) { auto 
options = BuildOptions(file_format); - options.emplace(paimon::Options::BUCKET, "1"); - options.emplace(paimon::Options::MERGE_ENGINE, "deduplicate"); + options[paimon::Options::BUCKET] = "1"; + options[paimon::Options::MERGE_ENGINE] = "deduplicate"; return options; } @@ -348,39 +312,15 @@ bool SupportsSourceDataMode(const std::string& source_format) { return false; } -struct ParquetSourceCache { - std::string path; - int64_t batch_max_rows = 0; - std::shared_ptr schema; - DataBatches batches; - int64_t total_rows = 0; -}; - -struct SourceDataCache { +struct SourceDataMetadata { std::shared_ptr schema; - const DataBatches* batches = nullptr; int64_t total_rows = 0; std::string format; std::string path; }; -std::shared_ptr BuildStructArrayFromRecordBatch( - const std::shared_ptr& batch) { - return std::make_shared(arrow::struct_(batch->schema()->fields()), - batch->num_rows(), batch->columns()); -} - -const ParquetSourceCache& LoadParquetSource(const std::string& path) { - const int64_t batch_max_rows = GetSourceBatchMaxRows(); - static ParquetSourceCache cache; - if (cache.path == path && cache.batch_max_rows == batch_max_rows) { - return cache; - } - -#if !PAIMON_BENCHMARK_HAS_PARQUET_READER - throw std::runtime_error( - "Parquet source data mode requires parquet::arrow reader support in this build"); -#else +#if PAIMON_BENCHMARK_HAS_PARQUET_READER +std::unique_ptr OpenParquetSourceReader(const std::string& path) { auto input = arrow::io::ReadableFile::Open(path); if (!input.ok()) { throw std::runtime_error("open Parquet source failed: " + path + ", " + @@ -393,73 +333,52 @@ const ParquetSourceCache& LoadParquetSource(const std::string& path) { if (!open_status.ok()) { throw std::runtime_error("create Parquet reader failed: " + open_status.ToString()); } + parquet_reader->set_batch_size(GetSourceBatchMaxRows()); + return parquet_reader; +} +#endif - std::shared_ptr table; - const auto read_status = parquet_reader->ReadTable(&table); - if (!read_status.ok()) { - 
throw std::runtime_error("read Parquet source failed: " + read_status.ToString()); - } - - if (table->num_rows() <= 0) { - throw std::runtime_error("Parquet source is empty: " + path); +const SourceDataMetadata& LoadParquetSourceMetadata(const std::string& path) { +#if !PAIMON_BENCHMARK_HAS_PARQUET_READER + throw std::runtime_error( + "Parquet source data mode requires parquet::arrow reader support in this build"); +#else + static SourceDataMetadata cache; + if (cache.path == path && cache.format == "parquet") { + return cache; } - DataBatches batches; - int64_t total_rows = 0; - arrow::TableBatchReader batch_reader(*table); - batch_reader.set_chunksize(batch_max_rows); - std::shared_ptr record_batch; - while (true) { - const auto read_batch_status = batch_reader.ReadNext(&record_batch); - if (!read_batch_status.ok()) { - throw std::runtime_error("split Parquet table into batches failed: " + - read_batch_status.ToString()); - } - if (record_batch == nullptr) { - break; - } - if (record_batch->num_rows() <= 0) { - continue; - } - batches.push_back(BuildStructArrayFromRecordBatch(record_batch)); - total_rows += record_batch->num_rows(); + auto parquet_reader = OpenParquetSourceReader(path); + std::shared_ptr schema; + const auto schema_status = parquet_reader->GetSchema(&schema); + if (!schema_status.ok()) { + throw std::runtime_error("read Parquet source schema failed: " + schema_status.ToString()); } - if (batches.empty() || total_rows <= 0) { - throw std::runtime_error("Parquet source has no non-empty batches: " + path); + const int64_t total_rows = parquet_reader->parquet_reader()->metadata()->num_rows(); + if (total_rows <= 0) { + throw std::runtime_error("Parquet source is empty: " + path); } - cache.path = path; - cache.batch_max_rows = batch_max_rows; - cache.schema = table->schema(); - cache.batches = std::move(batches); + cache.schema = std::move(schema); cache.total_rows = total_rows; + cache.format = "parquet"; + cache.path = path; return cache; #endif } 
-SourceDataCache LoadSourceData(const SourceDataSpec& source_spec) { +SourceDataMetadata LoadSourceDataMetadata(const SourceDataSpec& source_spec) { if (source_spec.format == "parquet") { - const auto& source = LoadParquetSource(source_spec.path); - return {source.schema, &source.batches, source.total_rows, source_spec.format, - source_spec.path}; + return LoadParquetSourceMetadata(source_spec.path); } throw std::runtime_error("unknown source format: " + source_spec.format); } -std::shared_ptr BuildSchema(const SourceDataSpec& source_spec) { - return LoadSourceData(source_spec).schema; -} -DataBatches BuildDataBatches(const SourceDataSpec& source_spec) { - const auto source = LoadSourceData(source_spec); - if (source.batches == nullptr || source.batches->empty() || source.total_rows <= 0) { - throw std::runtime_error("source file has no non-empty data batches"); - } - - std::cout << "[benchmark][source] format=" << source.format << ", path=" << source.path - << ", source_rows=" << source.total_rows - << ", generated_data_batches=" << source.batches->size() << std::endl; - return *source.batches; +std::shared_ptr BuildStructArrayFromRecordBatch( + const std::shared_ptr& batch) { + return std::make_shared(arrow::struct_(batch->schema()->fields()), + batch->num_rows(), batch->columns()); } std::unique_ptr MakeRecordBatch( @@ -490,19 +409,59 @@ void EnsureTable(const std::string& root_path, const std::string& db_name, "create table"); } +void WriteSourceDataToWriter(paimon::FileStoreWrite* writer, const SourceDataSpec& source_spec) { + if (source_spec.format != "parquet") { + throw std::runtime_error("unknown source format: " + source_spec.format); + } + +#if !PAIMON_BENCHMARK_HAS_PARQUET_READER + throw std::runtime_error( + "Parquet source data mode requires parquet::arrow reader support in this build"); +#else + auto parquet_reader = OpenParquetSourceReader(source_spec.path); + std::unique_ptr batch_reader; + const auto reader_status = 
parquet_reader->GetRecordBatchReader(&batch_reader); + if (!reader_status.ok()) { + throw std::runtime_error("create Parquet source batch reader failed: " + + reader_status.ToString()); + } + + int64_t written_rows = 0; + while (true) { + std::shared_ptr record_batch; + const auto read_status = batch_reader->ReadNext(&record_batch); + if (!read_status.ok()) { + throw std::runtime_error("read Parquet source batch failed: " + read_status.ToString()); + } + if (record_batch == nullptr) { + break; + } + if (record_batch->num_rows() <= 0) { + continue; + } + + auto struct_array = BuildStructArrayFromRecordBatch(record_batch); + auto batch = MakeRecordBatch(struct_array); + CheckStatus(writer->Write(std::move(batch)), "write batch"); + written_rows += record_batch->num_rows(); + } + + if (written_rows <= 0) { + throw std::runtime_error("source file has no non-empty data batches: " + source_spec.path); + } +#endif +} + void WriteAndCommit(const std::string& table_path, const std::map& options, - const DataBatches& data_batches) { + const SourceDataSpec& source_spec) { paimon::WriteContextBuilder write_builder(table_path, "benchmark-writer"); auto write_ctx = ValueOrThrow(write_builder.SetOptions(options).Finish(), "create write context"); auto writer = ValueOrThrow(paimon::FileStoreWrite::Create(std::move(write_ctx)), "create file store writer"); - for (const auto& data : data_batches) { - auto batch = MakeRecordBatch(data); - CheckStatus(writer->Write(std::move(batch)), "write batch"); - } + WriteSourceDataToWriter(writer.get(), source_spec); auto messages = ValueOrThrow(writer->PrepareCommit(), "prepare commit"); paimon::CommitContextBuilder commit_builder(table_path, "benchmark-writer"); @@ -515,14 +474,14 @@ void WriteAndCommit(const std::string& table_path, struct SharedReadTableCache { std::string key; - std::unique_ptr workspace; + std::unique_ptr workspace; std::string table_path; int64_t total_rows = 0; }; struct SharedMorReadTableCache { std::string key; - 
std::unique_ptr workspace; + std::unique_ptr workspace; std::string table_path; int64_t total_rows = 0; }; @@ -560,24 +519,22 @@ const SharedMorReadTableCache& GetOrCreateSharedMorReadTable(const std::string& } auto options = BuildPkOptions(file_format); - const auto source = LoadSourceData(source_spec); - auto schema = BuildSchema(source_spec); - auto data_batches = BuildDataBatches(source_spec); + const auto source_metadata = LoadSourceDataMetadata(source_spec); - auto workspace = std::make_unique("paimon_mor_read_bench_shared"); + auto workspace = CreateBenchmarkWorkspace(); const std::string db_name = "bench_db"; const std::string table_name = "mor_read_shared_" + std::to_string(NextTableId()); - EnsureTable(workspace->root_path, db_name, table_name, options, schema, + EnsureTable(workspace->Str(), db_name, table_name, options, source_metadata.schema, /*primary_keys=*/pk_columns); - const std::string table_path = RequirePath(workspace->root_path, db_name, table_name); + const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); std::cout << "[benchmark][mor-read] create_shared_output_table_path=" << table_path << std::endl; - WriteAndCommit(table_path, options, data_batches); + WriteAndCommit(table_path, options, source_spec); cache.key = cache_key; cache.workspace = std::move(workspace); cache.table_path = table_path; - cache.total_rows = source.total_rows; + cache.total_rows = source_metadata.total_rows; return cache; } @@ -594,22 +551,20 @@ const SharedReadTableCache& GetOrCreateSharedReadTable(const std::string& file_f } auto options = BuildOptions(file_format); - const auto source = LoadSourceData(source_spec); - auto schema = BuildSchema(source_spec); - auto data_batches = BuildDataBatches(source_spec); + const auto source_metadata = LoadSourceDataMetadata(source_spec); - auto workspace = std::make_unique("paimon_read_bench_shared"); + auto workspace = CreateBenchmarkWorkspace(); const std::string db_name = "bench_db"; const 
std::string table_name = "read_shared_" + std::to_string(NextTableId()); - EnsureTable(workspace->root_path, db_name, table_name, options, schema); - const std::string table_path = RequirePath(workspace->root_path, db_name, table_name); + EnsureTable(workspace->Str(), db_name, table_name, options, source_metadata.schema); + const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); std::cout << "[benchmark][read] create_shared_output_table_path=" << table_path << std::endl; - WriteAndCommit(table_path, options, data_batches); + WriteAndCommit(table_path, options, source_spec); cache.key = cache_key; cache.workspace = std::move(workspace); cache.table_path = table_path; - cache.total_rows = source.total_rows; + cache.total_rows = source_metadata.total_rows; return cache; } @@ -652,7 +607,6 @@ int64_t ReadRows(const std::string& table_path, const std::map schema; - DataBatches data_batches; int64_t total_rows = 0; }; @@ -669,9 +623,9 @@ bool TryGetSourceSpec(::benchmark::State& state, SourceDataSpec* source_spec) { bool TryPrepareSourceData(::benchmark::State& state, const SourceDataSpec& source_spec, PreparedSourceData* prepared) { try { - prepared->total_rows = LoadSourceData(source_spec).total_rows; - prepared->schema = BuildSchema(source_spec); - prepared->data_batches = BuildDataBatches(source_spec); + const auto source_metadata = LoadSourceDataMetadata(source_spec); + prepared->schema = source_metadata.schema; + prepared->total_rows = source_metadata.total_rows; return true; } catch (const std::exception& e) { SkipWithMessage(state, e.what()); @@ -682,10 +636,12 @@ bool TryPrepareSourceData(::benchmark::State& state, const SourceDataSpec& sourc } // namespace void ParsePaimonBenchmarkCliArgs(int* argc, char** argv) { - ParsePaimonBenchmarkCliArgsImpl(argc, argv); + auto parsed_argc = static_cast(*argc); + ParsePaimonBenchmarkCliArgsImpl(&parsed_argc, argv); + *argc = static_cast(parsed_argc); } -bool HasHelpFlag(int argc, char** argv) { 
+bool HasHelpFlag(int32_t argc, char** argv) { return HasHelpFlagImpl(argc, argv); } @@ -694,7 +650,7 @@ void PrintPaimonBenchmarkCliHelp() { } void RunBMWrite(::benchmark::State& state) { - const std::string file_format = GetFileFormatFromEnv(); + const std::string file_format = GetConfiguredFileFormat(); SourceDataSpec source_spec; if (!TryGetSourceSpec(state, &source_spec)) { return; @@ -718,15 +674,15 @@ void RunBMWrite(::benchmark::State& state) { if (!TryPrepareSourceData(state, source_spec, &prepared)) { return; } - BenchmarkWorkspace workspace("paimon_write_bench"); + auto workspace = CreateBenchmarkWorkspace(); for (auto _ : state) { const std::string db_name = "bench_db"; const std::string table_name = "write_" + std::to_string(NextTableId()); - EnsureTable(workspace.root_path, db_name, table_name, options, prepared.schema); - const std::string table_path = RequirePath(workspace.root_path, db_name, table_name); + EnsureTable(workspace->Str(), db_name, table_name, options, prepared.schema); + const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); std::cout << "[benchmark][write] output_table_path=" << table_path << std::endl; - WriteAndCommit(table_path, options, prepared.data_batches); + WriteAndCommit(table_path, options, source_spec); } state.SetItemsProcessed(state.iterations() * prepared.total_rows); @@ -734,7 +690,7 @@ void RunBMWrite(::benchmark::State& state) { void RunBMRead(::benchmark::State& state) { const auto prefetch_parallel_num = static_cast(state.range(0)); - const std::string file_format = GetFileFormatFromEnv(); + const std::string file_format = GetConfiguredFileFormat(); const std::string external_table_path = GetExternalTablePath(); SourceDataSpec source_spec; if (!TryGetSourceSpec(state, &source_spec)) { @@ -786,7 +742,7 @@ void RunBMRead(::benchmark::State& state) { } void RunBMPkWrite(::benchmark::State& state) { - const std::string file_format = GetFileFormatFromEnv(); + const std::string file_format = 
GetConfiguredFileFormat(); SourceDataSpec source_spec; if (!TryGetSourceSpec(state, &source_spec)) { return; @@ -815,16 +771,16 @@ void RunBMPkWrite(::benchmark::State& state) { if (!TryPrepareSourceData(state, source_spec, &prepared)) { return; } - BenchmarkWorkspace workspace("paimon_pk_write_bench"); + auto workspace = CreateBenchmarkWorkspace(); for (auto _ : state) { const std::string db_name = "bench_db"; const std::string table_name = "pk_write_" + std::to_string(NextTableId()); - EnsureTable(workspace.root_path, db_name, table_name, options, prepared.schema, + EnsureTable(workspace->Str(), db_name, table_name, options, prepared.schema, /*primary_keys=*/pk_columns); - const std::string table_path = RequirePath(workspace.root_path, db_name, table_name); + const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); std::cout << "[benchmark][pk-write] output_table_path=" << table_path << std::endl; - WriteAndCommit(table_path, options, prepared.data_batches); + WriteAndCommit(table_path, options, source_spec); } state.SetItemsProcessed(state.iterations() * prepared.total_rows); @@ -832,7 +788,7 @@ void RunBMPkWrite(::benchmark::State& state) { void RunBMMorRead(::benchmark::State& state) { const auto prefetch_parallel_num = static_cast(state.range(0)); - const std::string file_format = GetFileFormatFromEnv(); + const std::string file_format = GetConfiguredFileFormat(); const std::string external_table_path = GetExternalTablePath(); SourceDataSpec source_spec; if (!TryGetSourceSpec(state, &source_spec)) { diff --git a/benchmark/benchmark_suite.h b/benchmark/benchmark_suite.h index 31ba6c8bb..43c07af7f 100644 --- a/benchmark/benchmark_suite.h +++ b/benchmark/benchmark_suite.h @@ -16,12 +16,14 @@ #pragma once +#include + #include "benchmark/benchmark.h" namespace paimon::benchmark { void ParsePaimonBenchmarkCliArgs(int* argc, char** argv); -bool HasHelpFlag(int argc, char** argv); +bool HasHelpFlag(int32_t argc, char** argv); void 
PrintPaimonBenchmarkCliHelp(); void RunBMWrite(::benchmark::State& state); diff --git a/benchmark/cli_option_parsing.h b/benchmark/cli_option_parsing.h index adc72288b..448ce5ec5 100644 --- a/benchmark/cli_option_parsing.h +++ b/benchmark/cli_option_parsing.h @@ -43,21 +43,21 @@ inline std::string TrimAsciiWhitespace(const std::string& value) { return value.substr(first, last - first + 1); } -inline std::vector ParseCsvColumns(const std::string& csv, - const std::string& option_name) { - if (csv.empty()) { +inline std::vector ParseCommaSeparatedColumns(const std::string& input, + const std::string& option_name) { + if (input.empty()) { throw std::runtime_error("missing value for " + option_name); } std::vector columns; size_t segment_start = 0; - for (size_t index = 0; index <= csv.size(); ++index) { - if (index != csv.size() && csv[index] != ',') { + for (size_t index = 0; index <= input.size(); ++index) { + if (index != input.size() && input[index] != ',') { continue; } const std::string column = - TrimAsciiWhitespace(csv.substr(segment_start, index - segment_start)); + TrimAsciiWhitespace(input.substr(segment_start, index - segment_start)); if (column.empty()) { throw std::runtime_error("invalid " + option_name + ": empty column name"); } @@ -117,12 +117,12 @@ inline bool ParseStringOptionArg(int32_t argc, char** argv, const std::string& a return true; } -inline bool ParseCsvOptionArg(int32_t argc, char** argv, const std::string& arg, - const std::string& option_name, int32_t* arg_index, - std::vector* columns_out) { +inline bool ParseCommaSeparatedOptionArg(int32_t argc, char** argv, const std::string& arg, + const std::string& option_name, int32_t* arg_index, + std::vector* columns_out) { std::string parsed_value; if (ConsumeCliOption(arg, option_name, &parsed_value)) { - *columns_out = ParseCsvColumns(parsed_value, option_name); + *columns_out = ParseCommaSeparatedColumns(parsed_value, option_name); return true; } @@ -133,7 +133,7 @@ inline bool 
ParseCsvOptionArg(int32_t argc, char** argv, const std::string& arg, if (*arg_index + 1 >= argc) { throw std::runtime_error("missing value for " + option_name); } - *columns_out = ParseCsvColumns(std::string(argv[++(*arg_index)]), option_name); + *columns_out = ParseCommaSeparatedColumns(std::string(argv[++(*arg_index)]), option_name); return true; } diff --git a/benchmark/cli_option_parsing_test.cpp b/benchmark/cli_option_parsing_test.cpp index 9d308c6b7..7c33cdf39 100644 --- a/benchmark/cli_option_parsing_test.cpp +++ b/benchmark/cli_option_parsing_test.cpp @@ -51,18 +51,20 @@ TEST(CliOptionParsingTest, ConsumeCliOptionWorks) { ASSERT_FALSE(paimon::benchmark::ConsumeCliOption("--foo", "--foo", &value)); } -TEST(CliOptionParsingTest, ParseCsvColumnsWorks) { - const auto parsed = paimon::benchmark::ParseCsvColumns("id, name,age", "--cols"); +TEST(CliOptionParsingTest, ParseCommaSeparatedColumnsWorks) { + const auto parsed = paimon::benchmark::ParseCommaSeparatedColumns("id, name,age", "--cols"); ASSERT_EQ(parsed.size(), 3U); ASSERT_EQ(parsed[0], "id"); ASSERT_EQ(parsed[1], "name"); ASSERT_EQ(parsed[2], "age"); } -TEST(CliOptionParsingTest, ParseCsvColumnsRejectsInvalidInput) { - ASSERT_THROW((void)paimon::benchmark::ParseCsvColumns("", "--cols"), std::runtime_error); - ASSERT_THROW((void)paimon::benchmark::ParseCsvColumns("id,", "--cols"), std::runtime_error); - ASSERT_THROW((void)paimon::benchmark::ParseCsvColumns("id,,name", "--cols"), +TEST(CliOptionParsingTest, ParseCommaSeparatedColumnsRejectsInvalidInput) { + ASSERT_THROW((void)paimon::benchmark::ParseCommaSeparatedColumns("", "--cols"), + std::runtime_error); + ASSERT_THROW((void)paimon::benchmark::ParseCommaSeparatedColumns("id,", "--cols"), + std::runtime_error); + ASSERT_THROW((void)paimon::benchmark::ParseCommaSeparatedColumns("id,,name", "--cols"), std::runtime_error); } @@ -114,12 +116,12 @@ TEST(CliOptionParsingTest, ParseStringOptionArgRejectsMissingValue) { std::runtime_error); } 
-TEST(CliOptionParsingTest, ParseCsvOptionArgAndDelimitedRepeatableOptionArgWorks) { +TEST(CliOptionParsingTest, ParseCommaSeparatedOptionArgAndDelimitedRepeatableOptionArgWorks) { { ArgvHolder argv_holder({"prog", "--cols", "id,name"}); int32_t arg_index = 1; std::vector columns; - ASSERT_TRUE(paimon::benchmark::ParseCsvOptionArg( + ASSERT_TRUE(paimon::benchmark::ParseCommaSeparatedOptionArg( argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--cols", &arg_index, &columns)); ASSERT_EQ(arg_index, 2); diff --git a/benchmark/read_write_benchmark.cpp b/benchmark/read_write_benchmark.cpp index 06c7da523..0710cdc9b 100644 --- a/benchmark/read_write_benchmark.cpp +++ b/benchmark/read_write_benchmark.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -25,12 +26,11 @@ int main(int argc, char** argv) { paimon::benchmark::ParsePaimonBenchmarkCliArgs(&argc, argv); } catch (const std::exception& e) { std::cerr << "paimon-read-write-benchmark: " << e.what() << std::endl; - std::cerr << "Try 'paimon-read-write-benchmark --help' for more information." - << std::endl; + std::cerr << "Try 'paimon-read-write-benchmark --help' for more information." << std::endl; return 1; } - if (paimon::benchmark::HasHelpFlag(argc, argv)) { + if (paimon::benchmark::HasHelpFlag(static_cast(argc), argv)) { paimon::benchmark::PrintPaimonBenchmarkCliHelp(); return 0; } diff --git a/docs/source/examples/benchmark.rst b/docs/source/examples/benchmark.rst index 1b5f5896b..7ae4e272f 100644 --- a/docs/source/examples/benchmark.rst +++ b/docs/source/examples/benchmark.rst @@ -39,7 +39,7 @@ specific options below: ``--paimon_source_data_file=`` Source data file used to build benchmark data. Currently Parquet source files - are supported. ``--paimon_source_parquet`` is kept as a deprecated alias. + are supported. ``--paimon_external_table_path=`` Read directly from an existing table path for ``BM_Read`` and ``BM_MOR_Read``. 
@@ -55,10 +55,12 @@ specific options below: Row-to-batch thread number for reads. The default value is ``3``. ``--paimon_pk_columns=`` - Primary key columns for ``BM_PK_Write`` and ``BM_MOR_Read``. + Primary key columns for ``BM_PK_Write`` and ``BM_MOR_Read``. These cases + explicitly use ``bucket=1`` because benchmark batches are written to bucket 0. ``--paimon_option=:;:`` - Repeatable table options passed through to Paimon. + Repeatable table options passed through to Paimon. For ``BM_PK_Write`` and + ``BM_MOR_Read``, ``bucket`` is forced to ``1``. Examples ======== diff --git a/src/paimon/testing/utils/CMakeLists.txt b/src/paimon/testing/utils/CMakeLists.txt index b8fe36296..ee18db9ea 100644 --- a/src/paimon/testing/utils/CMakeLists.txt +++ b/src/paimon/testing/utils/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -if(PAIMON_BUILD_TESTS) +if(PAIMON_BUILD_TESTS OR PAIMON_BUILD_BENCHMARKS) set(PAIMON_TEST_UTILS testharness.cpp data_generator.cpp) @@ -25,6 +25,9 @@ if(PAIMON_BUILD_TESTS) STATIC_LINK_LIBS paimon_static ${GTEST_LINK_TOOLCHAIN}) +endif() + +if(PAIMON_BUILD_TESTS) add_paimon_test(test_utils_test SOURCES From 01bdd3d0439a9652020b9ac576f93e5efc1b630d Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Wed, 3 Jun 2026 16:17:52 +0800 Subject: [PATCH 08/13] fix --- CMakeLists.txt | 4 +++- benchmark/benchmark_suite.cpp | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e0548c52..154a38d97 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -494,4 +494,6 @@ add_subdirectory(src/paimon/global_index/lucene) add_subdirectory(src/paimon/testing/mock) add_subdirectory(src/paimon/testing/utils) add_subdirectory(test/inte) -add_subdirectory(benchmark) +if(PAIMON_BUILD_BENCHMARKS) + add_subdirectory(benchmark) +endif() diff --git a/benchmark/benchmark_suite.cpp b/benchmark/benchmark_suite.cpp index 8c924ba3e..2b0a13139 
100644 --- a/benchmark/benchmark_suite.cpp +++ b/benchmark/benchmark_suite.cpp @@ -219,9 +219,7 @@ void CheckStatus(const paimon::Status& status, const std::string& context) { } void SkipWithMessage(::benchmark::State& state, const std::string& message) { - static thread_local std::string owned_message; - owned_message = message; - state.SkipWithError(owned_message.c_str()); + state.SkipWithError(message); } std::string GetConfiguredFileFormat() { @@ -344,6 +342,8 @@ const SourceDataMetadata& LoadParquetSourceMetadata(const std::string& path) { "Parquet source data mode requires parquet::arrow reader support in this build"); #else static SourceDataMetadata cache; + static std::mutex cache_mutex; + std::lock_guard lock(cache_mutex); if (cache.path == path && cache.format == "parquet") { return cache; } From db96b3e1196a179387ca903946b6ab7411798e49 Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Thu, 4 Jun 2026 10:42:08 +0800 Subject: [PATCH 09/13] fix --- benchmark/CMakeLists.txt | 1 + benchmark/benchmark_helpers.cpp | 16 +- benchmark/benchmark_helpers.h | 13 +- benchmark/benchmark_suite.cpp | 398 ++++++++++++++++---------- benchmark/benchmark_suite.h | 3 +- benchmark/cli_option_parsing.h | 54 ++-- benchmark/cli_option_parsing_test.cpp | 67 +++-- benchmark/read_write_benchmark.cpp | 8 +- 8 files changed, 333 insertions(+), 227 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 0a836b149..573fbaf94 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -70,5 +70,6 @@ if(PAIMON_BUILD_TESTS) EXTRA_INCLUDES ${CMAKE_SOURCE_DIR} STATIC_LINK_LIBS + paimon_shared ${GTEST_LINK_TOOLCHAIN}) endif() diff --git a/benchmark/benchmark_helpers.cpp b/benchmark/benchmark_helpers.cpp index f4c409d0b..7d6b3ae60 100644 --- a/benchmark/benchmark_helpers.cpp +++ b/benchmark/benchmark_helpers.cpp @@ -62,26 +62,26 @@ bool BenchmarkHelpers::ValidatePrefetchParallelOrSkip(::benchmark::State& state, return true; } -int64_t 
BenchmarkHelpers::RunReadIterations(::benchmark::State& state, - const ReadOnceFn& read_once) { +Result BenchmarkHelpers::RunReadIterations(::benchmark::State& state, + const ReadOnceFn& read_once) { int64_t rows_read = 0; for (auto _ : state) { - rows_read = read_once(); + PAIMON_ASSIGN_OR_RAISE(rows_read, read_once()); } return rows_read; } -bool BenchmarkHelpers::TryRunExternalReadMode(::benchmark::State& state, - const std::string& benchmark_name, - const std::string& external_table_path, - const ReadOnceFn& read_once) { +Result BenchmarkHelpers::TryRunExternalReadMode(::benchmark::State& state, + const std::string& benchmark_name, + const std::string& external_table_path, + const ReadOnceFn& read_once) { if (external_table_path.empty()) { return false; } std::cout << "[benchmark][" << benchmark_name << "] external_table_path=" << external_table_path << std::endl; - const int64_t rows_read = RunReadIterations(state, read_once); + PAIMON_ASSIGN_OR_RAISE(const int64_t rows_read, RunReadIterations(state, read_once)); state.SetItemsProcessed(state.iterations() * rows_read); return true; } diff --git a/benchmark/benchmark_helpers.h b/benchmark/benchmark_helpers.h index 39f6eadf9..ed59e602f 100644 --- a/benchmark/benchmark_helpers.h +++ b/benchmark/benchmark_helpers.h @@ -20,6 +20,8 @@ #include #include +#include "paimon/result.h" + namespace benchmark { class State; } @@ -28,7 +30,7 @@ namespace paimon::benchmark { class BenchmarkHelpers { public: - using ReadOnceFn = std::function; + using ReadOnceFn = std::function()>; using SkipFn = void (*)(::benchmark::State&, const std::string&); static bool ValidateFileFormatOrSkip(::benchmark::State& state, const std::string& file_format, @@ -45,11 +47,12 @@ class BenchmarkHelpers { static bool ValidatePrefetchParallelOrSkip(::benchmark::State& state, int32_t prefetch_parallel_num, SkipFn skip); - static int64_t RunReadIterations(::benchmark::State& state, const ReadOnceFn& read_once); + static Result 
RunReadIterations(::benchmark::State& state, const ReadOnceFn& read_once); - static bool TryRunExternalReadMode(::benchmark::State& state, const std::string& benchmark_name, - const std::string& external_table_path, - const ReadOnceFn& read_once); + static Result TryRunExternalReadMode(::benchmark::State& state, + const std::string& benchmark_name, + const std::string& external_table_path, + const ReadOnceFn& read_once); }; } // namespace paimon::benchmark diff --git a/benchmark/benchmark_suite.cpp b/benchmark/benchmark_suite.cpp index 2b0a13139..e452fa741 100644 --- a/benchmark/benchmark_suite.cpp +++ b/benchmark/benchmark_suite.cpp @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -75,24 +74,24 @@ const BenchmarkCliOptions& GetBenchmarkCliOptions() { return MutableBenchmarkCliOptions(); } -int64_t ParsePositiveInt64(const std::string& value, const std::string& option_name) { +Result ParsePositiveInt64(const std::string& value, const std::string& option_name) { char* end = nullptr; const auto parsed = std::strtoll(value.c_str(), &end, 10); if (end == value.c_str() || *end != '\0' || parsed <= 0) { - throw std::runtime_error("invalid " + option_name + ", expected positive integer"); + return Status::Invalid("invalid ", option_name, ", expected positive integer"); } return static_cast(parsed); } -int32_t ParsePositiveInt32(const std::string& value, const std::string& option_name) { - const int64_t parsed = ParsePositiveInt64(value, option_name); +Result ParsePositiveInt32(const std::string& value, const std::string& option_name) { + PAIMON_ASSIGN_OR_RAISE(const int64_t parsed, ParsePositiveInt64(value, option_name)); if (parsed > std::numeric_limits::max()) { - throw std::runtime_error("invalid " + option_name + ", value is too large"); + return Status::Invalid("invalid ", option_name, ", value is too large"); } return static_cast(parsed); } -void ParsePaimonBenchmarkCliArgsImpl(int32_t* argc, char** argv) { +Status 
ParsePaimonBenchmarkCliArgsImpl(int32_t* argc, char** argv) { auto& options = MutableBenchmarkCliOptions(); const int32_t parsed_argc = *argc; int32_t write_index = 1; @@ -100,40 +99,59 @@ void ParsePaimonBenchmarkCliArgsImpl(int32_t* argc, char** argv) { const std::string arg(argv[arg_index]); std::string parsed_value; - if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, - "--paimon_source_data_file", &arg_index, - &options.source_data_file)) { + PAIMON_ASSIGN_OR_RAISE(bool is_parsed, + paimon::benchmark::ParseStringOptionArg( + parsed_argc, argv, arg, "--paimon_source_data_file", + &arg_index, &options.source_data_file)); + if (is_parsed) { continue; } - if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, - "--paimon_external_table_path", &arg_index, - &options.external_table_path)) { + PAIMON_ASSIGN_OR_RAISE(is_parsed, + paimon::benchmark::ParseStringOptionArg( + parsed_argc, argv, arg, "--paimon_external_table_path", + &arg_index, &options.external_table_path)); + if (is_parsed) { continue; } - if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, "--paimon_file_format", - &arg_index, &options.file_format)) { + PAIMON_ASSIGN_OR_RAISE(is_parsed, + paimon::benchmark::ParseStringOptionArg( + parsed_argc, argv, arg, "--paimon_file_format", &arg_index, + &options.file_format)); + if (is_parsed) { continue; } - if (paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, - "--paimon_source_batch_max_rows", &arg_index, - &parsed_value)) { - options.source_batch_max_rows = - ParsePositiveInt64(parsed_value, "--paimon_source_batch_max_rows"); + PAIMON_ASSIGN_OR_RAISE(is_parsed, + paimon::benchmark::ParseStringOptionArg( + parsed_argc, argv, arg, "--paimon_source_batch_max_rows", + &arg_index, &parsed_value)); + if (is_parsed) { + PAIMON_ASSIGN_OR_RAISE( + options.source_batch_max_rows, + ParsePositiveInt64(parsed_value, "--paimon_source_batch_max_rows")); continue; } - if 
(paimon::benchmark::ParseStringOptionArg(parsed_argc, argv, arg, - "--paimon_row_to_batch_thread_number", - &arg_index, &parsed_value)) { - options.row_to_batch_thread_number = - ParsePositiveInt32(parsed_value, "--paimon_row_to_batch_thread_number"); + PAIMON_ASSIGN_OR_RAISE(is_parsed, + paimon::benchmark::ParseStringOptionArg( + parsed_argc, argv, arg, "--paimon_row_to_batch_thread_number", + &arg_index, &parsed_value)); + if (is_parsed) { + PAIMON_ASSIGN_OR_RAISE( + options.row_to_batch_thread_number, + ParsePositiveInt32(parsed_value, "--paimon_row_to_batch_thread_number")); continue; } - if (paimon::benchmark::ParseCommaSeparatedOptionArg( - parsed_argc, argv, arg, "--paimon_pk_columns", &arg_index, &options.pk_columns)) { + PAIMON_ASSIGN_OR_RAISE(is_parsed, + paimon::benchmark::ParseCommaSeparatedOptionArg( + parsed_argc, argv, arg, "--paimon_pk_columns", &arg_index, + &options.pk_columns)); + if (is_parsed) { continue; } - if (paimon::benchmark::ParseDelimitedRepeatableOptionArg( - parsed_argc, argv, arg, "--paimon_option", &arg_index, &options.extra_options)) { + PAIMON_ASSIGN_OR_RAISE(is_parsed, + paimon::benchmark::ParseDelimitedRepeatableOptionArg( + parsed_argc, argv, arg, "--paimon_option", &arg_index, + &options.extra_options)); + if (is_parsed) { continue; } @@ -142,6 +160,7 @@ void ParsePaimonBenchmarkCliArgsImpl(int32_t* argc, char** argv) { *argc = write_index; argv[write_index] = nullptr; + return Status::OK(); } bool HasHelpFlagImpl(int32_t argc, char** argv) { @@ -186,10 +205,10 @@ void PrintPaimonBenchmarkCliHelpImpl() { << std::endl; } -std::unique_ptr CreateBenchmarkWorkspace() { +Result> CreateBenchmarkWorkspace() { auto workspace = paimon::test::UniqueTestDirectory::Create(); if (workspace == nullptr) { - throw std::runtime_error("failed to create benchmark workspace"); + return Status::Invalid("failed to create benchmark workspace"); } return workspace; } @@ -205,17 +224,18 @@ std::string RequirePath(const std::string& root_path, const 
std::string& db_name } template -T ValueOrThrow(paimon::Result&& result, const std::string& context) { +Result AddContext(paimon::Result&& result, const std::string& context) { if (!result.ok()) { - throw std::runtime_error(context + ": " + result.status().ToString()); + return Status::Invalid(context, ": ", result.status().ToString()); } return std::move(result).value(); } -void CheckStatus(const paimon::Status& status, const std::string& context) { +Status AddContext(const paimon::Status& status, const std::string& context) { if (!status.ok()) { - throw std::runtime_error(context + ": " + status.ToString()); + return Status::Invalid(context, ": ", status.ToString()); } + return Status::OK(); } void SkipWithMessage(::benchmark::State& state, const std::string& message) { @@ -318,27 +338,28 @@ struct SourceDataMetadata { }; #if PAIMON_BENCHMARK_HAS_PARQUET_READER -std::unique_ptr OpenParquetSourceReader(const std::string& path) { +Result> OpenParquetSourceReader( + const std::string& path) { auto input = arrow::io::ReadableFile::Open(path); if (!input.ok()) { - throw std::runtime_error("open Parquet source failed: " + path + ", " + - input.status().ToString()); + return Status::Invalid("open Parquet source failed: ", path, ", ", + input.status().ToString()); } std::unique_ptr parquet_reader; const auto open_status = parquet::arrow::OpenFile( input.ValueUnsafe(), arrow::default_memory_pool(), &parquet_reader); if (!open_status.ok()) { - throw std::runtime_error("create Parquet reader failed: " + open_status.ToString()); + return Status::Invalid("create Parquet reader failed: ", open_status.ToString()); } parquet_reader->set_batch_size(GetSourceBatchMaxRows()); return parquet_reader; } #endif -const SourceDataMetadata& LoadParquetSourceMetadata(const std::string& path) { +Result LoadParquetSourceMetadata(const std::string& path) { #if !PAIMON_BENCHMARK_HAS_PARQUET_READER - throw std::runtime_error( + return Status::Invalid( "Parquet source data mode requires 
parquet::arrow reader support in this build"); #else static SourceDataMetadata cache; @@ -348,16 +369,16 @@ const SourceDataMetadata& LoadParquetSourceMetadata(const std::string& path) { return cache; } - auto parquet_reader = OpenParquetSourceReader(path); + PAIMON_ASSIGN_OR_RAISE(auto parquet_reader, OpenParquetSourceReader(path)); std::shared_ptr schema; const auto schema_status = parquet_reader->GetSchema(&schema); if (!schema_status.ok()) { - throw std::runtime_error("read Parquet source schema failed: " + schema_status.ToString()); + return Status::Invalid("read Parquet source schema failed: ", schema_status.ToString()); } const int64_t total_rows = parquet_reader->parquet_reader()->metadata()->num_rows(); if (total_rows <= 0) { - throw std::runtime_error("Parquet source is empty: " + path); + return Status::Invalid("Parquet source is empty: ", path); } cache.schema = std::move(schema); @@ -368,11 +389,11 @@ const SourceDataMetadata& LoadParquetSourceMetadata(const std::string& path) { #endif } -SourceDataMetadata LoadSourceDataMetadata(const SourceDataSpec& source_spec) { +Result LoadSourceDataMetadata(const SourceDataSpec& source_spec) { if (source_spec.format == "parquet") { return LoadParquetSourceMetadata(source_spec.path); } - throw std::runtime_error("unknown source format: " + source_spec.format); + return Status::Invalid("unknown source format: ", source_spec.format); } std::shared_ptr BuildStructArrayFromRecordBatch( @@ -381,49 +402,55 @@ std::shared_ptr BuildStructArrayFromRecordBatch( batch->num_rows(), batch->columns()); } -std::unique_ptr MakeRecordBatch( +Result> MakeRecordBatch( const std::shared_ptr& arr) { ArrowArray c_array; if (!arrow::ExportArray(*arr, &c_array).ok()) { - throw std::runtime_error("failed to export arrow array"); + return Status::Invalid("failed to export arrow array"); } paimon::RecordBatchBuilder builder(&c_array); - builder.SetBucket(0); - return ValueOrThrow(builder.Finish(), "build paimon record batch"); + return 
AddContext(builder.Finish(), "build paimon record batch"); } -void EnsureTable(const std::string& root_path, const std::string& db_name, - const std::string& table_name, const std::map& options, - const std::shared_ptr& schema, - const std::vector& primary_keys = {}) { - auto catalog = ValueOrThrow(paimon::Catalog::Create(root_path, options), "create catalog"); - CheckStatus(catalog->CreateDatabase(db_name, options, true), "create database"); +Status EnsureTable(const std::string& root_path, const std::string& db_name, + const std::string& table_name, + const std::map& options, + const std::shared_ptr& schema, + const std::vector& primary_keys = {}) { + PAIMON_ASSIGN_OR_RAISE(auto catalog, + AddContext(paimon::Catalog::Create(root_path, options), + "create catalog")); + PAIMON_RETURN_NOT_OK( + AddContext(catalog->CreateDatabase(db_name, options, true), "create database")); ArrowSchema c_schema; if (!arrow::ExportSchema(*schema, &c_schema).ok()) { - throw std::runtime_error("failed to export table schema"); + return Status::Invalid("failed to export table schema"); } - CheckStatus(catalog->CreateTable(paimon::Identifier(db_name, table_name), &c_schema, - /*partition_keys=*/{}, primary_keys, options, - /*ignore_if_exists=*/false), - "create table"); + PAIMON_RETURN_NOT_OK(AddContext( + catalog->CreateTable(paimon::Identifier(db_name, table_name), &c_schema, + /*partition_keys=*/{}, primary_keys, options, + /*ignore_if_exists=*/false), + "create table")); + return Status::OK(); } -void WriteSourceDataToWriter(paimon::FileStoreWrite* writer, const SourceDataSpec& source_spec) { +Status WriteSourceDataToWriter(paimon::FileStoreWrite* writer, + const SourceDataSpec& source_spec) { if (source_spec.format != "parquet") { - throw std::runtime_error("unknown source format: " + source_spec.format); + return Status::Invalid("unknown source format: ", source_spec.format); } #if !PAIMON_BENCHMARK_HAS_PARQUET_READER - throw std::runtime_error( + return Status::Invalid( "Parquet 
source data mode requires parquet::arrow reader support in this build"); #else - auto parquet_reader = OpenParquetSourceReader(source_spec.path); + PAIMON_ASSIGN_OR_RAISE(auto parquet_reader, OpenParquetSourceReader(source_spec.path)); std::unique_ptr batch_reader; const auto reader_status = parquet_reader->GetRecordBatchReader(&batch_reader); if (!reader_status.ok()) { - throw std::runtime_error("create Parquet source batch reader failed: " + - reader_status.ToString()); + return Status::Invalid("create Parquet source batch reader failed: ", + reader_status.ToString()); } int64_t written_rows = 0; @@ -431,7 +458,7 @@ void WriteSourceDataToWriter(paimon::FileStoreWrite* writer, const SourceDataSpe std::shared_ptr record_batch; const auto read_status = batch_reader->ReadNext(&record_batch); if (!read_status.ok()) { - throw std::runtime_error("read Parquet source batch failed: " + read_status.ToString()); + return Status::Invalid("read Parquet source batch failed: ", read_status.ToString()); } if (record_batch == nullptr) { break; @@ -441,35 +468,43 @@ void WriteSourceDataToWriter(paimon::FileStoreWrite* writer, const SourceDataSpe } auto struct_array = BuildStructArrayFromRecordBatch(record_batch); - auto batch = MakeRecordBatch(struct_array); - CheckStatus(writer->Write(std::move(batch)), "write batch"); + PAIMON_ASSIGN_OR_RAISE(auto batch, MakeRecordBatch(struct_array)); + PAIMON_RETURN_NOT_OK(AddContext(writer->Write(std::move(batch)), "write batch")); written_rows += record_batch->num_rows(); } if (written_rows <= 0) { - throw std::runtime_error("source file has no non-empty data batches: " + source_spec.path); + return Status::Invalid("source file has no non-empty data batches: ", source_spec.path); } + return Status::OK(); #endif } -void WriteAndCommit(const std::string& table_path, - const std::map& options, - const SourceDataSpec& source_spec) { +Status WriteAndCommit(const std::string& table_path, + const std::map& options, + const SourceDataSpec& 
source_spec) { paimon::WriteContextBuilder write_builder(table_path, "benchmark-writer"); - auto write_ctx = - ValueOrThrow(write_builder.SetOptions(options).Finish(), "create write context"); - auto writer = ValueOrThrow(paimon::FileStoreWrite::Create(std::move(write_ctx)), - "create file store writer"); - - WriteSourceDataToWriter(writer.get(), source_spec); - auto messages = ValueOrThrow(writer->PrepareCommit(), "prepare commit"); + PAIMON_ASSIGN_OR_RAISE( + auto write_ctx, + AddContext(write_builder.SetOptions(options).Finish(), "create write context")); + PAIMON_ASSIGN_OR_RAISE( + auto writer, + AddContext(paimon::FileStoreWrite::Create(std::move(write_ctx)), + "create file store writer")); + + PAIMON_RETURN_NOT_OK(WriteSourceDataToWriter(writer.get(), source_spec)); + PAIMON_ASSIGN_OR_RAISE(auto messages, + AddContext(writer->PrepareCommit(), "prepare commit")); paimon::CommitContextBuilder commit_builder(table_path, "benchmark-writer"); - auto commit_ctx = - ValueOrThrow(commit_builder.SetOptions(options).Finish(), "create commit context"); - auto committer = - ValueOrThrow(paimon::FileStoreCommit::Create(std::move(commit_ctx)), "create committer"); - CheckStatus(committer->Commit(messages), "commit write"); + PAIMON_ASSIGN_OR_RAISE( + auto commit_ctx, + AddContext(commit_builder.SetOptions(options).Finish(), "create commit context")); + PAIMON_ASSIGN_OR_RAISE( + auto committer, + AddContext(paimon::FileStoreCommit::Create(std::move(commit_ctx)), "create committer")); + PAIMON_RETURN_NOT_OK(AddContext(committer->Commit(messages), "commit write")); + return Status::OK(); } struct SharedReadTableCache { @@ -503,8 +538,8 @@ std::string JoinColumns(const std::vector& columns) { return joined; } -const SharedMorReadTableCache& GetOrCreateSharedMorReadTable(const std::string& file_format, - const SourceDataSpec& source_spec) { +Result GetOrCreateSharedMorReadTable( + const std::string& file_format, const SourceDataSpec& source_spec) { static 
SharedMorReadTableCache cache; static std::mutex cache_mutex; @@ -515,31 +550,32 @@ const SharedMorReadTableCache& GetOrCreateSharedMorReadTable(const std::string& if (cache.workspace != nullptr && cache.key == cache_key) { std::cout << "[benchmark][mor-read] reuse_output_table_path=" << cache.table_path << std::endl; - return cache; + return &cache; } auto options = BuildPkOptions(file_format); - const auto source_metadata = LoadSourceDataMetadata(source_spec); + PAIMON_ASSIGN_OR_RAISE(const auto source_metadata, LoadSourceDataMetadata(source_spec)); - auto workspace = CreateBenchmarkWorkspace(); + PAIMON_ASSIGN_OR_RAISE(auto workspace, CreateBenchmarkWorkspace()); const std::string db_name = "bench_db"; const std::string table_name = "mor_read_shared_" + std::to_string(NextTableId()); - EnsureTable(workspace->Str(), db_name, table_name, options, source_metadata.schema, - /*primary_keys=*/pk_columns); + PAIMON_RETURN_NOT_OK(EnsureTable(workspace->Str(), db_name, table_name, options, + source_metadata.schema, + /*primary_keys=*/pk_columns)); const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); std::cout << "[benchmark][mor-read] create_shared_output_table_path=" << table_path << std::endl; - WriteAndCommit(table_path, options, source_spec); + PAIMON_RETURN_NOT_OK(WriteAndCommit(table_path, options, source_spec)); cache.key = cache_key; cache.workspace = std::move(workspace); cache.table_path = table_path; cache.total_rows = source_metadata.total_rows; - return cache; + return &cache; } -const SharedReadTableCache& GetOrCreateSharedReadTable(const std::string& file_format, - const SourceDataSpec& source_spec) { +Result GetOrCreateSharedReadTable( + const std::string& file_format, const SourceDataSpec& source_spec) { static SharedReadTableCache cache; static std::mutex cache_mutex; @@ -547,33 +583,38 @@ const SharedReadTableCache& GetOrCreateSharedReadTable(const std::string& file_f std::lock_guard lock(cache_mutex); if (cache.workspace 
!= nullptr && cache.key == cache_key) { std::cout << "[benchmark][read] reuse_output_table_path=" << cache.table_path << std::endl; - return cache; + return &cache; } auto options = BuildOptions(file_format); - const auto source_metadata = LoadSourceDataMetadata(source_spec); + PAIMON_ASSIGN_OR_RAISE(const auto source_metadata, LoadSourceDataMetadata(source_spec)); - auto workspace = CreateBenchmarkWorkspace(); + PAIMON_ASSIGN_OR_RAISE(auto workspace, CreateBenchmarkWorkspace()); const std::string db_name = "bench_db"; const std::string table_name = "read_shared_" + std::to_string(NextTableId()); - EnsureTable(workspace->Str(), db_name, table_name, options, source_metadata.schema); + PAIMON_RETURN_NOT_OK( + EnsureTable(workspace->Str(), db_name, table_name, options, source_metadata.schema)); const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); std::cout << "[benchmark][read] create_shared_output_table_path=" << table_path << std::endl; - WriteAndCommit(table_path, options, source_spec); + PAIMON_RETURN_NOT_OK(WriteAndCommit(table_path, options, source_spec)); cache.key = cache_key; cache.workspace = std::move(workspace); cache.table_path = table_path; cache.total_rows = source_metadata.total_rows; - return cache; + return &cache; } -int64_t ReadRows(const std::string& table_path, const std::map& options, - int32_t prefetch_parallel_num) { +Result ReadRows(const std::string& table_path, + const std::map& options, + int32_t prefetch_parallel_num) { paimon::ScanContextBuilder scan_builder(table_path); - auto scan_ctx = ValueOrThrow(scan_builder.SetOptions(options).Finish(), "create scan context"); - auto scanner = ValueOrThrow(paimon::TableScan::Create(std::move(scan_ctx)), "create scanner"); - auto plan = ValueOrThrow(scanner->CreatePlan(), "create plan"); + PAIMON_ASSIGN_OR_RAISE( + auto scan_ctx, AddContext(scan_builder.SetOptions(options).Finish(), "create scan context")); + PAIMON_ASSIGN_OR_RAISE(auto scanner, + 
AddContext(paimon::TableScan::Create(std::move(scan_ctx)), + "create scanner")); + PAIMON_ASSIGN_OR_RAISE(auto plan, AddContext(scanner->CreatePlan(), "create plan")); paimon::ReadContextBuilder read_builder(table_path); constexpr int32_t kPrefetchBatchCount = 600; @@ -583,21 +624,26 @@ int64_t ReadRows(const std::string& table_path, const std::map 1) .SetRowToBatchThreadNumber(GetRowToBatchThreadNumber()); - auto read_ctx = ValueOrThrow(read_builder.Finish(), "create read context"); - auto reader = - ValueOrThrow(paimon::TableRead::Create(std::move(read_ctx)), "create table reader"); - auto batch_reader = ValueOrThrow(reader->CreateReader(plan->Splits()), "create batch reader"); + PAIMON_ASSIGN_OR_RAISE(auto read_ctx, + AddContext(read_builder.Finish(), "create read context")); + PAIMON_ASSIGN_OR_RAISE( + auto reader, + AddContext(paimon::TableRead::Create(std::move(read_ctx)), "create table reader")); + PAIMON_ASSIGN_OR_RAISE(auto batch_reader, + AddContext(reader->CreateReader(plan->Splits()), + "create batch reader")); int64_t total_rows = 0; while (true) { - auto batch = ValueOrThrow(batch_reader->NextBatch(), "read next batch"); + PAIMON_ASSIGN_OR_RAISE(auto batch, + AddContext(batch_reader->NextBatch(), "read next batch")); if (paimon::BatchReader::IsEofBatch(batch)) { break; } auto& [array, schema] = batch; auto imported = arrow::ImportArray(array.get(), schema.get()); if (!imported.ok()) { - throw std::runtime_error("import c data array failed: " + imported.status().ToString()); + return Status::Invalid("import c data array failed: ", imported.status().ToString()); } total_rows += imported.ValueUnsafe()->length(); } @@ -611,34 +657,30 @@ struct PreparedSourceData { }; bool TryGetSourceSpec(::benchmark::State& state, SourceDataSpec* source_spec) { - try { - *source_spec = GetSourceDataSpec(); - return true; - } catch (const std::exception& e) { - SkipWithMessage(state, e.what()); - return false; - } + (void)state; + *source_spec = GetSourceDataSpec(); + 
return true; } bool TryPrepareSourceData(::benchmark::State& state, const SourceDataSpec& source_spec, PreparedSourceData* prepared) { - try { - const auto source_metadata = LoadSourceDataMetadata(source_spec); - prepared->schema = source_metadata.schema; - prepared->total_rows = source_metadata.total_rows; - return true; - } catch (const std::exception& e) { - SkipWithMessage(state, e.what()); + auto source_metadata = LoadSourceDataMetadata(source_spec); + if (!source_metadata.ok()) { + SkipWithMessage(state, source_metadata.status().ToString()); return false; } + prepared->schema = source_metadata.value().schema; + prepared->total_rows = source_metadata.value().total_rows; + return true; } } // namespace -void ParsePaimonBenchmarkCliArgs(int* argc, char** argv) { +Status ParsePaimonBenchmarkCliArgs(int* argc, char** argv) { auto parsed_argc = static_cast(*argc); - ParsePaimonBenchmarkCliArgsImpl(&parsed_argc, argv); + PAIMON_RETURN_NOT_OK(ParsePaimonBenchmarkCliArgsImpl(&parsed_argc, argv)); *argc = static_cast(parsed_argc); + return Status::OK(); } bool HasHelpFlag(int32_t argc, char** argv) { @@ -675,14 +717,27 @@ void RunBMWrite(::benchmark::State& state) { return; } auto workspace = CreateBenchmarkWorkspace(); + if (!workspace.ok()) { + SkipWithMessage(state, workspace.status().ToString()); + return; + } for (auto _ : state) { const std::string db_name = "bench_db"; const std::string table_name = "write_" + std::to_string(NextTableId()); - EnsureTable(workspace->Str(), db_name, table_name, options, prepared.schema); - const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); + const Status ensure_status = + EnsureTable(workspace.value()->Str(), db_name, table_name, options, prepared.schema); + if (!ensure_status.ok()) { + SkipWithMessage(state, ensure_status.ToString()); + return; + } + const std::string table_path = RequirePath(workspace.value()->Str(), db_name, table_name); std::cout << "[benchmark][write] output_table_path=" << 
table_path << std::endl; - WriteAndCommit(table_path, options, source_spec); + const Status write_status = WriteAndCommit(table_path, options, source_spec); + if (!write_status.ok()) { + SkipWithMessage(state, write_status.ToString()); + return; + } } state.SetItemsProcessed(state.iterations() * prepared.total_rows); @@ -708,9 +763,15 @@ void RunBMRead(::benchmark::State& state) { auto options = BuildOptions(file_format); - if (BenchmarkHelpers::TryRunExternalReadMode(state, "read", external_table_path, [&]() { + auto external_read_result = BenchmarkHelpers::TryRunExternalReadMode( + state, "read", external_table_path, [&]() { return ReadRows(external_table_path, options, prefetch_parallel_num); - })) { + }); + if (!external_read_result.ok()) { + SkipWithMessage(state, external_read_result.status().ToString()); + return; + } + if (external_read_result.value()) { return; } @@ -726,19 +787,21 @@ void RunBMRead(::benchmark::State& state) { return; } - const SharedReadTableCache* shared_table = nullptr; - try { - shared_table = &GetOrCreateSharedReadTable(file_format, source_spec); - } catch (const std::exception& e) { - SkipWithMessage(state, e.what()); + auto shared_table = GetOrCreateSharedReadTable(file_format, source_spec); + if (!shared_table.ok()) { + SkipWithMessage(state, shared_table.status().ToString()); return; } - const int64_t rows_read = BenchmarkHelpers::RunReadIterations(state, [&]() { - return ReadRows(shared_table->table_path, options, prefetch_parallel_num); + auto rows_read = BenchmarkHelpers::RunReadIterations(state, [&]() { + return ReadRows(shared_table.value()->table_path, options, prefetch_parallel_num); }); + if (!rows_read.ok()) { + SkipWithMessage(state, rows_read.status().ToString()); + return; + } - state.SetItemsProcessed(state.iterations() * rows_read); + state.SetItemsProcessed(state.iterations() * rows_read.value()); } void RunBMPkWrite(::benchmark::State& state) { @@ -772,15 +835,28 @@ void RunBMPkWrite(::benchmark::State& state) { 
return; } auto workspace = CreateBenchmarkWorkspace(); + if (!workspace.ok()) { + SkipWithMessage(state, workspace.status().ToString()); + return; + } for (auto _ : state) { const std::string db_name = "bench_db"; const std::string table_name = "pk_write_" + std::to_string(NextTableId()); - EnsureTable(workspace->Str(), db_name, table_name, options, prepared.schema, - /*primary_keys=*/pk_columns); - const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); + const Status ensure_status = EnsureTable(workspace.value()->Str(), db_name, table_name, + options, prepared.schema, + /*primary_keys=*/pk_columns); + if (!ensure_status.ok()) { + SkipWithMessage(state, ensure_status.ToString()); + return; + } + const std::string table_path = RequirePath(workspace.value()->Str(), db_name, table_name); std::cout << "[benchmark][pk-write] output_table_path=" << table_path << std::endl; - WriteAndCommit(table_path, options, source_spec); + const Status write_status = WriteAndCommit(table_path, options, source_spec); + if (!write_status.ok()) { + SkipWithMessage(state, write_status.ToString()); + return; + } } state.SetItemsProcessed(state.iterations() * prepared.total_rows); @@ -804,9 +880,15 @@ void RunBMMorRead(::benchmark::State& state) { } const auto external_read_options = BuildOptions(file_format); - if (BenchmarkHelpers::TryRunExternalReadMode(state, "mor-read", external_table_path, [&]() { + auto external_read_result = BenchmarkHelpers::TryRunExternalReadMode( + state, "mor-read", external_table_path, [&]() { return ReadRows(external_table_path, external_read_options, prefetch_parallel_num); - })) { + }); + if (!external_read_result.ok()) { + SkipWithMessage(state, external_read_result.status().ToString()); + return; + } + if (external_read_result.value()) { return; } @@ -827,18 +909,20 @@ void RunBMMorRead(::benchmark::State& state) { } auto options = BuildPkOptions(file_format); - const SharedMorReadTableCache* shared_table = nullptr; - try { - 
shared_table = &GetOrCreateSharedMorReadTable(file_format, source_spec); - } catch (const std::exception& e) { - SkipWithMessage(state, e.what()); + auto shared_table = GetOrCreateSharedMorReadTable(file_format, source_spec); + if (!shared_table.ok()) { + SkipWithMessage(state, shared_table.status().ToString()); return; } - const int64_t rows_read = BenchmarkHelpers::RunReadIterations(state, [&]() { - return ReadRows(shared_table->table_path, options, prefetch_parallel_num); + auto rows_read = BenchmarkHelpers::RunReadIterations(state, [&]() { + return ReadRows(shared_table.value()->table_path, options, prefetch_parallel_num); }); - state.SetItemsProcessed(state.iterations() * rows_read); + if (!rows_read.ok()) { + SkipWithMessage(state, rows_read.status().ToString()); + return; + } + state.SetItemsProcessed(state.iterations() * rows_read.value()); } } // namespace paimon::benchmark diff --git a/benchmark/benchmark_suite.h b/benchmark/benchmark_suite.h index 43c07af7f..8b540e7e9 100644 --- a/benchmark/benchmark_suite.h +++ b/benchmark/benchmark_suite.h @@ -19,10 +19,11 @@ #include #include "benchmark/benchmark.h" +#include "paimon/status.h" namespace paimon::benchmark { -void ParsePaimonBenchmarkCliArgs(int* argc, char** argv); +Status ParsePaimonBenchmarkCliArgs(int* argc, char** argv); bool HasHelpFlag(int32_t argc, char** argv); void PrintPaimonBenchmarkCliHelp(); diff --git a/benchmark/cli_option_parsing.h b/benchmark/cli_option_parsing.h index 448ce5ec5..73069ff17 100644 --- a/benchmark/cli_option_parsing.h +++ b/benchmark/cli_option_parsing.h @@ -17,11 +17,13 @@ #pragma once #include -#include #include #include #include +#include "paimon/result.h" +#include "paimon/status.h" + namespace paimon::benchmark { inline bool ConsumeCliOption(const std::string& arg, const std::string& option_name, @@ -43,10 +45,10 @@ inline std::string TrimAsciiWhitespace(const std::string& value) { return value.substr(first, last - first + 1); } -inline std::vector 
ParseCommaSeparatedColumns(const std::string& input, - const std::string& option_name) { +inline Result> ParseCommaSeparatedColumns( + const std::string& input, const std::string& option_name) { if (input.empty()) { - throw std::runtime_error("missing value for " + option_name); + return Status::Invalid("missing value for ", option_name); } std::vector columns; @@ -59,7 +61,7 @@ inline std::vector ParseCommaSeparatedColumns(const std::string& in const std::string column = TrimAsciiWhitespace(input.substr(segment_start, index - segment_start)); if (column.empty()) { - throw std::runtime_error("invalid " + option_name + ": empty column name"); + return Status::Invalid("invalid ", option_name, ": empty column name"); } columns.push_back(column); segment_start = index + 1; @@ -67,10 +69,10 @@ inline std::vector ParseCommaSeparatedColumns(const std::string& in return columns; } -inline std::vector> ParseDelimitedOptions( +inline Result>> ParseDelimitedOptions( const std::string& input, const std::string& option_name) { if (input.empty()) { - throw std::runtime_error("missing value for " + option_name); + return Status::Invalid("missing value for ", option_name); } std::vector> parsed; @@ -83,12 +85,12 @@ inline std::vector> ParseDelimitedOptions( } if (token.empty()) { - throw std::runtime_error("invalid " + option_name + ": empty option segment"); + return Status::Invalid("invalid ", option_name, ": empty option segment"); } const auto separator = token.find(':'); if (separator == std::string::npos || separator == 0 || separator + 1 >= token.size()) { - throw std::runtime_error("invalid " + option_name + ": expected key:value"); + return Status::Invalid("invalid ", option_name, ": expected key:value"); } parsed.emplace_back(token.substr(0, separator), token.substr(separator + 1)); @@ -97,9 +99,9 @@ inline std::vector> ParseDelimitedOptions( return parsed; } -inline bool ParseStringOptionArg(int32_t argc, char** argv, const std::string& arg, - const std::string& 
option_name, int32_t* arg_index, - std::string* value_out) { +inline Result ParseStringOptionArg(int32_t argc, char** argv, const std::string& arg, + const std::string& option_name, int32_t* arg_index, + std::string* value_out) { std::string parsed_value; if (ConsumeCliOption(arg, option_name, &parsed_value)) { *value_out = std::move(parsed_value); @@ -111,18 +113,20 @@ inline bool ParseStringOptionArg(int32_t argc, char** argv, const std::string& a } if (*arg_index + 1 >= argc) { - throw std::runtime_error("missing value for " + option_name); + return Status::Invalid("missing value for ", option_name); } *value_out = argv[++(*arg_index)]; return true; } -inline bool ParseCommaSeparatedOptionArg(int32_t argc, char** argv, const std::string& arg, - const std::string& option_name, int32_t* arg_index, - std::vector* columns_out) { +inline Result ParseCommaSeparatedOptionArg(int32_t argc, char** argv, const std::string& arg, + const std::string& option_name, + int32_t* arg_index, + std::vector* columns_out) { std::string parsed_value; if (ConsumeCliOption(arg, option_name, &parsed_value)) { - *columns_out = ParseCommaSeparatedColumns(parsed_value, option_name); + PAIMON_ASSIGN_OR_RAISE(*columns_out, + ParseCommaSeparatedColumns(parsed_value, option_name)); return true; } @@ -131,18 +135,21 @@ inline bool ParseCommaSeparatedOptionArg(int32_t argc, char** argv, const std::s } if (*arg_index + 1 >= argc) { - throw std::runtime_error("missing value for " + option_name); + return Status::Invalid("missing value for ", option_name); } - *columns_out = ParseCommaSeparatedColumns(std::string(argv[++(*arg_index)]), option_name); + PAIMON_ASSIGN_OR_RAISE(*columns_out, + ParseCommaSeparatedColumns(std::string(argv[++(*arg_index)]), + option_name)); return true; } -inline bool ParseDelimitedRepeatableOptionArg( +inline Result ParseDelimitedRepeatableOptionArg( int32_t argc, char** argv, const std::string& arg, const std::string& option_name, int32_t* arg_index, std::vector>* 
options_out) { std::string parsed_value; if (ConsumeCliOption(arg, option_name, &parsed_value)) { - const auto parsed_options = ParseDelimitedOptions(parsed_value, option_name); + PAIMON_ASSIGN_OR_RAISE(const auto parsed_options, + ParseDelimitedOptions(parsed_value, option_name)); options_out->insert(options_out->end(), parsed_options.begin(), parsed_options.end()); return true; } @@ -152,11 +159,12 @@ inline bool ParseDelimitedRepeatableOptionArg( } if (*arg_index + 1 >= argc) { - throw std::runtime_error("missing value for " + option_name); + return Status::Invalid("missing value for ", option_name); } const std::string option_arg = argv[++(*arg_index)]; - const auto parsed_options = ParseDelimitedOptions(option_arg, option_name); + PAIMON_ASSIGN_OR_RAISE(const auto parsed_options, + ParseDelimitedOptions(option_arg, option_name)); options_out->insert(options_out->end(), parsed_options.begin(), parsed_options.end()); return true; } diff --git a/benchmark/cli_option_parsing_test.cpp b/benchmark/cli_option_parsing_test.cpp index 7c33cdf39..050b68125 100644 --- a/benchmark/cli_option_parsing_test.cpp +++ b/benchmark/cli_option_parsing_test.cpp @@ -22,6 +22,7 @@ #include #include "gtest/gtest.h" +#include "paimon/testing/utils/testharness.h" namespace paimon::testing { namespace { @@ -52,7 +53,9 @@ TEST(CliOptionParsingTest, ConsumeCliOptionWorks) { } TEST(CliOptionParsingTest, ParseCommaSeparatedColumnsWorks) { - const auto parsed = paimon::benchmark::ParseCommaSeparatedColumns("id, name,age", "--cols"); + ASSERT_OK_AND_ASSIGN(auto parsed, + paimon::benchmark::ParseCommaSeparatedColumns("id, name,age", + "--cols")); ASSERT_EQ(parsed.size(), 3U); ASSERT_EQ(parsed[0], "id"); ASSERT_EQ(parsed[1], "name"); @@ -60,26 +63,23 @@ TEST(CliOptionParsingTest, ParseCommaSeparatedColumnsWorks) { } TEST(CliOptionParsingTest, ParseCommaSeparatedColumnsRejectsInvalidInput) { - ASSERT_THROW((void)paimon::benchmark::ParseCommaSeparatedColumns("", "--cols"), - std::runtime_error); - 
ASSERT_THROW((void)paimon::benchmark::ParseCommaSeparatedColumns("id,", "--cols"), - std::runtime_error); - ASSERT_THROW((void)paimon::benchmark::ParseCommaSeparatedColumns("id,,name", "--cols"), - std::runtime_error); + ASSERT_NOK(paimon::benchmark::ParseCommaSeparatedColumns("", "--cols")); + ASSERT_NOK(paimon::benchmark::ParseCommaSeparatedColumns("id,", "--cols")); + ASSERT_NOK(paimon::benchmark::ParseCommaSeparatedColumns("id,,name", "--cols")); } TEST(CliOptionParsingTest, ParseDelimitedOptionsWorks) { - const auto parsed = paimon::benchmark::ParseDelimitedOptions("k1:v1;k2:v2", "--paimon_option"); + ASSERT_OK_AND_ASSIGN( + auto parsed, + paimon::benchmark::ParseDelimitedOptions("k1:v1;k2:v2", "--paimon_option")); ASSERT_EQ(parsed.size(), 2U); ASSERT_EQ(parsed[0], std::make_pair(std::string("k1"), std::string("v1"))); ASSERT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); } TEST(CliOptionParsingTest, ParseDelimitedOptionsRejectsInvalidInput) { - ASSERT_THROW((void)paimon::benchmark::ParseDelimitedOptions("", "--paimon_option"), - std::runtime_error); - ASSERT_THROW((void)paimon::benchmark::ParseDelimitedOptions("k1:v1;", "--paimon_option"), - std::runtime_error); + ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions("", "--paimon_option")); + ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions("k1:v1;", "--paimon_option")); } TEST(CliOptionParsingTest, ParseStringOptionArgWorksForEqualsAndSeparatedForms) { @@ -87,9 +87,12 @@ TEST(CliOptionParsingTest, ParseStringOptionArgWorksForEqualsAndSeparatedForms) ArgvHolder argv_holder({"prog", "--foo=bar"}); int32_t arg_index = 1; std::string value; - ASSERT_TRUE(paimon::benchmark::ParseStringOptionArg( - argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--foo", - &arg_index, &value)); + ASSERT_OK_AND_ASSIGN( + bool is_parsed, + paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], + "--foo", &arg_index, 
&value)); + ASSERT_TRUE(is_parsed); ASSERT_EQ(arg_index, 1); ASSERT_EQ(value, "bar"); } @@ -98,9 +101,12 @@ TEST(CliOptionParsingTest, ParseStringOptionArgWorksForEqualsAndSeparatedForms) ArgvHolder argv_holder({"prog", "--foo", "bar"}); int32_t arg_index = 1; std::string value; - ASSERT_TRUE(paimon::benchmark::ParseStringOptionArg( - argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--foo", - &arg_index, &value)); + ASSERT_OK_AND_ASSIGN( + bool is_parsed, + paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], + "--foo", &arg_index, &value)); + ASSERT_TRUE(is_parsed); ASSERT_EQ(arg_index, 2); ASSERT_EQ(value, "bar"); } @@ -110,10 +116,9 @@ TEST(CliOptionParsingTest, ParseStringOptionArgRejectsMissingValue) { ArgvHolder argv_holder({"prog", "--foo"}); int32_t arg_index = 1; std::string value; - ASSERT_THROW((void)paimon::benchmark::ParseStringOptionArg( - argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], - "--foo", &arg_index, &value), - std::runtime_error); + ASSERT_NOK(paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--foo", + &arg_index, &value)); } TEST(CliOptionParsingTest, ParseCommaSeparatedOptionArgAndDelimitedRepeatableOptionArgWorks) { @@ -121,9 +126,12 @@ TEST(CliOptionParsingTest, ParseCommaSeparatedOptionArgAndDelimitedRepeatableOpt ArgvHolder argv_holder({"prog", "--cols", "id,name"}); int32_t arg_index = 1; std::vector columns; - ASSERT_TRUE(paimon::benchmark::ParseCommaSeparatedOptionArg( - argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--cols", - &arg_index, &columns)); + ASSERT_OK_AND_ASSIGN( + bool is_parsed, + paimon::benchmark::ParseCommaSeparatedOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], + "--cols", &arg_index, &columns)); + ASSERT_TRUE(is_parsed); ASSERT_EQ(arg_index, 2); 
ASSERT_EQ(columns.size(), 2U); ASSERT_EQ(columns[0], "id"); @@ -134,9 +142,12 @@ TEST(CliOptionParsingTest, ParseCommaSeparatedOptionArgAndDelimitedRepeatableOpt ArgvHolder argv_holder({"prog", "--paimon_option", "k1:v1;k2:v2"}); int32_t arg_index = 1; std::vector> options; - ASSERT_TRUE(paimon::benchmark::ParseDelimitedRepeatableOptionArg( - argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], - "--paimon_option", &arg_index, &options)); + ASSERT_OK_AND_ASSIGN( + bool is_parsed, + paimon::benchmark::ParseDelimitedRepeatableOptionArg( + argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], + "--paimon_option", &arg_index, &options)); + ASSERT_TRUE(is_parsed); ASSERT_EQ(arg_index, 2); ASSERT_EQ(options.size(), 2U); ASSERT_EQ(options[0], std::make_pair(std::string("k1"), std::string("v1"))); diff --git a/benchmark/read_write_benchmark.cpp b/benchmark/read_write_benchmark.cpp index 0710cdc9b..7989682bd 100644 --- a/benchmark/read_write_benchmark.cpp +++ b/benchmark/read_write_benchmark.cpp @@ -15,17 +15,15 @@ */ #include -#include #include #include "benchmark/benchmark.h" #include "benchmark/benchmark_suite.h" int main(int argc, char** argv) { - try { - paimon::benchmark::ParsePaimonBenchmarkCliArgs(&argc, argv); - } catch (const std::exception& e) { - std::cerr << "paimon-read-write-benchmark: " << e.what() << std::endl; + const paimon::Status parse_status = paimon::benchmark::ParsePaimonBenchmarkCliArgs(&argc, argv); + if (!parse_status.ok()) { + std::cerr << "paimon-read-write-benchmark: " << parse_status.ToString() << std::endl; std::cerr << "Try 'paimon-read-write-benchmark --help' for more information." 
<< std::endl; return 1; } From 462e557fb2ec7fec75cef99e2a41bc964529b236 Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Thu, 4 Jun 2026 13:21:03 +0800 Subject: [PATCH 10/13] fix --- benchmark/benchmark_helpers.cpp | 12 +-- benchmark/benchmark_helpers.h | 8 +- benchmark/benchmark_suite.cpp | 125 ++++++++------------------ benchmark/cli_option_parsing.h | 15 +++- benchmark/cli_option_parsing_test.cpp | 25 ++++++ benchmark/read_write_benchmark.cpp | 10 +-- docs/source/examples/benchmark.rst | 19 ++-- 7 files changed, 94 insertions(+), 120 deletions(-) diff --git a/benchmark/benchmark_helpers.cpp b/benchmark/benchmark_helpers.cpp index 7d6b3ae60..6715a3194 100644 --- a/benchmark/benchmark_helpers.cpp +++ b/benchmark/benchmark_helpers.cpp @@ -71,15 +71,15 @@ Result BenchmarkHelpers::RunReadIterations(::benchmark::State& state, return rows_read; } -Result BenchmarkHelpers::TryRunExternalReadMode(::benchmark::State& state, - const std::string& benchmark_name, - const std::string& external_table_path, - const ReadOnceFn& read_once) { - if (external_table_path.empty()) { +Result BenchmarkHelpers::TryRunSourceTableReadMode(::benchmark::State& state, + const std::string& benchmark_name, + const std::string& source_table_path, + const ReadOnceFn& read_once) { + if (source_table_path.empty()) { return false; } - std::cout << "[benchmark][" << benchmark_name << "] external_table_path=" << external_table_path + std::cout << "[benchmark][" << benchmark_name << "] source_table_path=" << source_table_path << std::endl; PAIMON_ASSIGN_OR_RAISE(const int64_t rows_read, RunReadIterations(state, read_once)); state.SetItemsProcessed(state.iterations() * rows_read); diff --git a/benchmark/benchmark_helpers.h b/benchmark/benchmark_helpers.h index ed59e602f..0204349e2 100644 --- a/benchmark/benchmark_helpers.h +++ b/benchmark/benchmark_helpers.h @@ -49,10 +49,10 @@ class BenchmarkHelpers { static Result RunReadIterations(::benchmark::State& state, const ReadOnceFn& read_once); - static 
Result TryRunExternalReadMode(::benchmark::State& state, - const std::string& benchmark_name, - const std::string& external_table_path, - const ReadOnceFn& read_once); + static Result TryRunSourceTableReadMode(::benchmark::State& state, + const std::string& benchmark_name, + const std::string& source_table_path, + const ReadOnceFn& read_once); }; } // namespace paimon::benchmark diff --git a/benchmark/benchmark_suite.cpp b/benchmark/benchmark_suite.cpp index e452fa741..8e45a0197 100644 --- a/benchmark/benchmark_suite.cpp +++ b/benchmark/benchmark_suite.cpp @@ -18,9 +18,7 @@ #include #include -#include #include -#include #include #include #include @@ -50,12 +48,12 @@ namespace paimon::benchmark { namespace { +constexpr int64_t kSourceBatchMaxRows = 4096; +constexpr int32_t kRowToBatchThreadNumber = 3; + struct BenchmarkCliOptions { std::string source_data_file; - std::string external_table_path; - std::string file_format = "parquet"; - int64_t source_batch_max_rows = 4096; - int32_t row_to_batch_thread_number = 3; + std::string source_table_path; std::vector pk_columns; std::vector> extra_options; }; @@ -74,30 +72,13 @@ const BenchmarkCliOptions& GetBenchmarkCliOptions() { return MutableBenchmarkCliOptions(); } -Result ParsePositiveInt64(const std::string& value, const std::string& option_name) { - char* end = nullptr; - const auto parsed = std::strtoll(value.c_str(), &end, 10); - if (end == value.c_str() || *end != '\0' || parsed <= 0) { - return Status::Invalid("invalid ", option_name, ", expected positive integer"); - } - return static_cast(parsed); -} - -Result ParsePositiveInt32(const std::string& value, const std::string& option_name) { - PAIMON_ASSIGN_OR_RAISE(const int64_t parsed, ParsePositiveInt64(value, option_name)); - if (parsed > std::numeric_limits::max()) { - return Status::Invalid("invalid ", option_name, ", value is too large"); - } - return static_cast(parsed); -} - Status ParsePaimonBenchmarkCliArgsImpl(int32_t* argc, char** argv) { auto& options 
= MutableBenchmarkCliOptions(); + options = BenchmarkCliOptions{}; const int32_t parsed_argc = *argc; int32_t write_index = 1; for (int32_t arg_index = 1; arg_index < parsed_argc; ++arg_index) { const std::string arg(argv[arg_index]); - std::string parsed_value; PAIMON_ASSIGN_OR_RAISE(bool is_parsed, paimon::benchmark::ParseStringOptionArg( @@ -108,36 +89,9 @@ Status ParsePaimonBenchmarkCliArgsImpl(int32_t* argc, char** argv) { } PAIMON_ASSIGN_OR_RAISE(is_parsed, paimon::benchmark::ParseStringOptionArg( - parsed_argc, argv, arg, "--paimon_external_table_path", - &arg_index, &options.external_table_path)); - if (is_parsed) { - continue; - } - PAIMON_ASSIGN_OR_RAISE(is_parsed, - paimon::benchmark::ParseStringOptionArg( - parsed_argc, argv, arg, "--paimon_file_format", &arg_index, - &options.file_format)); - if (is_parsed) { - continue; - } - PAIMON_ASSIGN_OR_RAISE(is_parsed, - paimon::benchmark::ParseStringOptionArg( - parsed_argc, argv, arg, "--paimon_source_batch_max_rows", - &arg_index, &parsed_value)); + parsed_argc, argv, arg, "--paimon_source_table_path", + &arg_index, &options.source_table_path)); if (is_parsed) { - PAIMON_ASSIGN_OR_RAISE( - options.source_batch_max_rows, - ParsePositiveInt64(parsed_value, "--paimon_source_batch_max_rows")); - continue; - } - PAIMON_ASSIGN_OR_RAISE(is_parsed, - paimon::benchmark::ParseStringOptionArg( - parsed_argc, argv, arg, "--paimon_row_to_batch_thread_number", - &arg_index, &parsed_value)); - if (is_parsed) { - PAIMON_ASSIGN_OR_RAISE( - options.row_to_batch_thread_number, - ParsePositiveInt32(parsed_value, "--paimon_row_to_batch_thread_number")); continue; } PAIMON_ASSIGN_OR_RAISE(is_parsed, @@ -179,29 +133,23 @@ void PrintPaimonBenchmarkCliHelpImpl() { << " Required. 
External source data file used to build benchmark data.\n" << " Currently supports Parquet source files.\n" << " Also supports: --paimon_source_data_file \n" - << " --paimon_external_table_path=\n" + << " --paimon_source_table_path=\n" << " Optional for BM_Read and BM_MOR_Read. If set, read directly from existing\n" << " table path and skip source file loading and pre-write stage.\n" - << " Also supports: --paimon_external_table_path \n" - << " --paimon_file_format=\n" - << " Optional. Target table file format. Default: parquet.\n" - << " Also supports: --paimon_file_format \n" - << " --paimon_source_batch_max_rows=\n" - << " Optional. Max rows per source batch. Default: 4096.\n" - << " --paimon_row_to_batch_thread_number=\n" - << " Optional. Row-to-batch thread number for reads. Default: 3.\n" + << " Also supports: --paimon_source_table_path \n" << " --paimon_pk_columns=\n" << " Required by BM_PK_Write and BM_MOR_Read.\n" << " Also supports: --paimon_pk_columns \n" << " --paimon_option=:;:\n" << " Optional and repeatable. Pass through table options as-is.\n" + << " Default table file format is parquet; use file.format: to override.\n" << " Also supports: --paimon_option :;:\n" << " Note: use quotes in shell, e.g. 
\"--paimon_option k1:v1;k2:v2\".\n" << "\n" << "Example:\n" << " paimon-read-write-benchmark --paimon_source_data_file /path/data.parquet \\\n" - << " --paimon_file_format parquet --paimon_pk_columns=id \\\n" - << " --paimon_option \"read.batch-size:8192\" --benchmark_filter=BM_Read\n" + << " --paimon_pk_columns=id --paimon_option \"read.batch-size:8192\" \\\n" + << " --benchmark_filter=BM_Read\n" << std::endl; } @@ -226,14 +174,15 @@ std::string RequirePath(const std::string& root_path, const std::string& db_name template Result AddContext(paimon::Result&& result, const std::string& context) { if (!result.ok()) { - return Status::Invalid(context, ": ", result.status().ToString()); + const Status status = result.status(); + return status.WithMessage(context, ": ", status.message()); } return std::move(result).value(); } Status AddContext(const paimon::Status& status, const std::string& context) { if (!status.ok()) { - return Status::Invalid(context, ": ", status.ToString()); + return status.WithMessage(context, ": ", status.message()); } return Status::OK(); } @@ -243,7 +192,7 @@ void SkipWithMessage(::benchmark::State& state, const std::string& message) { } std::string GetConfiguredFileFormat() { - std::string file_format = GetBenchmarkCliOptions().file_format; + std::string file_format = "parquet"; for (const auto& kv : GetBenchmarkCliOptions().extra_options) { if (kv.first == paimon::Options::FILE_FORMAT) { file_format = kv.second; @@ -291,8 +240,8 @@ std::string GetSourceDataFilePath() { return GetBenchmarkCliOptions().source_data_file; } -std::string GetExternalTablePath() { - return GetBenchmarkCliOptions().external_table_path; +std::string GetSourceTablePath() { + return GetBenchmarkCliOptions().source_table_path; } const std::vector& GetPkColumns() { @@ -308,11 +257,11 @@ SourceDataSpec GetSourceDataSpec() { } int64_t GetSourceBatchMaxRows() { - return GetBenchmarkCliOptions().source_batch_max_rows; + return kSourceBatchMaxRows; } int32_t 
GetRowToBatchThreadNumber() { - return GetBenchmarkCliOptions().row_to_batch_thread_number; + return kRowToBatchThreadNumber; } bool SupportsParquetSourceDataMode() { @@ -746,7 +695,7 @@ void RunBMWrite(::benchmark::State& state) { void RunBMRead(::benchmark::State& state) { const auto prefetch_parallel_num = static_cast(state.range(0)); const std::string file_format = GetConfiguredFileFormat(); - const std::string external_table_path = GetExternalTablePath(); + const std::string source_table_path = GetSourceTablePath(); SourceDataSpec source_spec; if (!TryGetSourceSpec(state, &source_spec)) { return; @@ -763,21 +712,21 @@ void RunBMRead(::benchmark::State& state) { auto options = BuildOptions(file_format); - auto external_read_result = BenchmarkHelpers::TryRunExternalReadMode( - state, "read", external_table_path, [&]() { - return ReadRows(external_table_path, options, prefetch_parallel_num); + auto source_table_read_result = BenchmarkHelpers::TryRunSourceTableReadMode( + state, "read", source_table_path, [&]() { + return ReadRows(source_table_path, options, prefetch_parallel_num); }); - if (!external_read_result.ok()) { - SkipWithMessage(state, external_read_result.status().ToString()); + if (!source_table_read_result.ok()) { + SkipWithMessage(state, source_table_read_result.status().ToString()); return; } - if (external_read_result.value()) { + if (source_table_read_result.value()) { return; } if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( state, source_spec.path, - "--paimon_source_data_file is required when --paimon_external_table_path is not set", + "--paimon_source_data_file is required when --paimon_source_table_path is not set", &SkipWithMessage)) { return; } @@ -865,7 +814,7 @@ void RunBMPkWrite(::benchmark::State& state) { void RunBMMorRead(::benchmark::State& state) { const auto prefetch_parallel_num = static_cast(state.range(0)); const std::string file_format = GetConfiguredFileFormat(); - const std::string external_table_path = 
GetExternalTablePath(); + const std::string source_table_path = GetSourceTablePath(); SourceDataSpec source_spec; if (!TryGetSourceSpec(state, &source_spec)) { return; @@ -879,22 +828,22 @@ void RunBMMorRead(::benchmark::State& state) { return; } - const auto external_read_options = BuildOptions(file_format); - auto external_read_result = BenchmarkHelpers::TryRunExternalReadMode( - state, "mor-read", external_table_path, [&]() { - return ReadRows(external_table_path, external_read_options, prefetch_parallel_num); + const auto source_table_read_options = BuildOptions(file_format); + auto source_table_read_result = BenchmarkHelpers::TryRunSourceTableReadMode( + state, "mor-read", source_table_path, [&]() { + return ReadRows(source_table_path, source_table_read_options, prefetch_parallel_num); }); - if (!external_read_result.ok()) { - SkipWithMessage(state, external_read_result.status().ToString()); + if (!source_table_read_result.ok()) { + SkipWithMessage(state, source_table_read_result.status().ToString()); return; } - if (external_read_result.value()) { + if (source_table_read_result.value()) { return; } if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( state, source_spec.path, - "--paimon_source_data_file is required when --paimon_external_table_path is not set", + "--paimon_source_data_file is required when --paimon_source_table_path is not set", &SkipWithMessage)) { return; } diff --git a/benchmark/cli_option_parsing.h b/benchmark/cli_option_parsing.h index 73069ff17..c7650d3dc 100644 --- a/benchmark/cli_option_parsing.h +++ b/benchmark/cli_option_parsing.h @@ -84,16 +84,23 @@ inline Result>> ParseDelimitedOp continue; } - if (token.empty()) { + const std::string segment = TrimAsciiWhitespace(token); + if (segment.empty()) { return Status::Invalid("invalid ", option_name, ": empty option segment"); } - const auto separator = token.find(':'); - if (separator == std::string::npos || separator == 0 || separator + 1 >= token.size()) { + const auto separator = 
segment.find(':'); + if (separator == std::string::npos) { return Status::Invalid("invalid ", option_name, ": expected key:value"); } - parsed.emplace_back(token.substr(0, separator), token.substr(separator + 1)); + const std::string key = TrimAsciiWhitespace(segment.substr(0, separator)); + const std::string value = TrimAsciiWhitespace(segment.substr(separator + 1)); + if (key.empty() || value.empty()) { + return Status::Invalid("invalid ", option_name, ": expected key:value"); + } + + parsed.emplace_back(key, value); token.clear(); } return parsed; diff --git a/benchmark/cli_option_parsing_test.cpp b/benchmark/cli_option_parsing_test.cpp index 050b68125..52bbcf5da 100644 --- a/benchmark/cli_option_parsing_test.cpp +++ b/benchmark/cli_option_parsing_test.cpp @@ -77,9 +77,21 @@ TEST(CliOptionParsingTest, ParseDelimitedOptionsWorks) { ASSERT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); } +TEST(CliOptionParsingTest, ParseDelimitedOptionsTrimsKeyAndValue) { + ASSERT_OK_AND_ASSIGN( + auto parsed, + paimon::benchmark::ParseDelimitedOptions(" k1 : v1 ; k2: v2 ", "--paimon_option")); + ASSERT_EQ(parsed.size(), 2U); + ASSERT_EQ(parsed[0], std::make_pair(std::string("k1"), std::string("v1"))); + ASSERT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); +} + TEST(CliOptionParsingTest, ParseDelimitedOptionsRejectsInvalidInput) { ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions("", "--paimon_option")); ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions("k1:v1;", "--paimon_option")); + ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions("k1:", "--paimon_option")); + ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions(":v1", "--paimon_option")); + ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions("k1: ", "--paimon_option")); } TEST(CliOptionParsingTest, ParseStringOptionArgWorksForEqualsAndSeparatedForms) { @@ -121,6 +133,19 @@ TEST(CliOptionParsingTest, ParseStringOptionArgRejectsMissingValue) { &arg_index, &value)); } 
+TEST(CliOptionParsingTest, ParseStringOptionArgIgnoresOtherOptions) { + ArgvHolder argv_holder({"prog", "--bar=baz"}); + int32_t arg_index = 1; + std::string value; + ASSERT_OK_AND_ASSIGN(bool is_parsed, + paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--foo", &arg_index, &value)); + ASSERT_FALSE(is_parsed); + ASSERT_EQ(arg_index, 1); + ASSERT_TRUE(value.empty()); +} + TEST(CliOptionParsingTest, ParseCommaSeparatedOptionArgAndDelimitedRepeatableOptionArgWorks) { { ArgvHolder argv_holder({"prog", "--cols", "id,name"}); diff --git a/benchmark/read_write_benchmark.cpp b/benchmark/read_write_benchmark.cpp index 7989682bd..398e7f794 100644 --- a/benchmark/read_write_benchmark.cpp +++ b/benchmark/read_write_benchmark.cpp @@ -21,6 +21,11 @@ #include "benchmark/benchmark_suite.h" int main(int argc, char** argv) { + if (paimon::benchmark::HasHelpFlag(static_cast(argc), argv)) { + paimon::benchmark::PrintPaimonBenchmarkCliHelp(); + return 0; + } + const paimon::Status parse_status = paimon::benchmark::ParsePaimonBenchmarkCliArgs(&argc, argv); if (!parse_status.ok()) { std::cerr << "paimon-read-write-benchmark: " << parse_status.ToString() << std::endl; @@ -28,11 +33,6 @@ int main(int argc, char** argv) { return 1; } - if (paimon::benchmark::HasHelpFlag(static_cast(argc), argv)) { - paimon::benchmark::PrintPaimonBenchmarkCliHelp(); - return 0; - } - benchmark::Initialize(&argc, argv); if (benchmark::ReportUnrecognizedArguments(argc, argv)) { return 1; diff --git a/docs/source/examples/benchmark.rst b/docs/source/examples/benchmark.rst index 7ae4e272f..2bd06cbe9 100644 --- a/docs/source/examples/benchmark.rst +++ b/docs/source/examples/benchmark.rst @@ -41,26 +41,19 @@ specific options below: Source data file used to build benchmark data. Currently Parquet source files are supported. 
-``--paimon_external_table_path=`` +``--paimon_source_table_path=`` Read directly from an existing table path for ``BM_Read`` and ``BM_MOR_Read``. When set, the source loading and pre-write stage are skipped. -``--paimon_file_format=`` - Target table file format. The default value is ``parquet``. - -``--paimon_source_batch_max_rows=`` - Max rows per source batch. The default value is ``4096``. - -``--paimon_row_to_batch_thread_number=`` - Row-to-batch thread number for reads. The default value is ``3``. - ``--paimon_pk_columns=`` Primary key columns for ``BM_PK_Write`` and ``BM_MOR_Read``. These cases explicitly use ``bucket=1`` because benchmark batches are written to bucket 0. ``--paimon_option=:;:`` - Repeatable table options passed through to Paimon. For ``BM_PK_Write`` and - ``BM_MOR_Read``, ``bucket`` is forced to ``1``. + Repeatable table options passed through to Paimon. The default table file + format is ``parquet``; use ``--paimon_option file.format:`` to + override it. For ``BM_PK_Write`` and ``BM_MOR_Read``, ``bucket`` is forced to + ``1``. 
Examples ======== @@ -87,6 +80,6 @@ Primary-key table write:: MOR read from an existing table:: paimon-read-write-benchmark \ - --paimon_external_table_path /path/table \ + --paimon_source_table_path /path/table \ --paimon_pk_columns=id \ --benchmark_filter=BM_MOR_Read/4 From 4813bd7f456498be4c0d36e7e795db4ba2cbb87b Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Thu, 4 Jun 2026 13:23:20 +0800 Subject: [PATCH 11/13] fix --- benchmark/CMakeLists.txt | 9 +- benchmark/benchmark_helpers.cpp | 6 +- benchmark/benchmark_helpers.h | 9 +- benchmark/benchmark_suite.cpp | 150 ++++++++++++-------------- benchmark/cli_option_parsing.h | 15 ++- benchmark/cli_option_parsing_test.cpp | 53 ++++----- 6 files changed, 110 insertions(+), 132 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 573fbaf94..805cdb53b 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -19,17 +19,18 @@ endif() find_package(Threads REQUIRED) set(PAIMON_BENCHMARK_STATIC_LINK_LIBS - paimon_shared - ${PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS} + paimon_shared ${PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS} ${PAIMON_PARQUET_FILE_FORMAT_STATIC_LINK_LIBS} ${PAIMON_BLOB_FILE_FORMAT_STATIC_LINK_LIBS}) if(PAIMON_ENABLE_ORC) - list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS ${PAIMON_ORC_FILE_FORMAT_STATIC_LINK_LIBS}) + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS + ${PAIMON_ORC_FILE_FORMAT_STATIC_LINK_LIBS}) endif() if(PAIMON_ENABLE_AVRO) - list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS ${PAIMON_AVRO_FILE_FORMAT_STATIC_LINK_LIBS}) + list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS + ${PAIMON_AVRO_FILE_FORMAT_STATIC_LINK_LIBS}) endif() set(PAIMON_BENCHMARK_PLATFORM_LINK_LIBS) diff --git a/benchmark/benchmark_helpers.cpp b/benchmark/benchmark_helpers.cpp index 6715a3194..6c9503236 100644 --- a/benchmark/benchmark_helpers.cpp +++ b/benchmark/benchmark_helpers.cpp @@ -72,9 +72,9 @@ Result BenchmarkHelpers::RunReadIterations(::benchmark::State& state, } Result 
BenchmarkHelpers::TryRunSourceTableReadMode(::benchmark::State& state, - const std::string& benchmark_name, - const std::string& source_table_path, - const ReadOnceFn& read_once) { + const std::string& benchmark_name, + const std::string& source_table_path, + const ReadOnceFn& read_once) { if (source_table_path.empty()) { return false; } diff --git a/benchmark/benchmark_helpers.h b/benchmark/benchmark_helpers.h index 0204349e2..0c5f58f69 100644 --- a/benchmark/benchmark_helpers.h +++ b/benchmark/benchmark_helpers.h @@ -47,12 +47,13 @@ class BenchmarkHelpers { static bool ValidatePrefetchParallelOrSkip(::benchmark::State& state, int32_t prefetch_parallel_num, SkipFn skip); - static Result RunReadIterations(::benchmark::State& state, const ReadOnceFn& read_once); + static Result RunReadIterations(::benchmark::State& state, + const ReadOnceFn& read_once); static Result TryRunSourceTableReadMode(::benchmark::State& state, - const std::string& benchmark_name, - const std::string& source_table_path, - const ReadOnceFn& read_once); + const std::string& benchmark_name, + const std::string& source_table_path, + const ReadOnceFn& read_once); }; } // namespace paimon::benchmark diff --git a/benchmark/benchmark_suite.cpp b/benchmark/benchmark_suite.cpp index 8e45a0197..9968fc132 100644 --- a/benchmark/benchmark_suite.cpp +++ b/benchmark/benchmark_suite.cpp @@ -82,29 +82,26 @@ Status ParsePaimonBenchmarkCliArgsImpl(int32_t* argc, char** argv) { PAIMON_ASSIGN_OR_RAISE(bool is_parsed, paimon::benchmark::ParseStringOptionArg( - parsed_argc, argv, arg, "--paimon_source_data_file", - &arg_index, &options.source_data_file)); + parsed_argc, argv, arg, "--paimon_source_data_file", &arg_index, + &options.source_data_file)); if (is_parsed) { continue; } - PAIMON_ASSIGN_OR_RAISE(is_parsed, - paimon::benchmark::ParseStringOptionArg( - parsed_argc, argv, arg, "--paimon_source_table_path", - &arg_index, &options.source_table_path)); + PAIMON_ASSIGN_OR_RAISE(is_parsed, 
paimon::benchmark::ParseStringOptionArg( + parsed_argc, argv, arg, "--paimon_source_table_path", + &arg_index, &options.source_table_path)); if (is_parsed) { continue; } - PAIMON_ASSIGN_OR_RAISE(is_parsed, - paimon::benchmark::ParseCommaSeparatedOptionArg( - parsed_argc, argv, arg, "--paimon_pk_columns", &arg_index, - &options.pk_columns)); + PAIMON_ASSIGN_OR_RAISE(is_parsed, paimon::benchmark::ParseCommaSeparatedOptionArg( + parsed_argc, argv, arg, "--paimon_pk_columns", + &arg_index, &options.pk_columns)); if (is_parsed) { continue; } - PAIMON_ASSIGN_OR_RAISE(is_parsed, - paimon::benchmark::ParseDelimitedRepeatableOptionArg( - parsed_argc, argv, arg, "--paimon_option", &arg_index, - &options.extra_options)); + PAIMON_ASSIGN_OR_RAISE(is_parsed, paimon::benchmark::ParseDelimitedRepeatableOptionArg( + parsed_argc, argv, arg, "--paimon_option", &arg_index, + &options.extra_options)); if (is_parsed) { continue; } @@ -128,29 +125,30 @@ bool HasHelpFlagImpl(int32_t argc, char** argv) { } void PrintPaimonBenchmarkCliHelpImpl() { - std::cout << "Paimon benchmark custom options:\n" - << " --paimon_source_data_file=\n" - << " Required. External source data file used to build benchmark data.\n" - << " Currently supports Parquet source files.\n" - << " Also supports: --paimon_source_data_file \n" - << " --paimon_source_table_path=\n" - << " Optional for BM_Read and BM_MOR_Read. If set, read directly from existing\n" - << " table path and skip source file loading and pre-write stage.\n" - << " Also supports: --paimon_source_table_path \n" - << " --paimon_pk_columns=\n" - << " Required by BM_PK_Write and BM_MOR_Read.\n" - << " Also supports: --paimon_pk_columns \n" - << " --paimon_option=:;:\n" - << " Optional and repeatable. Pass through table options as-is.\n" - << " Default table file format is parquet; use file.format: to override.\n" - << " Also supports: --paimon_option :;:\n" - << " Note: use quotes in shell, e.g. 
\"--paimon_option k1:v1;k2:v2\".\n" - << "\n" - << "Example:\n" - << " paimon-read-write-benchmark --paimon_source_data_file /path/data.parquet \\\n" - << " --paimon_pk_columns=id --paimon_option \"read.batch-size:8192\" \\\n" - << " --benchmark_filter=BM_Read\n" - << std::endl; + std::cout + << "Paimon benchmark custom options:\n" + << " --paimon_source_data_file=\n" + << " Required. External source data file used to build benchmark data.\n" + << " Currently supports Parquet source files.\n" + << " Also supports: --paimon_source_data_file \n" + << " --paimon_source_table_path=\n" + << " Optional for BM_Read and BM_MOR_Read. If set, read directly from existing\n" + << " table path and skip source file loading and pre-write stage.\n" + << " Also supports: --paimon_source_table_path \n" + << " --paimon_pk_columns=\n" + << " Required by BM_PK_Write and BM_MOR_Read.\n" + << " Also supports: --paimon_pk_columns \n" + << " --paimon_option=:;:\n" + << " Optional and repeatable. Pass through table options as-is.\n" + << " Default table file format is parquet; use file.format: to override.\n" + << " Also supports: --paimon_option :;:\n" + << " Note: use quotes in shell, e.g. 
\"--paimon_option k1:v1;k2:v2\".\n" + << "\n" + << "Example:\n" + << " paimon-read-write-benchmark --paimon_source_data_file /path/data.parquet \\\n" + << " --paimon_pk_columns=id --paimon_option \"read.batch-size:8192\" \\\n" + << " --benchmark_filter=BM_Read\n" + << std::endl; } Result> CreateBenchmarkWorkspace() { @@ -362,13 +360,11 @@ Result> MakeRecordBatch( } Status EnsureTable(const std::string& root_path, const std::string& db_name, - const std::string& table_name, - const std::map& options, + const std::string& table_name, const std::map& options, const std::shared_ptr& schema, const std::vector& primary_keys = {}) { - PAIMON_ASSIGN_OR_RAISE(auto catalog, - AddContext(paimon::Catalog::Create(root_path, options), - "create catalog")); + PAIMON_ASSIGN_OR_RAISE( + auto catalog, AddContext(paimon::Catalog::Create(root_path, options), "create catalog")); PAIMON_RETURN_NOT_OK( AddContext(catalog->CreateDatabase(db_name, options, true), "create database")); @@ -376,16 +372,15 @@ Status EnsureTable(const std::string& root_path, const std::string& db_name, if (!arrow::ExportSchema(*schema, &c_schema).ok()) { return Status::Invalid("failed to export table schema"); } - PAIMON_RETURN_NOT_OK(AddContext( - catalog->CreateTable(paimon::Identifier(db_name, table_name), &c_schema, - /*partition_keys=*/{}, primary_keys, options, - /*ignore_if_exists=*/false), - "create table")); + PAIMON_RETURN_NOT_OK( + AddContext(catalog->CreateTable(paimon::Identifier(db_name, table_name), &c_schema, + /*partition_keys=*/{}, primary_keys, options, + /*ignore_if_exists=*/false), + "create table")); return Status::OK(); } -Status WriteSourceDataToWriter(paimon::FileStoreWrite* writer, - const SourceDataSpec& source_spec) { +Status WriteSourceDataToWriter(paimon::FileStoreWrite* writer, const SourceDataSpec& source_spec) { if (source_spec.format != "parquet") { return Status::Invalid("unknown source format: ", source_spec.format); } @@ -433,22 +428,18 @@ Status WriteAndCommit(const 
std::string& table_path, const std::map& options, const SourceDataSpec& source_spec) { paimon::WriteContextBuilder write_builder(table_path, "benchmark-writer"); - PAIMON_ASSIGN_OR_RAISE( - auto write_ctx, - AddContext(write_builder.SetOptions(options).Finish(), "create write context")); - PAIMON_ASSIGN_OR_RAISE( - auto writer, - AddContext(paimon::FileStoreWrite::Create(std::move(write_ctx)), - "create file store writer")); + PAIMON_ASSIGN_OR_RAISE(auto write_ctx, AddContext(write_builder.SetOptions(options).Finish(), + "create write context")); + PAIMON_ASSIGN_OR_RAISE(auto writer, + AddContext(paimon::FileStoreWrite::Create(std::move(write_ctx)), + "create file store writer")); PAIMON_RETURN_NOT_OK(WriteSourceDataToWriter(writer.get(), source_spec)); - PAIMON_ASSIGN_OR_RAISE(auto messages, - AddContext(writer->PrepareCommit(), "prepare commit")); + PAIMON_ASSIGN_OR_RAISE(auto messages, AddContext(writer->PrepareCommit(), "prepare commit")); paimon::CommitContextBuilder commit_builder(table_path, "benchmark-writer"); - PAIMON_ASSIGN_OR_RAISE( - auto commit_ctx, - AddContext(commit_builder.SetOptions(options).Finish(), "create commit context")); + PAIMON_ASSIGN_OR_RAISE(auto commit_ctx, AddContext(commit_builder.SetOptions(options).Finish(), + "create commit context")); PAIMON_ASSIGN_OR_RAISE( auto committer, AddContext(paimon::FileStoreCommit::Create(std::move(commit_ctx)), "create committer")); @@ -523,8 +514,8 @@ Result GetOrCreateSharedMorReadTable( return &cache; } -Result GetOrCreateSharedReadTable( - const std::string& file_format, const SourceDataSpec& source_spec) { +Result GetOrCreateSharedReadTable(const std::string& file_format, + const SourceDataSpec& source_spec) { static SharedReadTableCache cache; static std::mutex cache_mutex; @@ -558,11 +549,10 @@ Result ReadRows(const std::string& table_path, const std::map& options, int32_t prefetch_parallel_num) { paimon::ScanContextBuilder scan_builder(table_path); + PAIMON_ASSIGN_OR_RAISE(auto scan_ctx, 
AddContext(scan_builder.SetOptions(options).Finish(), + "create scan context")); PAIMON_ASSIGN_OR_RAISE( - auto scan_ctx, AddContext(scan_builder.SetOptions(options).Finish(), "create scan context")); - PAIMON_ASSIGN_OR_RAISE(auto scanner, - AddContext(paimon::TableScan::Create(std::move(scan_ctx)), - "create scanner")); + auto scanner, AddContext(paimon::TableScan::Create(std::move(scan_ctx)), "create scanner")); PAIMON_ASSIGN_OR_RAISE(auto plan, AddContext(scanner->CreatePlan(), "create plan")); paimon::ReadContextBuilder read_builder(table_path); @@ -573,14 +563,11 @@ Result ReadRows(const std::string& table_path, .SetPrefetchMaxParallelNum(prefetch_parallel_num) .EnableMultiThreadRowToBatch(GetRowToBatchThreadNumber() > 1) .SetRowToBatchThreadNumber(GetRowToBatchThreadNumber()); - PAIMON_ASSIGN_OR_RAISE(auto read_ctx, - AddContext(read_builder.Finish(), "create read context")); - PAIMON_ASSIGN_OR_RAISE( - auto reader, - AddContext(paimon::TableRead::Create(std::move(read_ctx)), "create table reader")); + PAIMON_ASSIGN_OR_RAISE(auto read_ctx, AddContext(read_builder.Finish(), "create read context")); + PAIMON_ASSIGN_OR_RAISE(auto reader, AddContext(paimon::TableRead::Create(std::move(read_ctx)), + "create table reader")); PAIMON_ASSIGN_OR_RAISE(auto batch_reader, - AddContext(reader->CreateReader(plan->Splits()), - "create batch reader")); + AddContext(reader->CreateReader(plan->Splits()), "create batch reader")); int64_t total_rows = 0; while (true) { @@ -713,9 +700,8 @@ void RunBMRead(::benchmark::State& state) { auto options = BuildOptions(file_format); auto source_table_read_result = BenchmarkHelpers::TryRunSourceTableReadMode( - state, "read", source_table_path, [&]() { - return ReadRows(source_table_path, options, prefetch_parallel_num); - }); + state, "read", source_table_path, + [&]() { return ReadRows(source_table_path, options, prefetch_parallel_num); }); if (!source_table_read_result.ok()) { SkipWithMessage(state, 
source_table_read_result.status().ToString()); return; @@ -792,9 +778,9 @@ void RunBMPkWrite(::benchmark::State& state) { for (auto _ : state) { const std::string db_name = "bench_db"; const std::string table_name = "pk_write_" + std::to_string(NextTableId()); - const Status ensure_status = EnsureTable(workspace.value()->Str(), db_name, table_name, - options, prepared.schema, - /*primary_keys=*/pk_columns); + const Status ensure_status = + EnsureTable(workspace.value()->Str(), db_name, table_name, options, prepared.schema, + /*primary_keys=*/pk_columns); if (!ensure_status.ok()) { SkipWithMessage(state, ensure_status.ToString()); return; @@ -829,8 +815,8 @@ void RunBMMorRead(::benchmark::State& state) { } const auto source_table_read_options = BuildOptions(file_format); - auto source_table_read_result = BenchmarkHelpers::TryRunSourceTableReadMode( - state, "mor-read", source_table_path, [&]() { + auto source_table_read_result = + BenchmarkHelpers::TryRunSourceTableReadMode(state, "mor-read", source_table_path, [&]() { return ReadRows(source_table_path, source_table_read_options, prefetch_parallel_num); }); if (!source_table_read_result.ok()) { diff --git a/benchmark/cli_option_parsing.h b/benchmark/cli_option_parsing.h index c7650d3dc..5eef468c5 100644 --- a/benchmark/cli_option_parsing.h +++ b/benchmark/cli_option_parsing.h @@ -45,8 +45,8 @@ inline std::string TrimAsciiWhitespace(const std::string& value) { return value.substr(first, last - first + 1); } -inline Result> ParseCommaSeparatedColumns( - const std::string& input, const std::string& option_name) { +inline Result> ParseCommaSeparatedColumns(const std::string& input, + const std::string& option_name) { if (input.empty()) { return Status::Invalid("missing value for ", option_name); } @@ -127,13 +127,11 @@ inline Result ParseStringOptionArg(int32_t argc, char** argv, const std::s } inline Result ParseCommaSeparatedOptionArg(int32_t argc, char** argv, const std::string& arg, - const std::string& option_name, 
- int32_t* arg_index, + const std::string& option_name, int32_t* arg_index, std::vector* columns_out) { std::string parsed_value; if (ConsumeCliOption(arg, option_name, &parsed_value)) { - PAIMON_ASSIGN_OR_RAISE(*columns_out, - ParseCommaSeparatedColumns(parsed_value, option_name)); + PAIMON_ASSIGN_OR_RAISE(*columns_out, ParseCommaSeparatedColumns(parsed_value, option_name)); return true; } @@ -144,9 +142,8 @@ inline Result ParseCommaSeparatedOptionArg(int32_t argc, char** argv, cons if (*arg_index + 1 >= argc) { return Status::Invalid("missing value for ", option_name); } - PAIMON_ASSIGN_OR_RAISE(*columns_out, - ParseCommaSeparatedColumns(std::string(argv[++(*arg_index)]), - option_name)); + PAIMON_ASSIGN_OR_RAISE( + *columns_out, ParseCommaSeparatedColumns(std::string(argv[++(*arg_index)]), option_name)); return true; } diff --git a/benchmark/cli_option_parsing_test.cpp b/benchmark/cli_option_parsing_test.cpp index 52bbcf5da..8082bdc9d 100644 --- a/benchmark/cli_option_parsing_test.cpp +++ b/benchmark/cli_option_parsing_test.cpp @@ -54,8 +54,7 @@ TEST(CliOptionParsingTest, ConsumeCliOptionWorks) { TEST(CliOptionParsingTest, ParseCommaSeparatedColumnsWorks) { ASSERT_OK_AND_ASSIGN(auto parsed, - paimon::benchmark::ParseCommaSeparatedColumns("id, name,age", - "--cols")); + paimon::benchmark::ParseCommaSeparatedColumns("id, name,age", "--cols")); ASSERT_EQ(parsed.size(), 3U); ASSERT_EQ(parsed[0], "id"); ASSERT_EQ(parsed[1], "name"); @@ -70,17 +69,15 @@ TEST(CliOptionParsingTest, ParseCommaSeparatedColumnsRejectsInvalidInput) { TEST(CliOptionParsingTest, ParseDelimitedOptionsWorks) { ASSERT_OK_AND_ASSIGN( - auto parsed, - paimon::benchmark::ParseDelimitedOptions("k1:v1;k2:v2", "--paimon_option")); + auto parsed, paimon::benchmark::ParseDelimitedOptions("k1:v1;k2:v2", "--paimon_option")); ASSERT_EQ(parsed.size(), 2U); ASSERT_EQ(parsed[0], std::make_pair(std::string("k1"), std::string("v1"))); ASSERT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); } 
TEST(CliOptionParsingTest, ParseDelimitedOptionsTrimsKeyAndValue) { - ASSERT_OK_AND_ASSIGN( - auto parsed, - paimon::benchmark::ParseDelimitedOptions(" k1 : v1 ; k2: v2 ", "--paimon_option")); + ASSERT_OK_AND_ASSIGN(auto parsed, paimon::benchmark::ParseDelimitedOptions(" k1 : v1 ; k2: v2 ", + "--paimon_option")); ASSERT_EQ(parsed.size(), 2U); ASSERT_EQ(parsed[0], std::make_pair(std::string("k1"), std::string("v1"))); ASSERT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); @@ -99,11 +96,10 @@ TEST(CliOptionParsingTest, ParseStringOptionArgWorksForEqualsAndSeparatedForms) ArgvHolder argv_holder({"prog", "--foo=bar"}); int32_t arg_index = 1; std::string value; - ASSERT_OK_AND_ASSIGN( - bool is_parsed, - paimon::benchmark::ParseStringOptionArg( - argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], - "--foo", &arg_index, &value)); + ASSERT_OK_AND_ASSIGN(bool is_parsed, + paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--foo", &arg_index, &value)); ASSERT_TRUE(is_parsed); ASSERT_EQ(arg_index, 1); ASSERT_EQ(value, "bar"); @@ -113,11 +109,10 @@ TEST(CliOptionParsingTest, ParseStringOptionArgWorksForEqualsAndSeparatedForms) ArgvHolder argv_holder({"prog", "--foo", "bar"}); int32_t arg_index = 1; std::string value; - ASSERT_OK_AND_ASSIGN( - bool is_parsed, - paimon::benchmark::ParseStringOptionArg( - argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], - "--foo", &arg_index, &value)); + ASSERT_OK_AND_ASSIGN(bool is_parsed, + paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--foo", &arg_index, &value)); ASSERT_TRUE(is_parsed); ASSERT_EQ(arg_index, 2); ASSERT_EQ(value, "bar"); @@ -128,9 +123,9 @@ TEST(CliOptionParsingTest, ParseStringOptionArgRejectsMissingValue) { ArgvHolder argv_holder({"prog", "--foo"}); int32_t arg_index = 1; std::string value; - 
ASSERT_NOK(paimon::benchmark::ParseStringOptionArg( - argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], "--foo", - &arg_index, &value)); + ASSERT_NOK(paimon::benchmark::ParseStringOptionArg(argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--foo", + &arg_index, &value)); } TEST(CliOptionParsingTest, ParseStringOptionArgIgnoresOtherOptions) { @@ -151,11 +146,10 @@ TEST(CliOptionParsingTest, ParseCommaSeparatedOptionArgAndDelimitedRepeatableOpt ArgvHolder argv_holder({"prog", "--cols", "id,name"}); int32_t arg_index = 1; std::vector columns; - ASSERT_OK_AND_ASSIGN( - bool is_parsed, - paimon::benchmark::ParseCommaSeparatedOptionArg( - argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], - "--cols", &arg_index, &columns)); + ASSERT_OK_AND_ASSIGN(bool is_parsed, + paimon::benchmark::ParseCommaSeparatedOptionArg( + argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--cols", &arg_index, &columns)); ASSERT_TRUE(is_parsed); ASSERT_EQ(arg_index, 2); ASSERT_EQ(columns.size(), 2U); @@ -167,11 +161,10 @@ TEST(CliOptionParsingTest, ParseCommaSeparatedOptionArgAndDelimitedRepeatableOpt ArgvHolder argv_holder({"prog", "--paimon_option", "k1:v1;k2:v2"}); int32_t arg_index = 1; std::vector> options; - ASSERT_OK_AND_ASSIGN( - bool is_parsed, - paimon::benchmark::ParseDelimitedRepeatableOptionArg( - argv_holder.argc(), argv_holder.argv.data(), argv_holder.args[arg_index], - "--paimon_option", &arg_index, &options)); + ASSERT_OK_AND_ASSIGN(bool is_parsed, paimon::benchmark::ParseDelimitedRepeatableOptionArg( + argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--paimon_option", + &arg_index, &options)); ASSERT_TRUE(is_parsed); ASSERT_EQ(arg_index, 2); ASSERT_EQ(options.size(), 2U); From c9e2a1e12d95c0a0d0952fd330bcfa7c666a2004 Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Thu, 4 Jun 2026 16:41:49 +0800 Subject: [PATCH 12/13] fix --- 
benchmark/benchmark_helpers.cpp | 7 +++- benchmark/benchmark_suite.cpp | 72 ++++++++++++++++++++------------- benchmark/cli_option_parsing.h | 14 ++++--- 3 files changed, 58 insertions(+), 35 deletions(-) diff --git a/benchmark/benchmark_helpers.cpp b/benchmark/benchmark_helpers.cpp index 6c9503236..94da33957 100644 --- a/benchmark/benchmark_helpers.cpp +++ b/benchmark/benchmark_helpers.cpp @@ -18,6 +18,8 @@ #include +#include "fmt/format.h" + #include "benchmark/benchmark.h" namespace paimon::benchmark { @@ -26,7 +28,7 @@ bool BenchmarkHelpers::ValidateFileFormatOrSkip(::benchmark::State& state, const std::string& file_format, bool is_supported, SkipFn skip) { if (!is_supported) { - skip(state, "file format is not supported in this build: " + file_format); + skip(state, fmt::format("file format is not supported in this build: {}", file_format)); return false; } return true; @@ -47,7 +49,8 @@ bool BenchmarkHelpers::ValidateSourceSupportOrSkip(::benchmark::State& state, bool is_supported, SkipFn skip) { if (!is_supported) { skip(state, - "source data mode requires reader support in this build for format: " + source_format); + fmt::format("source data mode requires reader support in this build for format: {}", + source_format)); return false; } return true; diff --git a/benchmark/benchmark_suite.cpp b/benchmark/benchmark_suite.cpp index 9968fc132..1d815bf66 100644 --- a/benchmark/benchmark_suite.cpp +++ b/benchmark/benchmark_suite.cpp @@ -316,7 +316,8 @@ Result LoadParquetSourceMetadata(const std::string& path) { return cache; } - PAIMON_ASSIGN_OR_RAISE(auto parquet_reader, OpenParquetSourceReader(path)); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr parquet_reader, + OpenParquetSourceReader(path)); std::shared_ptr schema; const auto schema_status = parquet_reader->GetSchema(&schema); if (!schema_status.ok()) { @@ -364,7 +365,8 @@ Status EnsureTable(const std::string& root_path, const std::string& db_name, const std::shared_ptr& schema, const std::vector& primary_keys = 
{}) { PAIMON_ASSIGN_OR_RAISE( - auto catalog, AddContext(paimon::Catalog::Create(root_path, options), "create catalog")); + std::unique_ptr catalog, + AddContext(paimon::Catalog::Create(root_path, options), "create catalog")); PAIMON_RETURN_NOT_OK( AddContext(catalog->CreateDatabase(db_name, options, true), "create database")); @@ -389,7 +391,8 @@ Status WriteSourceDataToWriter(paimon::FileStoreWrite* writer, const SourceDataS return Status::Invalid( "Parquet source data mode requires parquet::arrow reader support in this build"); #else - PAIMON_ASSIGN_OR_RAISE(auto parquet_reader, OpenParquetSourceReader(source_spec.path)); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr parquet_reader, + OpenParquetSourceReader(source_spec.path)); std::unique_ptr batch_reader; const auto reader_status = parquet_reader->GetRecordBatchReader(&batch_reader); if (!reader_status.ok()) { @@ -412,7 +415,8 @@ Status WriteSourceDataToWriter(paimon::FileStoreWrite* writer, const SourceDataS } auto struct_array = BuildStructArrayFromRecordBatch(record_batch); - PAIMON_ASSIGN_OR_RAISE(auto batch, MakeRecordBatch(struct_array)); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr batch, + MakeRecordBatch(struct_array)); PAIMON_RETURN_NOT_OK(AddContext(writer->Write(std::move(batch)), "write batch")); written_rows += record_batch->num_rows(); } @@ -428,20 +432,24 @@ Status WriteAndCommit(const std::string& table_path, const std::map& options, const SourceDataSpec& source_spec) { paimon::WriteContextBuilder write_builder(table_path, "benchmark-writer"); - PAIMON_ASSIGN_OR_RAISE(auto write_ctx, AddContext(write_builder.SetOptions(options).Finish(), - "create write context")); - PAIMON_ASSIGN_OR_RAISE(auto writer, - AddContext(paimon::FileStoreWrite::Create(std::move(write_ctx)), - "create file store writer")); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr write_ctx, + AddContext(write_builder.SetOptions(options).Finish(), "create write context")); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr writer, + 
AddContext(paimon::FileStoreWrite::Create(std::move(write_ctx)), + "create file store writer")); PAIMON_RETURN_NOT_OK(WriteSourceDataToWriter(writer.get(), source_spec)); - PAIMON_ASSIGN_OR_RAISE(auto messages, AddContext(writer->PrepareCommit(), "prepare commit")); + PAIMON_ASSIGN_OR_RAISE(std::vector> messages, + AddContext(writer->PrepareCommit(), "prepare commit")); paimon::CommitContextBuilder commit_builder(table_path, "benchmark-writer"); - PAIMON_ASSIGN_OR_RAISE(auto commit_ctx, AddContext(commit_builder.SetOptions(options).Finish(), - "create commit context")); PAIMON_ASSIGN_OR_RAISE( - auto committer, + std::unique_ptr commit_ctx, + AddContext(commit_builder.SetOptions(options).Finish(), "create commit context")); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr committer, AddContext(paimon::FileStoreCommit::Create(std::move(commit_ctx)), "create committer")); PAIMON_RETURN_NOT_OK(AddContext(committer->Commit(messages), "commit write")); return Status::OK(); @@ -494,9 +502,11 @@ Result GetOrCreateSharedMorReadTable( } auto options = BuildPkOptions(file_format); - PAIMON_ASSIGN_OR_RAISE(const auto source_metadata, LoadSourceDataMetadata(source_spec)); + PAIMON_ASSIGN_OR_RAISE(const SourceDataMetadata source_metadata, + LoadSourceDataMetadata(source_spec)); - PAIMON_ASSIGN_OR_RAISE(auto workspace, CreateBenchmarkWorkspace()); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr workspace, + CreateBenchmarkWorkspace()); const std::string db_name = "bench_db"; const std::string table_name = "mor_read_shared_" + std::to_string(NextTableId()); PAIMON_RETURN_NOT_OK(EnsureTable(workspace->Str(), db_name, table_name, options, @@ -527,9 +537,11 @@ Result GetOrCreateSharedReadTable(const std::string } auto options = BuildOptions(file_format); - PAIMON_ASSIGN_OR_RAISE(const auto source_metadata, LoadSourceDataMetadata(source_spec)); + PAIMON_ASSIGN_OR_RAISE(const SourceDataMetadata source_metadata, + LoadSourceDataMetadata(source_spec)); - PAIMON_ASSIGN_OR_RAISE(auto workspace, 
CreateBenchmarkWorkspace()); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr workspace, + CreateBenchmarkWorkspace()); const std::string db_name = "bench_db"; const std::string table_name = "read_shared_" + std::to_string(NextTableId()); PAIMON_RETURN_NOT_OK( @@ -549,11 +561,14 @@ Result ReadRows(const std::string& table_path, const std::map& options, int32_t prefetch_parallel_num) { paimon::ScanContextBuilder scan_builder(table_path); - PAIMON_ASSIGN_OR_RAISE(auto scan_ctx, AddContext(scan_builder.SetOptions(options).Finish(), - "create scan context")); PAIMON_ASSIGN_OR_RAISE( - auto scanner, AddContext(paimon::TableScan::Create(std::move(scan_ctx)), "create scanner")); - PAIMON_ASSIGN_OR_RAISE(auto plan, AddContext(scanner->CreatePlan(), "create plan")); + std::unique_ptr scan_ctx, + AddContext(scan_builder.SetOptions(options).Finish(), "create scan context")); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr scanner, + AddContext(paimon::TableScan::Create(std::move(scan_ctx)), "create scanner")); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr plan, + AddContext(scanner->CreatePlan(), "create plan")); paimon::ReadContextBuilder read_builder(table_path); constexpr int32_t kPrefetchBatchCount = 600; @@ -563,15 +578,18 @@ Result ReadRows(const std::string& table_path, .SetPrefetchMaxParallelNum(prefetch_parallel_num) .EnableMultiThreadRowToBatch(GetRowToBatchThreadNumber() > 1) .SetRowToBatchThreadNumber(GetRowToBatchThreadNumber()); - PAIMON_ASSIGN_OR_RAISE(auto read_ctx, AddContext(read_builder.Finish(), "create read context")); - PAIMON_ASSIGN_OR_RAISE(auto reader, AddContext(paimon::TableRead::Create(std::move(read_ctx)), - "create table reader")); - PAIMON_ASSIGN_OR_RAISE(auto batch_reader, - AddContext(reader->CreateReader(plan->Splits()), "create batch reader")); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr read_ctx, + AddContext(read_builder.Finish(), "create read context")); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr reader, + 
AddContext(paimon::TableRead::Create(std::move(read_ctx)), "create table reader")); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr batch_reader, + AddContext(reader->CreateReader(plan->Splits()), "create batch reader")); int64_t total_rows = 0; while (true) { - PAIMON_ASSIGN_OR_RAISE(auto batch, + PAIMON_ASSIGN_OR_RAISE(paimon::BatchReader::ReadBatch batch, AddContext(batch_reader->NextBatch(), "read next batch")); if (paimon::BatchReader::IsEofBatch(batch)) { break; diff --git a/benchmark/cli_option_parsing.h b/benchmark/cli_option_parsing.h index 5eef468c5..477d83af5 100644 --- a/benchmark/cli_option_parsing.h +++ b/benchmark/cli_option_parsing.h @@ -26,6 +26,8 @@ namespace paimon::benchmark { +using ParsedOptions = std::vector>; + inline bool ConsumeCliOption(const std::string& arg, const std::string& option_name, std::string* value_out) { const std::string prefix = option_name + "="; @@ -69,13 +71,13 @@ inline Result> ParseCommaSeparatedColumns(const std::st return columns; } -inline Result>> ParseDelimitedOptions( +inline Result ParseDelimitedOptions( const std::string& input, const std::string& option_name) { if (input.empty()) { return Status::Invalid("missing value for ", option_name); } - std::vector> parsed; + ParsedOptions parsed; std::string token; for (size_t index = 0; index <= input.size(); ++index) { const bool at_end = (index == input.size()); @@ -152,8 +154,8 @@ inline Result ParseDelimitedRepeatableOptionArg( int32_t* arg_index, std::vector>* options_out) { std::string parsed_value; if (ConsumeCliOption(arg, option_name, &parsed_value)) { - PAIMON_ASSIGN_OR_RAISE(const auto parsed_options, - ParseDelimitedOptions(parsed_value, option_name)); + ParsedOptions parsed_options; + PAIMON_ASSIGN_OR_RAISE(parsed_options, ParseDelimitedOptions(parsed_value, option_name)); options_out->insert(options_out->end(), parsed_options.begin(), parsed_options.end()); return true; } @@ -167,8 +169,8 @@ inline Result ParseDelimitedRepeatableOptionArg( } const 
std::string option_arg = argv[++(*arg_index)]; - PAIMON_ASSIGN_OR_RAISE(const auto parsed_options, - ParseDelimitedOptions(option_arg, option_name)); + ParsedOptions parsed_options; + PAIMON_ASSIGN_OR_RAISE(parsed_options, ParseDelimitedOptions(option_arg, option_name)); options_out->insert(options_out->end(), parsed_options.begin(), parsed_options.end()); return true; } From c25bc717080bd25ae299bdb34a0afafbc7ab7c42 Mon Sep 17 00:00:00 2001 From: "yonghao.fyh" Date: Thu, 4 Jun 2026 16:49:42 +0800 Subject: [PATCH 13/13] fix --- benchmark/benchmark_helpers.cpp | 3 +-- benchmark/benchmark_suite.cpp | 12 +++++------- benchmark/cli_option_parsing.h | 4 ++-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/benchmark/benchmark_helpers.cpp b/benchmark/benchmark_helpers.cpp index 94da33957..5dfd23725 100644 --- a/benchmark/benchmark_helpers.cpp +++ b/benchmark/benchmark_helpers.cpp @@ -18,9 +18,8 @@ #include -#include "fmt/format.h" - #include "benchmark/benchmark.h" +#include "fmt/format.h" namespace paimon::benchmark { diff --git a/benchmark/benchmark_suite.cpp b/benchmark/benchmark_suite.cpp index 1d815bf66..9911eaac7 100644 --- a/benchmark/benchmark_suite.cpp +++ b/benchmark/benchmark_suite.cpp @@ -435,10 +435,9 @@ Status WriteAndCommit(const std::string& table_path, PAIMON_ASSIGN_OR_RAISE( std::unique_ptr write_ctx, AddContext(write_builder.SetOptions(options).Finish(), "create write context")); - PAIMON_ASSIGN_OR_RAISE( - std::unique_ptr writer, - AddContext(paimon::FileStoreWrite::Create(std::move(write_ctx)), - "create file store writer")); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr writer, + AddContext(paimon::FileStoreWrite::Create(std::move(write_ctx)), + "create file store writer")); PAIMON_RETURN_NOT_OK(WriteSourceDataToWriter(writer.get(), source_spec)); PAIMON_ASSIGN_OR_RAISE(std::vector> messages, @@ -583,9 +582,8 @@ Result ReadRows(const std::string& table_path, PAIMON_ASSIGN_OR_RAISE( std::unique_ptr reader, 
AddContext(paimon::TableRead::Create(std::move(read_ctx)), "create table reader")); - PAIMON_ASSIGN_OR_RAISE( - std::unique_ptr batch_reader, - AddContext(reader->CreateReader(plan->Splits()), "create batch reader")); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr batch_reader, + AddContext(reader->CreateReader(plan->Splits()), "create batch reader")); int64_t total_rows = 0; while (true) { diff --git a/benchmark/cli_option_parsing.h b/benchmark/cli_option_parsing.h index 477d83af5..f12b52175 100644 --- a/benchmark/cli_option_parsing.h +++ b/benchmark/cli_option_parsing.h @@ -71,8 +71,8 @@ inline Result> ParseCommaSeparatedColumns(const std::st return columns; } -inline Result ParseDelimitedOptions( - const std::string& input, const std::string& option_name) { +inline Result ParseDelimitedOptions(const std::string& input, + const std::string& option_name) { if (input.empty()) { return Status::Invalid("missing value for ", option_name); }