diff --git a/CMakeLists.txt b/CMakeLists.txt
index 124d6c540..154a38d97 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -45,6 +45,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 option(PAIMON_BUILD_STATIC "Build static library" ON)
 option(PAIMON_BUILD_SHARED "Build shared library" ON)
 option(PAIMON_BUILD_TESTS "Build tests" OFF)
+option(PAIMON_BUILD_BENCHMARKS "Build benchmarks" OFF)
 option(PAIMON_USE_ASAN "Use Address Sanitizer" OFF)
 option(PAIMON_USE_UBSAN "Use Undefined Behavior Sanitizer" OFF)
 option(PAIMON_USE_CXX11_ABI "Use C++11 ABI" ON)
@@ -354,6 +355,31 @@ endif()
 
 set(ENV{PAIMON_TEST_DATA} "${CMAKE_SOURCE_DIR}/test/test_data")
 
+# GTest and the file-system / file-format plugin link lists are consumed by both
+# the unit tests and benchmark/, so they are set up once when either is enabled.
+if(PAIMON_BUILD_TESTS OR PAIMON_BUILD_BENCHMARKS)
+    resolve_dependency(GTest)
+    include_directories(SYSTEM ${GTEST_INCLUDE_DIR})
+
+    paimon_link_libraries_whole_archive(PAIMON_LOCAL_FILE_SYSTEM_STATIC_LINK_LIBS
+                                        paimon_local_file_system_static)
+    paimon_link_libraries_no_as_needed(PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS
+                                       paimon_local_file_system_shared)
+    paimon_link_libraries_whole_archive(PAIMON_BLOB_FILE_FORMAT_STATIC_LINK_LIBS
+                                        paimon_blob_file_format_static)
+    paimon_link_libraries_whole_archive(PAIMON_PARQUET_FILE_FORMAT_STATIC_LINK_LIBS
+                                        paimon_parquet_file_format_static)
+
+    if(PAIMON_ENABLE_ORC)
+        paimon_link_libraries_whole_archive(PAIMON_ORC_FILE_FORMAT_STATIC_LINK_LIBS
+                                            paimon_orc_file_format_static)
+    endif()
+    if(PAIMON_ENABLE_AVRO)
+        paimon_link_libraries_whole_archive(PAIMON_AVRO_FILE_FORMAT_STATIC_LINK_LIBS
+                                            paimon_avro_file_format_static)
+    endif()
+endif()
+
 if(PAIMON_BUILD_TESTS)
     if(NOT PAIMON_ENABLE_ORC)
         message(FATAL_ERROR "PAIMON_ENABLE_ORC must be enabled if PAIMON_BUILD_TESTS is enable"
@@ -365,7 +391,6 @@ if(PAIMON_BUILD_TESTS)
     endif()
     # Adding unit tests part of the "paimon" portion of the test suite
     add_custom_target(paimon-tests)
-    resolve_dependency(GTest)
 
     add_custom_target(unittest
                       ctest
@@ -375,7 +400,6 @@ if(PAIMON_BUILD_TESTS)
                       --output-on-failure)
     add_dependencies(unittest paimon-tests)
 
-    include_directories(SYSTEM ${GTEST_INCLUDE_DIR})
     include_directories("${CMAKE_SOURCE_DIR}/test/")
 
     paimon_link_libraries_whole_archive(
@@ -388,15 +412,6 @@ if(PAIMON_BUILD_TESTS)
         TEST_PLUGIN_LINK_LIBS paimon_parquet_file_format_shared paimon_blob_file_format_shared)
     set(TEST_STATIC_LINK_LIBS ${TEST_WHOLE_ARCHIVE_LINK_LIBS} ${TEST_PLUGIN_LINK_LIBS})
 
-    paimon_link_libraries_whole_archive(PAIMON_LOCAL_FILE_SYSTEM_STATIC_LINK_LIBS
-                                        paimon_local_file_system_static)
-    paimon_link_libraries_no_as_needed(PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS
-                                       paimon_local_file_system_shared)
-    paimon_link_libraries_whole_archive(PAIMON_BLOB_FILE_FORMAT_STATIC_LINK_LIBS
-                                        paimon_blob_file_format_static)
-    paimon_link_libraries_whole_archive(PAIMON_PARQUET_FILE_FORMAT_STATIC_LINK_LIBS
-                                        paimon_parquet_file_format_static)
-
     if(PAIMON_ENABLE_LANCE)
         paimon_link_libraries_whole_archive(PAIMON_LANCE_FILE_FORMAT_STATIC_LINK_LIBS
                                             paimon_lance_file_format_static)
@@ -405,15 +420,11 @@ if(PAIMON_BUILD_TESTS)
         list(APPEND TEST_STATIC_LINK_LIBS ${TEST_PLUGIN_LINK_LIBS})
     endif()
     if(PAIMON_ENABLE_ORC)
-        paimon_link_libraries_whole_archive(PAIMON_ORC_FILE_FORMAT_STATIC_LINK_LIBS
-                                            paimon_orc_file_format_static)
         paimon_link_libraries_no_as_needed(TEST_PLUGIN_LINK_LIBS
                                            paimon_orc_file_format_shared)
         list(APPEND TEST_STATIC_LINK_LIBS ${TEST_PLUGIN_LINK_LIBS})
     endif()
     if(PAIMON_ENABLE_AVRO)
-        paimon_link_libraries_whole_archive(PAIMON_AVRO_FILE_FORMAT_STATIC_LINK_LIBS
-                                            paimon_avro_file_format_static)
         paimon_link_libraries_no_as_needed(TEST_PLUGIN_LINK_LIBS
                                            paimon_avro_file_format_shared)
         list(APPEND TEST_STATIC_LINK_LIBS ${TEST_PLUGIN_LINK_LIBS})
@@ -441,6 +452,21 @@ if(PAIMON_BUILD_TESTS)
     endif()
 endif()
 
+# `benchmark` depends on `paimon-benchmarks` and then runs ctest restricted to the
+# "benchmark" label; PAIMON_BENCHMARK_LINK_TOOLCHAIN is read by benchmark/CMakeLists.txt.
+if(PAIMON_BUILD_BENCHMARKS)
+    add_custom_target(paimon-benchmarks)
+    add_custom_target(benchmark
+                      ctest
+                      -j4
+                      -L
+                      benchmark
+                      --output-on-failure)
+    add_dependencies(benchmark paimon-benchmarks)
+
+    set(PAIMON_BENCHMARK_LINK_TOOLCHAIN benchmark::benchmark)
+endif()
+
 paimon_print_dependency_resolution_summary()
 
 include(CMakePackageConfigHelpers)
@@ -472,3 +498,8 @@ add_subdirectory(src/paimon/global_index/lucene)
 add_subdirectory(src/paimon/testing/mock)
 add_subdirectory(src/paimon/testing/utils)
 add_subdirectory(test/inte)
+# benchmark/ also registers cli_option_parsing_test under PAIMON_BUILD_TESTS, so it
+# must be entered when either option is on (it returns early when both are off).
+if(PAIMON_BUILD_BENCHMARKS OR PAIMON_BUILD_TESTS)
+    add_subdirectory(benchmark)
+endif()
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
new file mode 100644
index 000000000..805cdb53b
--- /dev/null
+++ b/benchmark/CMakeLists.txt
@@ -0,0 +1,76 @@
+# Copyright 2026-present Alibaba Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT PAIMON_BUILD_BENCHMARKS AND NOT PAIMON_BUILD_TESTS) # neither consumer of this directory is enabled
+    return()
+endif()
+
+find_package(Threads REQUIRED) # provides Threads::Threads, linked into the benchmark below
+
+set(PAIMON_BENCHMARK_STATIC_LINK_LIBS
+    paimon_shared ${PAIMON_LOCAL_FILE_SYSTEM_SHARED_LINK_LIBS}
+    ${PAIMON_PARQUET_FILE_FORMAT_STATIC_LINK_LIBS}
+    ${PAIMON_BLOB_FILE_FORMAT_STATIC_LINK_LIBS}) # the *_LINK_LIBS lists are populated by the top-level CMakeLists.txt
+
+if(PAIMON_ENABLE_ORC) # ORC plugin is linked only when the build enables it
+    list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS
+         ${PAIMON_ORC_FILE_FORMAT_STATIC_LINK_LIBS})
+endif()
+
+if(PAIMON_ENABLE_AVRO) # Avro plugin is linked only when the build enables it
+    list(APPEND PAIMON_BENCHMARK_STATIC_LINK_LIBS
+         ${PAIMON_AVRO_FILE_FORMAT_STATIC_LINK_LIBS})
+endif()
+
+set(PAIMON_BENCHMARK_PLATFORM_LINK_LIBS) # cleared here; librt is appended below only if it is found
+if(UNIX AND NOT APPLE)
+    find_library(PAIMON_BENCHMARK_RT_LIBRARY rt) # not REQUIRED: a missing librt is tolerated
+    if(PAIMON_BENCHMARK_RT_LIBRARY)
+        list(APPEND PAIMON_BENCHMARK_PLATFORM_LINK_LIBS ${PAIMON_BENCHMARK_RT_LIBRARY})
+    endif()
+endif()
+
+if(PAIMON_BUILD_BENCHMARKS) # the read/write benchmark executable
+    add_paimon_benchmark(read_write_benchmark
+                         SOURCES
+                         benchmark_helpers.cpp
+                         benchmark_suite.cpp
+                         benchmark_case_write.cpp
+                         benchmark_case_read.cpp
+                         benchmark_case_pk_write.cpp
+                         benchmark_case_mor_read.cpp
+                         read_write_benchmark.cpp
+                         STATIC_LINK_LIBS
+                         arrow
+                         parquet
+                         ${PAIMON_BENCHMARK_STATIC_LINK_LIBS}
+                         test_utils_static
+                         Threads::Threads
+                         ${CMAKE_DL_LIBS}
+                         ${PAIMON_BENCHMARK_PLATFORM_LINK_LIBS}
+                         ${PAIMON_BENCHMARK_LINK_TOOLCHAIN}
+                         EXTRA_INCLUDES
+                         ${CMAKE_SOURCE_DIR})
+endif()
+
+if(PAIMON_BUILD_TESTS) # GTest-linked test of the CLI option parsing; independent of PAIMON_BUILD_BENCHMARKS here
+    add_paimon_test(cli_option_parsing_test
+                    SOURCES
+                    cli_option_parsing_test.cpp
+                    EXTRA_INCLUDES
+                    ${CMAKE_SOURCE_DIR}
+                    STATIC_LINK_LIBS
+                    paimon_shared
+                    ${GTEST_LINK_TOOLCHAIN})
+endif()
diff --git a/benchmark/benchmark_case_mor_read.cpp b/benchmark/benchmark_case_mor_read.cpp
new file mode 100644
index 000000000..d96c7e3e0
--- /dev/null
+++ b/benchmark/benchmark_case_mor_read.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "benchmark/benchmark_suite.h"
+
+namespace {  // keeps the adapter local to this translation unit
+
+void BM_MOR_Read(::benchmark::State& state) {  // adapter: hands the state to RunBMMorRead
+    paimon::benchmark::RunBMMorRead(state);
+}
+
+}  // namespace
+
+BENCHMARK(BM_MOR_Read)
+    ->ArgNames({"prefetch_parallel"})  // label for the single argument supplied by Args()
+    ->Unit(benchmark::kMillisecond)
+    ->UseRealTime()  // measure wall-clock time instead of CPU time
+    ->Args({1})
+    ->Args({2})
+    ->Args({4});  // three registered runs: prefetch_parallel = 1, 2 and 4
diff --git a/benchmark/benchmark_case_pk_write.cpp b/benchmark/benchmark_case_pk_write.cpp
new file mode 100644
index 000000000..d18a71d39
--- /dev/null
+++ b/benchmark/benchmark_case_pk_write.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ */
+
+#include "benchmark/benchmark_suite.h"
+
+namespace {  // keeps the adapter local to this translation unit
+
+void BM_PK_Write(::benchmark::State& state) {  // adapter: hands the state to RunBMPkWrite
+    paimon::benchmark::RunBMPkWrite(state);
+}
+
+}  // namespace
+
+BENCHMARK(BM_PK_Write)->Unit(benchmark::kMillisecond)->UseRealTime();  // no Args(): one run, wall-clock ms
diff --git a/benchmark/benchmark_case_read.cpp b/benchmark/benchmark_case_read.cpp
new file mode 100644
index 000000000..71b528d18
--- /dev/null
+++ b/benchmark/benchmark_case_read.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "benchmark/benchmark_suite.h"
+
+namespace {  // keeps the adapter local to this translation unit
+
+void BM_Read(::benchmark::State& state) {  // adapter: hands the state to RunBMRead
+    paimon::benchmark::RunBMRead(state);
+}
+
+}  // namespace
+
+BENCHMARK(BM_Read)
+    ->ArgNames({"prefetch_parallel"})  // label for the single argument supplied by Args()
+    ->Unit(benchmark::kMillisecond)
+    ->UseRealTime()  // measure wall-clock time instead of CPU time
+    ->Args({1})
+    ->Args({2})
+    ->Args({4});  // three registered runs: prefetch_parallel = 1, 2 and 4
diff --git a/benchmark/benchmark_case_write.cpp b/benchmark/benchmark_case_write.cpp
new file mode 100644
index 000000000..4bf34695a
--- /dev/null
+++ b/benchmark/benchmark_case_write.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "benchmark/benchmark_suite.h"
+
+namespace {  // keeps the adapter local to this translation unit
+
+void BM_Write(::benchmark::State& state) {  // adapter: hands the state to RunBMWrite
+    paimon::benchmark::RunBMWrite(state);
+}
+
+}  // namespace
+
+BENCHMARK(BM_Write)->Unit(benchmark::kMillisecond)->UseRealTime();  // no Args(): one run, wall-clock ms
diff --git a/benchmark/benchmark_helpers.cpp b/benchmark/benchmark_helpers.cpp
new file mode 100644
index 000000000..5dfd23725
--- /dev/null
+++ b/benchmark/benchmark_helpers.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2026-present Alibaba Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "benchmark/benchmark_helpers.h" + +#include + +#include "benchmark/benchmark.h" +#include "fmt/format.h" + +namespace paimon::benchmark { + +bool BenchmarkHelpers::ValidateFileFormatOrSkip(::benchmark::State& state, + const std::string& file_format, bool is_supported, + SkipFn skip) { + if (!is_supported) { + skip(state, fmt::format("file format is not supported in this build: {}", file_format)); + return false; + } + return true; +} + +bool BenchmarkHelpers::ValidateSourcePresenceOrSkip(::benchmark::State& state, + const std::string& source_path, + const std::string& message, SkipFn skip) { + if (source_path.empty()) { + skip(state, message); + return false; + } + return true; +} + +bool BenchmarkHelpers::ValidateSourceSupportOrSkip(::benchmark::State& state, + const std::string& source_format, + bool is_supported, SkipFn skip) { + if (!is_supported) { + skip(state, + fmt::format("source data mode requires reader support in this build for format: {}", + source_format)); + return false; + } + return true; +} + +bool BenchmarkHelpers::ValidatePrefetchParallelOrSkip(::benchmark::State& state, + int32_t prefetch_parallel_num, SkipFn skip) { + if (prefetch_parallel_num <= 0) { + skip(state, "prefetch_parallel must be greater than 0"); + return false; + } + return true; +} + +Result BenchmarkHelpers::RunReadIterations(::benchmark::State& state, + const ReadOnceFn& read_once) { + int64_t rows_read = 0; + for (auto _ : state) { + PAIMON_ASSIGN_OR_RAISE(rows_read, read_once()); + } + return rows_read; +} + +Result BenchmarkHelpers::TryRunSourceTableReadMode(::benchmark::State& state, + const std::string& benchmark_name, + const std::string& source_table_path, + const ReadOnceFn& read_once) { + if (source_table_path.empty()) { + return false; + } + + std::cout << "[benchmark][" << benchmark_name << "] source_table_path=" << source_table_path + << std::endl; + PAIMON_ASSIGN_OR_RAISE(const int64_t rows_read, RunReadIterations(state, read_once)); + 
state.SetItemsProcessed(state.iterations() * rows_read); + return true; +} + +} // namespace paimon::benchmark diff --git a/benchmark/benchmark_helpers.h b/benchmark/benchmark_helpers.h new file mode 100644 index 000000000..0c5f58f69 --- /dev/null +++ b/benchmark/benchmark_helpers.h @@ -0,0 +1,59 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/result.h" + +namespace benchmark { +class State; +} + +namespace paimon::benchmark { + +class BenchmarkHelpers { + public: + using ReadOnceFn = std::function()>; + using SkipFn = void (*)(::benchmark::State&, const std::string&); + + static bool ValidateFileFormatOrSkip(::benchmark::State& state, const std::string& file_format, + bool is_supported, SkipFn skip); + + static bool ValidateSourcePresenceOrSkip(::benchmark::State& state, + const std::string& source_path, + const std::string& message, SkipFn skip); + + static bool ValidateSourceSupportOrSkip(::benchmark::State& state, + const std::string& source_format, bool is_supported, + SkipFn skip); + + static bool ValidatePrefetchParallelOrSkip(::benchmark::State& state, + int32_t prefetch_parallel_num, SkipFn skip); + + static Result RunReadIterations(::benchmark::State& state, + const ReadOnceFn& read_once); + + static Result TryRunSourceTableReadMode(::benchmark::State& state, + const std::string& benchmark_name, + const std::string& 
source_table_path, + const ReadOnceFn& read_once); +}; + +} // namespace paimon::benchmark diff --git a/benchmark/benchmark_suite.cpp b/benchmark/benchmark_suite.cpp new file mode 100644 index 000000000..9911eaac7 --- /dev/null +++ b/benchmark/benchmark_suite.cpp @@ -0,0 +1,879 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark/benchmark_suite.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/c/bridge.h" +#include "arrow/io/api.h" +#include "benchmark/benchmark_helpers.h" +#include "benchmark/cli_option_parsing.h" +#include "paimon/api.h" +#include "paimon/catalog/catalog.h" +#include "paimon/testing/utils/testharness.h" + +#if __has_include("parquet/arrow/reader.h") +#include "parquet/arrow/reader.h" +#include "parquet/file_reader.h" +#define PAIMON_BENCHMARK_HAS_PARQUET_READER 1 +#else +#define PAIMON_BENCHMARK_HAS_PARQUET_READER 0 +#endif + +namespace paimon::benchmark { + +namespace { + +constexpr int64_t kSourceBatchMaxRows = 4096; +constexpr int32_t kRowToBatchThreadNumber = 3; + +struct BenchmarkCliOptions { + std::string source_data_file; + std::string source_table_path; + std::vector pk_columns; + std::vector> extra_options; +}; + +struct SourceDataSpec { + std::string format; + std::string path; +}; + +BenchmarkCliOptions& MutableBenchmarkCliOptions() { + static 
BenchmarkCliOptions options; + return options; +} + +const BenchmarkCliOptions& GetBenchmarkCliOptions() { + return MutableBenchmarkCliOptions(); +} + +Status ParsePaimonBenchmarkCliArgsImpl(int32_t* argc, char** argv) { + auto& options = MutableBenchmarkCliOptions(); + options = BenchmarkCliOptions{}; + const int32_t parsed_argc = *argc; + int32_t write_index = 1; + for (int32_t arg_index = 1; arg_index < parsed_argc; ++arg_index) { + const std::string arg(argv[arg_index]); + + PAIMON_ASSIGN_OR_RAISE(bool is_parsed, + paimon::benchmark::ParseStringOptionArg( + parsed_argc, argv, arg, "--paimon_source_data_file", &arg_index, + &options.source_data_file)); + if (is_parsed) { + continue; + } + PAIMON_ASSIGN_OR_RAISE(is_parsed, paimon::benchmark::ParseStringOptionArg( + parsed_argc, argv, arg, "--paimon_source_table_path", + &arg_index, &options.source_table_path)); + if (is_parsed) { + continue; + } + PAIMON_ASSIGN_OR_RAISE(is_parsed, paimon::benchmark::ParseCommaSeparatedOptionArg( + parsed_argc, argv, arg, "--paimon_pk_columns", + &arg_index, &options.pk_columns)); + if (is_parsed) { + continue; + } + PAIMON_ASSIGN_OR_RAISE(is_parsed, paimon::benchmark::ParseDelimitedRepeatableOptionArg( + parsed_argc, argv, arg, "--paimon_option", &arg_index, + &options.extra_options)); + if (is_parsed) { + continue; + } + + argv[write_index++] = argv[arg_index]; + } + + *argc = write_index; + argv[write_index] = nullptr; + return Status::OK(); +} + +bool HasHelpFlagImpl(int32_t argc, char** argv) { + for (int32_t arg_index = 1; arg_index < argc; ++arg_index) { + const std::string arg(argv[arg_index]); + if (arg == "-h" || arg == "--help" || arg == "--help=true") { + return true; + } + } + return false; +} + +void PrintPaimonBenchmarkCliHelpImpl() { + std::cout + << "Paimon benchmark custom options:\n" + << " --paimon_source_data_file=\n" + << " Required. 
External source data file used to build benchmark data.\n" + << " Currently supports Parquet source files.\n" + << " Also supports: --paimon_source_data_file \n" + << " --paimon_source_table_path=\n" + << " Optional for BM_Read and BM_MOR_Read. If set, read directly from existing\n" + << " table path and skip source file loading and pre-write stage.\n" + << " Also supports: --paimon_source_table_path \n" + << " --paimon_pk_columns=\n" + << " Required by BM_PK_Write and BM_MOR_Read.\n" + << " Also supports: --paimon_pk_columns \n" + << " --paimon_option=:;:\n" + << " Optional and repeatable. Pass through table options as-is.\n" + << " Default table file format is parquet; use file.format: to override.\n" + << " Also supports: --paimon_option :;:\n" + << " Note: use quotes in shell, e.g. \"--paimon_option k1:v1;k2:v2\".\n" + << "\n" + << "Example:\n" + << " paimon-read-write-benchmark --paimon_source_data_file /path/data.parquet \\\n" + << " --paimon_pk_columns=id --paimon_option \"read.batch-size:8192\" \\\n" + << " --benchmark_filter=BM_Read\n" + << std::endl; +} + +Result> CreateBenchmarkWorkspace() { + auto workspace = paimon::test::UniqueTestDirectory::Create(); + if (workspace == nullptr) { + return Status::Invalid("failed to create benchmark workspace"); + } + return workspace; +} + +uint64_t NextTableId() { + static std::atomic id{0}; + return ++id; +} + +std::string RequirePath(const std::string& root_path, const std::string& db_name, + const std::string& table_name) { + return root_path + "/" + db_name + ".db/" + table_name; +} + +template +Result AddContext(paimon::Result&& result, const std::string& context) { + if (!result.ok()) { + const Status status = result.status(); + return status.WithMessage(context, ": ", status.message()); + } + return std::move(result).value(); +} + +Status AddContext(const paimon::Status& status, const std::string& context) { + if (!status.ok()) { + return status.WithMessage(context, ": ", status.message()); + } + return 
Status::OK(); +} + +void SkipWithMessage(::benchmark::State& state, const std::string& message) { + state.SkipWithError(message); +} + +std::string GetConfiguredFileFormat() { + std::string file_format = "parquet"; + for (const auto& kv : GetBenchmarkCliOptions().extra_options) { + if (kv.first == paimon::Options::FILE_FORMAT) { + file_format = kv.second; + } + } + return file_format; +} + +bool IsFileFormatSupported(const std::string& format) { + if (format == "parquet") { + return true; + } + if (format == "orc") { +#ifdef PAIMON_ENABLE_ORC + return true; +#else + return false; +#endif + } + return false; +} + +void ApplyExtraOptions(std::map* options) { + for (const auto& kv : GetBenchmarkCliOptions().extra_options) { + (*options)[kv.first] = kv.second; + } +} + +std::map BuildOptions(const std::string& file_format) { + std::map options = { + {paimon::Options::FILE_FORMAT, file_format}, + }; + ApplyExtraOptions(&options); + return options; +} + +std::map BuildPkOptions(const std::string& file_format) { + auto options = BuildOptions(file_format); + options[paimon::Options::BUCKET] = "1"; + options[paimon::Options::MERGE_ENGINE] = "deduplicate"; + return options; +} + +std::string GetSourceDataFilePath() { + return GetBenchmarkCliOptions().source_data_file; +} + +std::string GetSourceTablePath() { + return GetBenchmarkCliOptions().source_table_path; +} + +const std::vector& GetPkColumns() { + return GetBenchmarkCliOptions().pk_columns; +} + +SourceDataSpec GetSourceDataSpec() { + const std::string source_data_file_path = GetSourceDataFilePath(); + if (!source_data_file_path.empty()) { + return {"parquet", source_data_file_path}; + } + return {"", ""}; +} + +int64_t GetSourceBatchMaxRows() { + return kSourceBatchMaxRows; +} + +int32_t GetRowToBatchThreadNumber() { + return kRowToBatchThreadNumber; +} + +bool SupportsParquetSourceDataMode() { +#if PAIMON_BENCHMARK_HAS_PARQUET_READER + return true; +#else + return false; +#endif +} + +bool 
SupportsSourceDataMode(const std::string& source_format) { + if (source_format == "parquet") { + return SupportsParquetSourceDataMode(); + } + return false; +} + +struct SourceDataMetadata { + std::shared_ptr schema; + int64_t total_rows = 0; + std::string format; + std::string path; +}; + +#if PAIMON_BENCHMARK_HAS_PARQUET_READER +Result> OpenParquetSourceReader( + const std::string& path) { + auto input = arrow::io::ReadableFile::Open(path); + if (!input.ok()) { + return Status::Invalid("open Parquet source failed: ", path, ", ", + input.status().ToString()); + } + + std::unique_ptr parquet_reader; + const auto open_status = parquet::arrow::OpenFile( + input.ValueUnsafe(), arrow::default_memory_pool(), &parquet_reader); + if (!open_status.ok()) { + return Status::Invalid("create Parquet reader failed: ", open_status.ToString()); + } + parquet_reader->set_batch_size(GetSourceBatchMaxRows()); + return parquet_reader; +} +#endif + +Result LoadParquetSourceMetadata(const std::string& path) { +#if !PAIMON_BENCHMARK_HAS_PARQUET_READER + return Status::Invalid( + "Parquet source data mode requires parquet::arrow reader support in this build"); +#else + static SourceDataMetadata cache; + static std::mutex cache_mutex; + std::lock_guard lock(cache_mutex); + if (cache.path == path && cache.format == "parquet") { + return cache; + } + + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr parquet_reader, + OpenParquetSourceReader(path)); + std::shared_ptr schema; + const auto schema_status = parquet_reader->GetSchema(&schema); + if (!schema_status.ok()) { + return Status::Invalid("read Parquet source schema failed: ", schema_status.ToString()); + } + + const int64_t total_rows = parquet_reader->parquet_reader()->metadata()->num_rows(); + if (total_rows <= 0) { + return Status::Invalid("Parquet source is empty: ", path); + } + + cache.schema = std::move(schema); + cache.total_rows = total_rows; + cache.format = "parquet"; + cache.path = path; + return cache; +#endif +} + +Result 
LoadSourceDataMetadata(const SourceDataSpec& source_spec) { + if (source_spec.format == "parquet") { + return LoadParquetSourceMetadata(source_spec.path); + } + return Status::Invalid("unknown source format: ", source_spec.format); +} + +std::shared_ptr BuildStructArrayFromRecordBatch( + const std::shared_ptr& batch) { + return std::make_shared(arrow::struct_(batch->schema()->fields()), + batch->num_rows(), batch->columns()); +} + +Result> MakeRecordBatch( + const std::shared_ptr& arr) { + ArrowArray c_array; + if (!arrow::ExportArray(*arr, &c_array).ok()) { + return Status::Invalid("failed to export arrow array"); + } + paimon::RecordBatchBuilder builder(&c_array); + return AddContext(builder.Finish(), "build paimon record batch"); +} + +Status EnsureTable(const std::string& root_path, const std::string& db_name, + const std::string& table_name, const std::map& options, + const std::shared_ptr& schema, + const std::vector& primary_keys = {}) { + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr catalog, + AddContext(paimon::Catalog::Create(root_path, options), "create catalog")); + PAIMON_RETURN_NOT_OK( + AddContext(catalog->CreateDatabase(db_name, options, true), "create database")); + + ArrowSchema c_schema; + if (!arrow::ExportSchema(*schema, &c_schema).ok()) { + return Status::Invalid("failed to export table schema"); + } + PAIMON_RETURN_NOT_OK( + AddContext(catalog->CreateTable(paimon::Identifier(db_name, table_name), &c_schema, + /*partition_keys=*/{}, primary_keys, options, + /*ignore_if_exists=*/false), + "create table")); + return Status::OK(); +} + +Status WriteSourceDataToWriter(paimon::FileStoreWrite* writer, const SourceDataSpec& source_spec) { + if (source_spec.format != "parquet") { + return Status::Invalid("unknown source format: ", source_spec.format); + } + +#if !PAIMON_BENCHMARK_HAS_PARQUET_READER + return Status::Invalid( + "Parquet source data mode requires parquet::arrow reader support in this build"); +#else + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr 
parquet_reader, + OpenParquetSourceReader(source_spec.path)); + std::unique_ptr batch_reader; + const auto reader_status = parquet_reader->GetRecordBatchReader(&batch_reader); + if (!reader_status.ok()) { + return Status::Invalid("create Parquet source batch reader failed: ", + reader_status.ToString()); + } + + int64_t written_rows = 0; + while (true) { + std::shared_ptr record_batch; + const auto read_status = batch_reader->ReadNext(&record_batch); + if (!read_status.ok()) { + return Status::Invalid("read Parquet source batch failed: ", read_status.ToString()); + } + if (record_batch == nullptr) { + break; + } + if (record_batch->num_rows() <= 0) { + continue; + } + + auto struct_array = BuildStructArrayFromRecordBatch(record_batch); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr batch, + MakeRecordBatch(struct_array)); + PAIMON_RETURN_NOT_OK(AddContext(writer->Write(std::move(batch)), "write batch")); + written_rows += record_batch->num_rows(); + } + + if (written_rows <= 0) { + return Status::Invalid("source file has no non-empty data batches: ", source_spec.path); + } + return Status::OK(); +#endif +} + +Status WriteAndCommit(const std::string& table_path, + const std::map& options, + const SourceDataSpec& source_spec) { + paimon::WriteContextBuilder write_builder(table_path, "benchmark-writer"); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr write_ctx, + AddContext(write_builder.SetOptions(options).Finish(), "create write context")); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr writer, + AddContext(paimon::FileStoreWrite::Create(std::move(write_ctx)), + "create file store writer")); + + PAIMON_RETURN_NOT_OK(WriteSourceDataToWriter(writer.get(), source_spec)); + PAIMON_ASSIGN_OR_RAISE(std::vector> messages, + AddContext(writer->PrepareCommit(), "prepare commit")); + + paimon::CommitContextBuilder commit_builder(table_path, "benchmark-writer"); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr commit_ctx, + AddContext(commit_builder.SetOptions(options).Finish(), "create commit 
context")); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr committer, + AddContext(paimon::FileStoreCommit::Create(std::move(commit_ctx)), "create committer")); + PAIMON_RETURN_NOT_OK(AddContext(committer->Commit(messages), "commit write")); + return Status::OK(); +} + +struct SharedReadTableCache { + std::string key; + std::unique_ptr workspace; + std::string table_path; + int64_t total_rows = 0; +}; + +struct SharedMorReadTableCache { + std::string key; + std::unique_ptr workspace; + std::string table_path; + int64_t total_rows = 0; +}; + +std::string BuildReadTableCacheKey(const std::string& file_format, + const SourceDataSpec& source_spec) { + return file_format + "|" + source_spec.format + "|" + source_spec.path + "|" + + std::to_string(GetSourceBatchMaxRows()); +} + +std::string JoinColumns(const std::vector& columns) { + std::string joined; + for (size_t i = 0; i < columns.size(); ++i) { + if (i > 0) { + joined.append(","); + } + joined.append(columns[i]); + } + return joined; +} + +Result GetOrCreateSharedMorReadTable( + const std::string& file_format, const SourceDataSpec& source_spec) { + static SharedMorReadTableCache cache; + static std::mutex cache_mutex; + + const std::vector& pk_columns = GetPkColumns(); + const std::string cache_key = + BuildReadTableCacheKey(file_format, source_spec) + "|pk=" + JoinColumns(pk_columns); + std::lock_guard lock(cache_mutex); + if (cache.workspace != nullptr && cache.key == cache_key) { + std::cout << "[benchmark][mor-read] reuse_output_table_path=" << cache.table_path + << std::endl; + return &cache; + } + + auto options = BuildPkOptions(file_format); + PAIMON_ASSIGN_OR_RAISE(const SourceDataMetadata source_metadata, + LoadSourceDataMetadata(source_spec)); + + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr workspace, + CreateBenchmarkWorkspace()); + const std::string db_name = "bench_db"; + const std::string table_name = "mor_read_shared_" + std::to_string(NextTableId()); + PAIMON_RETURN_NOT_OK(EnsureTable(workspace->Str(), 
db_name, table_name, options, + source_metadata.schema, + /*primary_keys=*/pk_columns)); + const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); + std::cout << "[benchmark][mor-read] create_shared_output_table_path=" << table_path + << std::endl; + PAIMON_RETURN_NOT_OK(WriteAndCommit(table_path, options, source_spec)); + + cache.key = cache_key; + cache.workspace = std::move(workspace); + cache.table_path = table_path; + cache.total_rows = source_metadata.total_rows; + return &cache; +} + +Result GetOrCreateSharedReadTable(const std::string& file_format, + const SourceDataSpec& source_spec) { + static SharedReadTableCache cache; + static std::mutex cache_mutex; + + const std::string cache_key = BuildReadTableCacheKey(file_format, source_spec); + std::lock_guard lock(cache_mutex); + if (cache.workspace != nullptr && cache.key == cache_key) { + std::cout << "[benchmark][read] reuse_output_table_path=" << cache.table_path << std::endl; + return &cache; + } + + auto options = BuildOptions(file_format); + PAIMON_ASSIGN_OR_RAISE(const SourceDataMetadata source_metadata, + LoadSourceDataMetadata(source_spec)); + + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr workspace, + CreateBenchmarkWorkspace()); + const std::string db_name = "bench_db"; + const std::string table_name = "read_shared_" + std::to_string(NextTableId()); + PAIMON_RETURN_NOT_OK( + EnsureTable(workspace->Str(), db_name, table_name, options, source_metadata.schema)); + const std::string table_path = RequirePath(workspace->Str(), db_name, table_name); + std::cout << "[benchmark][read] create_shared_output_table_path=" << table_path << std::endl; + PAIMON_RETURN_NOT_OK(WriteAndCommit(table_path, options, source_spec)); + + cache.key = cache_key; + cache.workspace = std::move(workspace); + cache.table_path = table_path; + cache.total_rows = source_metadata.total_rows; + return &cache; +} + +Result ReadRows(const std::string& table_path, + const std::map& options, + int32_t 
prefetch_parallel_num) { + paimon::ScanContextBuilder scan_builder(table_path); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr scan_ctx, + AddContext(scan_builder.SetOptions(options).Finish(), "create scan context")); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr scanner, + AddContext(paimon::TableScan::Create(std::move(scan_ctx)), "create scanner")); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr plan, + AddContext(scanner->CreatePlan(), "create plan")); + + paimon::ReadContextBuilder read_builder(table_path); + constexpr int32_t kPrefetchBatchCount = 600; + read_builder.SetOptions(options) + .EnablePrefetch(true) + .SetPrefetchBatchCount(kPrefetchBatchCount) + .SetPrefetchMaxParallelNum(prefetch_parallel_num) + .EnableMultiThreadRowToBatch(GetRowToBatchThreadNumber() > 1) + .SetRowToBatchThreadNumber(GetRowToBatchThreadNumber()); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr read_ctx, + AddContext(read_builder.Finish(), "create read context")); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr reader, + AddContext(paimon::TableRead::Create(std::move(read_ctx)), "create table reader")); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr batch_reader, + AddContext(reader->CreateReader(plan->Splits()), "create batch reader")); + + int64_t total_rows = 0; + while (true) { + PAIMON_ASSIGN_OR_RAISE(paimon::BatchReader::ReadBatch batch, + AddContext(batch_reader->NextBatch(), "read next batch")); + if (paimon::BatchReader::IsEofBatch(batch)) { + break; + } + auto& [array, schema] = batch; + auto imported = arrow::ImportArray(array.get(), schema.get()); + if (!imported.ok()) { + return Status::Invalid("import c data array failed: ", imported.status().ToString()); + } + total_rows += imported.ValueUnsafe()->length(); + } + + return total_rows; +} + +struct PreparedSourceData { + std::shared_ptr schema; + int64_t total_rows = 0; +}; + +bool TryGetSourceSpec(::benchmark::State& state, SourceDataSpec* source_spec) { + (void)state; + *source_spec = GetSourceDataSpec(); + return true; +} + +bool 
TryPrepareSourceData(::benchmark::State& state, const SourceDataSpec& source_spec, + PreparedSourceData* prepared) { + auto source_metadata = LoadSourceDataMetadata(source_spec); + if (!source_metadata.ok()) { + SkipWithMessage(state, source_metadata.status().ToString()); + return false; + } + prepared->schema = source_metadata.value().schema; + prepared->total_rows = source_metadata.value().total_rows; + return true; +} + +} // namespace + +Status ParsePaimonBenchmarkCliArgs(int* argc, char** argv) { + auto parsed_argc = static_cast(*argc); + PAIMON_RETURN_NOT_OK(ParsePaimonBenchmarkCliArgsImpl(&parsed_argc, argv)); + *argc = static_cast(parsed_argc); + return Status::OK(); +} + +bool HasHelpFlag(int32_t argc, char** argv) { + return HasHelpFlagImpl(argc, argv); +} + +void PrintPaimonBenchmarkCliHelp() { + PrintPaimonBenchmarkCliHelpImpl(); +} + +void RunBMWrite(::benchmark::State& state) { + const std::string file_format = GetConfiguredFileFormat(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, "--paimon_source_data_file is required", &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, IsFileFormatSupported(file_format), &SkipWithMessage)) { + return; + } + + auto options = BuildOptions(file_format); + PreparedSourceData prepared; + if (!TryPrepareSourceData(state, source_spec, &prepared)) { + return; + } + auto workspace = CreateBenchmarkWorkspace(); + if (!workspace.ok()) { + SkipWithMessage(state, workspace.status().ToString()); + return; + } + + for (auto _ : state) { + const std::string db_name = "bench_db"; + const std::string table_name = "write_" + std::to_string(NextTableId()); + const Status ensure_status = + 
EnsureTable(workspace.value()->Str(), db_name, table_name, options, prepared.schema); + if (!ensure_status.ok()) { + SkipWithMessage(state, ensure_status.ToString()); + return; + } + const std::string table_path = RequirePath(workspace.value()->Str(), db_name, table_name); + std::cout << "[benchmark][write] output_table_path=" << table_path << std::endl; + const Status write_status = WriteAndCommit(table_path, options, source_spec); + if (!write_status.ok()) { + SkipWithMessage(state, write_status.ToString()); + return; + } + } + + state.SetItemsProcessed(state.iterations() * prepared.total_rows); +} + +void RunBMRead(::benchmark::State& state) { + const auto prefetch_parallel_num = static_cast(state.range(0)); + const std::string file_format = GetConfiguredFileFormat(); + const std::string source_table_path = GetSourceTablePath(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, IsFileFormatSupported(file_format), &SkipWithMessage)) { + return; + } + + if (!BenchmarkHelpers::ValidatePrefetchParallelOrSkip(state, prefetch_parallel_num, + &SkipWithMessage)) { + return; + } + + auto options = BuildOptions(file_format); + + auto source_table_read_result = BenchmarkHelpers::TryRunSourceTableReadMode( + state, "read", source_table_path, + [&]() { return ReadRows(source_table_path, options, prefetch_parallel_num); }); + if (!source_table_read_result.ok()) { + SkipWithMessage(state, source_table_read_result.status().ToString()); + return; + } + if (source_table_read_result.value()) { + return; + } + + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, + "--paimon_source_data_file is required when --paimon_source_table_path is not set", + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + 
} + + auto shared_table = GetOrCreateSharedReadTable(file_format, source_spec); + if (!shared_table.ok()) { + SkipWithMessage(state, shared_table.status().ToString()); + return; + } + + auto rows_read = BenchmarkHelpers::RunReadIterations(state, [&]() { + return ReadRows(shared_table.value()->table_path, options, prefetch_parallel_num); + }); + if (!rows_read.ok()) { + SkipWithMessage(state, rows_read.status().ToString()); + return; + } + + state.SetItemsProcessed(state.iterations() * rows_read.value()); +} + +void RunBMPkWrite(::benchmark::State& state) { + const std::string file_format = GetConfiguredFileFormat(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, "--paimon_source_data_file is required", &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, IsFileFormatSupported(file_format), &SkipWithMessage)) { + return; + } + const std::vector& pk_columns = GetPkColumns(); + if (pk_columns.empty()) { + SkipWithMessage(state, "--paimon_pk_columns is required for BM_PK_Write"); + return; + } + + auto options = BuildPkOptions(file_format); + PreparedSourceData prepared; + if (!TryPrepareSourceData(state, source_spec, &prepared)) { + return; + } + auto workspace = CreateBenchmarkWorkspace(); + if (!workspace.ok()) { + SkipWithMessage(state, workspace.status().ToString()); + return; + } + + for (auto _ : state) { + const std::string db_name = "bench_db"; + const std::string table_name = "pk_write_" + std::to_string(NextTableId()); + const Status ensure_status = + EnsureTable(workspace.value()->Str(), db_name, table_name, options, prepared.schema, + /*primary_keys=*/pk_columns); + if (!ensure_status.ok()) { + SkipWithMessage(state, 
ensure_status.ToString()); + return; + } + const std::string table_path = RequirePath(workspace.value()->Str(), db_name, table_name); + std::cout << "[benchmark][pk-write] output_table_path=" << table_path << std::endl; + const Status write_status = WriteAndCommit(table_path, options, source_spec); + if (!write_status.ok()) { + SkipWithMessage(state, write_status.ToString()); + return; + } + } + + state.SetItemsProcessed(state.iterations() * prepared.total_rows); +} + +void RunBMMorRead(::benchmark::State& state) { + const auto prefetch_parallel_num = static_cast(state.range(0)); + const std::string file_format = GetConfiguredFileFormat(); + const std::string source_table_path = GetSourceTablePath(); + SourceDataSpec source_spec; + if (!TryGetSourceSpec(state, &source_spec)) { + return; + } + if (!BenchmarkHelpers::ValidateFileFormatOrSkip( + state, file_format, IsFileFormatSupported(file_format), &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidatePrefetchParallelOrSkip(state, prefetch_parallel_num, + &SkipWithMessage)) { + return; + } + + const auto source_table_read_options = BuildOptions(file_format); + auto source_table_read_result = + BenchmarkHelpers::TryRunSourceTableReadMode(state, "mor-read", source_table_path, [&]() { + return ReadRows(source_table_path, source_table_read_options, prefetch_parallel_num); + }); + if (!source_table_read_result.ok()) { + SkipWithMessage(state, source_table_read_result.status().ToString()); + return; + } + if (source_table_read_result.value()) { + return; + } + + if (!BenchmarkHelpers::ValidateSourcePresenceOrSkip( + state, source_spec.path, + "--paimon_source_data_file is required when --paimon_source_table_path is not set", + &SkipWithMessage)) { + return; + } + if (!BenchmarkHelpers::ValidateSourceSupportOrSkip(state, source_spec.format, + SupportsSourceDataMode(source_spec.format), + &SkipWithMessage)) { + return; + } + if (GetPkColumns().empty()) { + SkipWithMessage(state, "--paimon_pk_columns is 
required for BM_MOR_Read"); + return; + } + + auto options = BuildPkOptions(file_format); + auto shared_table = GetOrCreateSharedMorReadTable(file_format, source_spec); + if (!shared_table.ok()) { + SkipWithMessage(state, shared_table.status().ToString()); + return; + } + + auto rows_read = BenchmarkHelpers::RunReadIterations(state, [&]() { + return ReadRows(shared_table.value()->table_path, options, prefetch_parallel_num); + }); + if (!rows_read.ok()) { + SkipWithMessage(state, rows_read.status().ToString()); + return; + } + state.SetItemsProcessed(state.iterations() * rows_read.value()); +} + +} // namespace paimon::benchmark diff --git a/benchmark/benchmark_suite.h b/benchmark/benchmark_suite.h new file mode 100644 index 000000000..8b540e7e9 --- /dev/null +++ b/benchmark/benchmark_suite.h @@ -0,0 +1,35 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include "benchmark/benchmark.h" +#include "paimon/status.h" + +namespace paimon::benchmark { + +Status ParsePaimonBenchmarkCliArgs(int* argc, char** argv); +bool HasHelpFlag(int32_t argc, char** argv); +void PrintPaimonBenchmarkCliHelp(); + +void RunBMWrite(::benchmark::State& state); +void RunBMRead(::benchmark::State& state); +void RunBMPkWrite(::benchmark::State& state); +void RunBMMorRead(::benchmark::State& state); + +} // namespace paimon::benchmark diff --git a/benchmark/cli_option_parsing.h b/benchmark/cli_option_parsing.h new file mode 100644 index 000000000..f12b52175 --- /dev/null +++ b/benchmark/cli_option_parsing.h @@ -0,0 +1,178 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include + +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon::benchmark { + +using ParsedOptions = std::vector>; + +inline bool ConsumeCliOption(const std::string& arg, const std::string& option_name, + std::string* value_out) { + const std::string prefix = option_name + "="; + if (arg.rfind(prefix, 0) != 0) { + return false; + } + *value_out = arg.substr(prefix.size()); + return true; +} + +inline std::string TrimAsciiWhitespace(const std::string& value) { + const auto first = value.find_first_not_of(" \t\n\r"); + if (first == std::string::npos) { + return ""; + } + const auto last = value.find_last_not_of(" \t\n\r"); + return value.substr(first, last - first + 1); +} + +inline Result> ParseCommaSeparatedColumns(const std::string& input, + const std::string& option_name) { + if (input.empty()) { + return Status::Invalid("missing value for ", option_name); + } + + std::vector columns; + size_t segment_start = 0; + for (size_t index = 0; index <= input.size(); ++index) { + if (index != input.size() && input[index] != ',') { + continue; + } + + const std::string column = + TrimAsciiWhitespace(input.substr(segment_start, index - segment_start)); + if (column.empty()) { + return Status::Invalid("invalid ", option_name, ": empty column name"); + } + columns.push_back(column); + segment_start = index + 1; + } + return columns; +} + +inline Result ParseDelimitedOptions(const std::string& input, + const std::string& option_name) { + if (input.empty()) { + return Status::Invalid("missing value for ", option_name); + } + + ParsedOptions parsed; + std::string token; + for (size_t index = 0; index <= input.size(); ++index) { + const bool at_end = (index == input.size()); + if (!at_end && input[index] != ';') { + token.push_back(input[index]); + continue; + } + + const std::string segment = TrimAsciiWhitespace(token); + if (segment.empty()) { + return Status::Invalid("invalid ", option_name, ": 
empty option segment"); + } + + const auto separator = segment.find(':'); + if (separator == std::string::npos) { + return Status::Invalid("invalid ", option_name, ": expected key:value"); + } + + const std::string key = TrimAsciiWhitespace(segment.substr(0, separator)); + const std::string value = TrimAsciiWhitespace(segment.substr(separator + 1)); + if (key.empty() || value.empty()) { + return Status::Invalid("invalid ", option_name, ": expected key:value"); + } + + parsed.emplace_back(key, value); + token.clear(); + } + return parsed; +} + +inline Result ParseStringOptionArg(int32_t argc, char** argv, const std::string& arg, + const std::string& option_name, int32_t* arg_index, + std::string* value_out) { + std::string parsed_value; + if (ConsumeCliOption(arg, option_name, &parsed_value)) { + *value_out = std::move(parsed_value); + return true; + } + + if (arg != option_name) { + return false; + } + + if (*arg_index + 1 >= argc) { + return Status::Invalid("missing value for ", option_name); + } + *value_out = argv[++(*arg_index)]; + return true; +} + +inline Result ParseCommaSeparatedOptionArg(int32_t argc, char** argv, const std::string& arg, + const std::string& option_name, int32_t* arg_index, + std::vector* columns_out) { + std::string parsed_value; + if (ConsumeCliOption(arg, option_name, &parsed_value)) { + PAIMON_ASSIGN_OR_RAISE(*columns_out, ParseCommaSeparatedColumns(parsed_value, option_name)); + return true; + } + + if (arg != option_name) { + return false; + } + + if (*arg_index + 1 >= argc) { + return Status::Invalid("missing value for ", option_name); + } + PAIMON_ASSIGN_OR_RAISE( + *columns_out, ParseCommaSeparatedColumns(std::string(argv[++(*arg_index)]), option_name)); + return true; +} + +inline Result ParseDelimitedRepeatableOptionArg( + int32_t argc, char** argv, const std::string& arg, const std::string& option_name, + int32_t* arg_index, std::vector>* options_out) { + std::string parsed_value; + if (ConsumeCliOption(arg, option_name, 
&parsed_value)) { + ParsedOptions parsed_options; + PAIMON_ASSIGN_OR_RAISE(parsed_options, ParseDelimitedOptions(parsed_value, option_name)); + options_out->insert(options_out->end(), parsed_options.begin(), parsed_options.end()); + return true; + } + + if (arg != option_name) { + return false; + } + + if (*arg_index + 1 >= argc) { + return Status::Invalid("missing value for ", option_name); + } + + const std::string option_arg = argv[++(*arg_index)]; + ParsedOptions parsed_options; + PAIMON_ASSIGN_OR_RAISE(parsed_options, ParseDelimitedOptions(option_arg, option_name)); + options_out->insert(options_out->end(), parsed_options.begin(), parsed_options.end()); + return true; +} + +} // namespace paimon::benchmark diff --git a/benchmark/cli_option_parsing_test.cpp b/benchmark/cli_option_parsing_test.cpp new file mode 100644 index 000000000..8082bdc9d --- /dev/null +++ b/benchmark/cli_option_parsing_test.cpp @@ -0,0 +1,177 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "benchmark/cli_option_parsing.h" + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::testing { +namespace { + +struct ArgvHolder { + std::vector args; + std::vector argv; + + explicit ArgvHolder(std::vector in_args) : args(std::move(in_args)) { + argv.reserve(args.size()); + for (auto& arg : args) { + argv.push_back(arg.data()); + } + } + + int32_t argc() const { + return static_cast(argv.size()); + } +}; + +TEST(CliOptionParsingTest, ConsumeCliOptionWorks) { + std::string value; + ASSERT_TRUE(paimon::benchmark::ConsumeCliOption("--foo=bar", "--foo", &value)); + ASSERT_EQ(value, "bar"); + + value.clear(); + ASSERT_FALSE(paimon::benchmark::ConsumeCliOption("--foo", "--foo", &value)); +} + +TEST(CliOptionParsingTest, ParseCommaSeparatedColumnsWorks) { + ASSERT_OK_AND_ASSIGN(auto parsed, + paimon::benchmark::ParseCommaSeparatedColumns("id, name,age", "--cols")); + ASSERT_EQ(parsed.size(), 3U); + ASSERT_EQ(parsed[0], "id"); + ASSERT_EQ(parsed[1], "name"); + ASSERT_EQ(parsed[2], "age"); +} + +TEST(CliOptionParsingTest, ParseCommaSeparatedColumnsRejectsInvalidInput) { + ASSERT_NOK(paimon::benchmark::ParseCommaSeparatedColumns("", "--cols")); + ASSERT_NOK(paimon::benchmark::ParseCommaSeparatedColumns("id,", "--cols")); + ASSERT_NOK(paimon::benchmark::ParseCommaSeparatedColumns("id,,name", "--cols")); +} + +TEST(CliOptionParsingTest, ParseDelimitedOptionsWorks) { + ASSERT_OK_AND_ASSIGN( + auto parsed, paimon::benchmark::ParseDelimitedOptions("k1:v1;k2:v2", "--paimon_option")); + ASSERT_EQ(parsed.size(), 2U); + ASSERT_EQ(parsed[0], std::make_pair(std::string("k1"), std::string("v1"))); + ASSERT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); +} + +TEST(CliOptionParsingTest, ParseDelimitedOptionsTrimsKeyAndValue) { + ASSERT_OK_AND_ASSIGN(auto parsed, paimon::benchmark::ParseDelimitedOptions(" k1 : v1 ; k2: v2 ", + "--paimon_option")); + 
ASSERT_EQ(parsed.size(), 2U); + ASSERT_EQ(parsed[0], std::make_pair(std::string("k1"), std::string("v1"))); + ASSERT_EQ(parsed[1], std::make_pair(std::string("k2"), std::string("v2"))); +} + +TEST(CliOptionParsingTest, ParseDelimitedOptionsRejectsInvalidInput) { + ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions("", "--paimon_option")); + ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions("k1:v1;", "--paimon_option")); + ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions("k1:", "--paimon_option")); + ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions(":v1", "--paimon_option")); + ASSERT_NOK(paimon::benchmark::ParseDelimitedOptions("k1: ", "--paimon_option")); +} + +TEST(CliOptionParsingTest, ParseStringOptionArgWorksForEqualsAndSeparatedForms) { + { + ArgvHolder argv_holder({"prog", "--foo=bar"}); + int32_t arg_index = 1; + std::string value; + ASSERT_OK_AND_ASSIGN(bool is_parsed, + paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--foo", &arg_index, &value)); + ASSERT_TRUE(is_parsed); + ASSERT_EQ(arg_index, 1); + ASSERT_EQ(value, "bar"); + } + + { + ArgvHolder argv_holder({"prog", "--foo", "bar"}); + int32_t arg_index = 1; + std::string value; + ASSERT_OK_AND_ASSIGN(bool is_parsed, + paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--foo", &arg_index, &value)); + ASSERT_TRUE(is_parsed); + ASSERT_EQ(arg_index, 2); + ASSERT_EQ(value, "bar"); + } +} + +TEST(CliOptionParsingTest, ParseStringOptionArgRejectsMissingValue) { + ArgvHolder argv_holder({"prog", "--foo"}); + int32_t arg_index = 1; + std::string value; + ASSERT_NOK(paimon::benchmark::ParseStringOptionArg(argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--foo", + &arg_index, &value)); +} + +TEST(CliOptionParsingTest, ParseStringOptionArgIgnoresOtherOptions) { + ArgvHolder argv_holder({"prog", "--bar=baz"}); + int32_t arg_index = 1; + 
std::string value; + ASSERT_OK_AND_ASSIGN(bool is_parsed, + paimon::benchmark::ParseStringOptionArg( + argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--foo", &arg_index, &value)); + ASSERT_FALSE(is_parsed); + ASSERT_EQ(arg_index, 1); + ASSERT_TRUE(value.empty()); +} + +TEST(CliOptionParsingTest, ParseCommaSeparatedOptionArgAndDelimitedRepeatableOptionArgWorks) { + { + ArgvHolder argv_holder({"prog", "--cols", "id,name"}); + int32_t arg_index = 1; + std::vector columns; + ASSERT_OK_AND_ASSIGN(bool is_parsed, + paimon::benchmark::ParseCommaSeparatedOptionArg( + argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--cols", &arg_index, &columns)); + ASSERT_TRUE(is_parsed); + ASSERT_EQ(arg_index, 2); + ASSERT_EQ(columns.size(), 2U); + ASSERT_EQ(columns[0], "id"); + ASSERT_EQ(columns[1], "name"); + } + + { + ArgvHolder argv_holder({"prog", "--paimon_option", "k1:v1;k2:v2"}); + int32_t arg_index = 1; + std::vector> options; + ASSERT_OK_AND_ASSIGN(bool is_parsed, paimon::benchmark::ParseDelimitedRepeatableOptionArg( + argv_holder.argc(), argv_holder.argv.data(), + argv_holder.args[arg_index], "--paimon_option", + &arg_index, &options)); + ASSERT_TRUE(is_parsed); + ASSERT_EQ(arg_index, 2); + ASSERT_EQ(options.size(), 2U); + ASSERT_EQ(options[0], std::make_pair(std::string("k1"), std::string("v1"))); + ASSERT_EQ(options[1], std::make_pair(std::string("k2"), std::string("v2"))); + } +} + +} // namespace +} // namespace paimon::testing diff --git a/benchmark/read_write_benchmark.cpp b/benchmark/read_write_benchmark.cpp new file mode 100644 index 000000000..398e7f794 --- /dev/null +++ b/benchmark/read_write_benchmark.cpp @@ -0,0 +1,43 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include "benchmark/benchmark.h"
+#include "benchmark/benchmark_suite.h"
+
+int main(int argc, char** argv) {
+    if (paimon::benchmark::HasHelpFlag(static_cast(argc), argv)) {
+        paimon::benchmark::PrintPaimonBenchmarkCliHelp();
+        return 0;
+    }
+
+    const paimon::Status parse_status = paimon::benchmark::ParsePaimonBenchmarkCliArgs(&argc, argv);
+    if (!parse_status.ok()) {
+        std::cerr << "paimon-read-write-benchmark: " << parse_status.ToString() << std::endl;
+        std::cerr << "Try 'paimon-read-write-benchmark --help' for more information." << std::endl;
+        return 1;
+    }
+
+    benchmark::Initialize(&argc, argv);
+    if (benchmark::ReportUnrecognizedArguments(argc, argv)) {
+        return 1;
+    }
+    benchmark::RunSpecifiedBenchmarks();
+    benchmark::Shutdown();
+    return 0;
+}
diff --git a/cmake_modules/BuildUtils.cmake b/cmake_modules/BuildUtils.cmake
index ed27ed786..d6d3b4a58 100644
--- a/cmake_modules/BuildUtils.cmake
+++ b/cmake_modules/BuildUtils.cmake
@@ -404,3 +404,142 @@ function(add_paimon_test REL_TEST_NAME)
                   ${PCH_ARGS}
                   ${ARG_UNPARSED_ARGUMENTS})
 endfunction()
+
+# add_benchmark_case(<rel_name> [ENABLED] [PREFIX <prefix>]
+#                    [SOURCES <src>...] [STATIC_LINK_LIBS <lib>...]
+#                    [EXTRA_LINK_LIBS <lib>...] [EXTRA_INCLUDES <dir>...]
+#                    [LABELS <label>...])
+#
+# Builds one benchmark executable and registers it as a CTest test that is
+# launched through ${BUILD_SUPPORT_DIR}/run-test.sh.
+#
+#   <rel_name>        Source path without ".cpp". Its base name, prefixed with
+#                     "<prefix>-" and with "_" turned into "-", becomes both the
+#                     target name and the test name.
+#   ENABLED           Build even when PAIMON_BUILD_BENCHMARKS is OFF.
+#   PREFIX            Single value prepended to the target name.
+#   SOURCES           Source files; defaults to "<rel_name>.cpp".
+#   STATIC_LINK_LIBS  Libraries linked PRIVATE into the executable.
+#   EXTRA_LINK_LIBS   Further libraries linked PRIVATE into the executable.
+#   EXTRA_INCLUDES    Additional SYSTEM include directories.
+#   LABELS            CTest labels added next to the implicit "benchmark" label.
+#                     A label that names an existing target is made to depend on
+#                     the executable, and a "benchmark-<label>" target running
+#                     `ctest -L <label>` is created on first use.
+function(add_benchmark_case REL_BENCHMARK_NAME)
+  set(options ENABLED)
+  # PREFIX is spliced into the target name, so it has to be a single value.
+  set(one_value_args PREFIX)
+  set(multi_value_args
+      SOURCES
+      STATIC_LINK_LIBS
+      EXTRA_LINK_LIBS
+      EXTRA_INCLUDES
+      LABELS)
+  cmake_parse_arguments(ARG
+                        "${options}"
+                        "${one_value_args}"
+                        "${multi_value_args}"
+                        ${ARGN})
+  if(ARG_UNPARSED_ARGUMENTS)
+    message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}")
+  endif()
+
+  if(NOT PAIMON_BUILD_BENCHMARKS AND NOT ARG_ENABLED)
+    return()
+  endif()
+
+  get_filename_component(BENCHMARK_NAME "${REL_BENCHMARK_NAME}" NAME_WE)
+  if(ARG_PREFIX)
+    set(BENCHMARK_NAME "${ARG_PREFIX}-${BENCHMARK_NAME}")
+  endif()
+  string(REPLACE "_" "-" BENCHMARK_NAME "${BENCHMARK_NAME}")
+
+  if(ARG_SOURCES)
+    set(SOURCES ${ARG_SOURCES})
+  else()
+    set(SOURCES "${REL_BENCHMARK_NAME}.cpp")
+  endif()
+
+  message(STATUS "${BENCHMARK_NAME}")
+  add_executable(${BENCHMARK_NAME} ${SOURCES})
+
+  if(ARG_STATIC_LINK_LIBS)
+    target_link_libraries(${BENCHMARK_NAME} PRIVATE ${ARG_STATIC_LINK_LIBS})
+  endif()
+  if(ARG_EXTRA_LINK_LIBS)
+    target_link_libraries(${BENCHMARK_NAME} PRIVATE ${ARG_EXTRA_LINK_LIBS})
+  endif()
+  if(ARG_EXTRA_INCLUDES)
+    # An executable has no consumers, so PRIVATE is sufficient.
+    target_include_directories(${BENCHMARK_NAME} SYSTEM PRIVATE ${ARG_EXTRA_INCLUDES})
+  endif()
+
+  # AppleClang reports its own compiler ID but accepts the same flag as Clang.
+  if(CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
+    target_compile_options(${BENCHMARK_NAME} PRIVATE -Wno-global-constructors)
+  endif()
+  # -fno-access-control turns off C++ access checking for the benchmark sources.
+  target_compile_options(${BENCHMARK_NAME} PRIVATE -fno-access-control)
+
+  # $<TARGET_FILE:...> yields the real artifact path for every generator and
+  # output-directory setting; it is only expanded by the NAME/COMMAND signature.
+  add_test(NAME ${BENCHMARK_NAME}
+           COMMAND ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark
+                   $<TARGET_FILE:${BENCHMARK_NAME}>)
+
+  set(LABELS "benchmark" ${ARG_LABELS})
+  foreach(LABEL ${ARG_LABELS})
+    # add_dependencies() fails on a name that is not a target, so plain labels
+    # are only used for test selection.
+    if(TARGET ${LABEL})
+      add_dependencies(${LABEL} ${BENCHMARK_NAME})
+    endif()
+
+    set(LABEL_BENCHMARK_NAME "benchmark-${LABEL}")
+    if(NOT TARGET ${LABEL_BENCHMARK_NAME})
+      add_custom_target(${LABEL_BENCHMARK_NAME}
+                        ctest -L "${LABEL}" --output-on-failure
+                        USES_TERMINAL)
+    endif()
+    add_dependencies(${LABEL_BENCHMARK_NAME} ${BENCHMARK_NAME})
+  endforeach()
+
+  set_property(TEST ${BENCHMARK_NAME}
+               APPEND
+               PROPERTY LABELS ${LABELS})
+endfunction()
+
+# add_paimon_benchmark(<rel_name> [PREFIX <prefix>] [LABELS <label>...] [<args>...])
+#
+# Wrapper around add_benchmark_case() that defaults PREFIX to "paimon" and
+# LABELS to "paimon-benchmarks"; every other argument is forwarded unchanged.
+function(add_paimon_benchmark REL_BENCHMARK_NAME)
+  set(options)
+  set(one_value_args PREFIX)
+  set(multi_value_args LABELS)
+  cmake_parse_arguments(ARG
+                        "${options}"
+                        "${one_value_args}"
+                        "${multi_value_args}"
+                        ${ARGN})
+
+  if(ARG_PREFIX)
+    set(PREFIX ${ARG_PREFIX})
+  else()
+    set(PREFIX "paimon")
+  endif()
+
+  if(ARG_LABELS)
+    set(LABELS ${ARG_LABELS})
+  else()
+    set(LABELS "paimon-benchmarks")
+  endif()
+
+  add_benchmark_case(${REL_BENCHMARK_NAME}
+                     PREFIX
+                     ${PREFIX}
+                     LABELS
+                     ${LABELS}
+                     ${ARG_UNPARSED_ARGUMENTS})
+endfunction()
diff --git a/cmake_modules/DefineOptions.cmake b/cmake_modules/DefineOptions.cmake
index 41ff252c5..cf7f964be 100644
--- a/cmake_modules/DefineOptions.cmake
+++ b/cmake_modules/DefineOptions.cmake
@@ -107,6 +107,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
 
   define_option(PAIMON_BUILD_TESTS "Build the Paimon googletest unit tests" OFF)
 
+  define_option(PAIMON_BUILD_BENCHMARKS
+                "Build the Paimon Google Benchmark performance benchmarks" OFF)
+
   if(PAIMON_BUILD_SHARED)
     set(PAIMON_TEST_LINKAGE_DEFAULT "shared")
   else()
@@ -241,6 +244,13 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
                        AUTO
                        BUNDLED
                        SYSTEM)
+
+  define_option_string(benchmark_SOURCE
+                       "Dependency source for Google Benchmark"
+                       ""
+                       AUTO
+                       BUNDLED
+                       SYSTEM)
 endif()
 
 macro(validate_config)
diff --git a/cmake_modules/FindbenchmarkAlt.cmake b/cmake_modules/FindbenchmarkAlt.cmake
new file mode 100644
index 000000000..a731f3359
--- /dev/null
+++ b/cmake_modules/FindbenchmarkAlt.cmake
@@ -0,0 +1,60 @@
+# Copyright 2026-present Alibaba Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Locates Google Benchmark. benchmark_ROOT, BENCHMARK_ROOT and
+# PAIMON_PACKAGE_PREFIX, when set, are the only locations searched. Defines
+# benchmark::benchmark if the package config did not; sets benchmarkAlt_FOUND.
+set(_PAIMON_BENCHMARK_ROOTS ${benchmark_ROOT} ${BENCHMARK_ROOT} ${PAIMON_PACKAGE_PREFIX})
+if(_PAIMON_BENCHMARK_ROOTS)
+  # Kept inside the guard: before CMake 3.14 REMOVE_ITEM rejects an unset list.
+  list(REMOVE_ITEM _PAIMON_BENCHMARK_ROOTS "")
+  set(_PAIMON_BENCHMARK_FIND_ARGS HINTS ${_PAIMON_BENCHMARK_ROOTS} NO_DEFAULT_PATH)
+endif()
+
+find_package(benchmark CONFIG QUIET ${_PAIMON_BENCHMARK_FIND_ARGS})
+
+if(NOT TARGET benchmark::benchmark)
+  find_path(BENCHMARK_INCLUDE_DIR
+            NAMES benchmark/benchmark.h ${_PAIMON_BENCHMARK_FIND_ARGS}
+            PATH_SUFFIXES include)
+  find_library(BENCHMARK_LIBRARY
+               NAMES benchmark ${_PAIMON_BENCHMARK_FIND_ARGS}
+               PATH_SUFFIXES lib lib64)
+  find_library(BENCHMARK_MAIN_LIBRARY
+               NAMES benchmark_main ${_PAIMON_BENCHMARK_FIND_ARGS}
+               PATH_SUFFIXES lib lib64)
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(benchmarkAlt REQUIRED_VARS BENCHMARK_INCLUDE_DIR
+                                    BENCHMARK_LIBRARY)
+
+  if(benchmarkAlt_FOUND)
+    add_library(benchmark::benchmark UNKNOWN IMPORTED)
+    set_target_properties(benchmark::benchmark
+                          PROPERTIES IMPORTED_LOCATION "${BENCHMARK_LIBRARY}"
+                                     INTERFACE_INCLUDE_DIRECTORIES "${BENCHMARK_INCLUDE_DIR}")
+
+    if(BENCHMARK_MAIN_LIBRARY AND NOT TARGET benchmark::benchmark_main)
+      add_library(benchmark::benchmark_main UNKNOWN IMPORTED)
+      set_target_properties(benchmark::benchmark_main
+                            PROPERTIES IMPORTED_LOCATION "${BENCHMARK_MAIN_LIBRARY}"
+                                       INTERFACE_INCLUDE_DIRECTORIES "${BENCHMARK_INCLUDE_DIR}")
+    endif()
+  endif()
+else()
+  set(benchmarkAlt_FOUND TRUE)
+endif()
+
+unset(_PAIMON_BENCHMARK_ROOTS)
+unset(_PAIMON_BENCHMARK_FIND_ARGS)
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index 068cf7de7..271011a0d 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -245,6 +245,18 @@ else()
   endif()
 endif()
 
+if(DEFINED ENV{PAIMON_BENCHMARK_URL})
+  set(BENCHMARK_SOURCE_URL "$ENV{PAIMON_BENCHMARK_URL}")
+else()
+  if(EXISTS "${THIRDPARTY_DIR}/${PAIMON_BENCHMARK_PKG_NAME}")
+    set_urls(BENCHMARK_SOURCE_URL "${THIRDPARTY_DIR}/${PAIMON_BENCHMARK_PKG_NAME}")
+  else()
+    set_urls(BENCHMARK_SOURCE_URL
+             "${THIRDPARTY_MIRROR_URL}https://github.com/google/benchmark/archive/refs/tags/v${PAIMON_BENCHMARK_BUILD_VERSION}.tar.gz"
+    )
+  endif()
+endif()
+
 if(DEFINED ENV{PAIMON_TBB_URL})
   set(TBB_SOURCE_URL "$ENV{PAIMON_TBB_URL}")
 else()
@@ -500,6 +512,8 @@ function(paimon_get_dependency_compat_target DEPENDENCY_NAME OUT_VAR)
     set(_target libprotobuf)
   elseif("${DEPENDENCY_NAME}" STREQUAL "GTest")
     set(_target GTest::gtest)
+  elseif("${DEPENDENCY_NAME}" STREQUAL "benchmark")
+    set(_target benchmark::benchmark)
   elseif("${DEPENDENCY_NAME}" STREQUAL "RE2")
     set(_target re2::re2)
   elseif("${DEPENDENCY_NAME}" STREQUAL "Snappy")
@@ -586,6 +600,8 @@ macro(paimon_build_dependency DEPENDENCY_NAME)
     build_avro()
   elseif("${DEPENDENCY_NAME}" STREQUAL "GTest")
     build_gtest()
+  elseif("${DEPENDENCY_NAME}" STREQUAL "benchmark")
+    build_benchmark()
   else()
     message(FATAL_ERROR "No bundled build rule for ${DEPENDENCY_NAME}")
   endif()
@@ -1743,6 +1759,66 @@ macro(build_tbb)
 
 endmacro(build_tbb)
 
+# Builds Google Benchmark from source as an external project and defines the
+# imported static targets benchmark::benchmark and benchmark::benchmark_main.
+# Reads BENCHMARK_SOURCE_URL, PAIMON_BENCHMARK_BUILD_SHA256_CHECKSUM and
+# EP_COMMON_CMAKE_ARGS from the calling scope.
+macro(build_benchmark)
+  message(STATUS "Building benchmark from source")
+
+  set(BENCHMARK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/benchmark_ep-install")
+  set(BENCHMARK_INCLUDE_DIR "${BENCHMARK_PREFIX}/include")
+  set(BENCHMARK_STATIC_LIB
+      "${BENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark${CMAKE_STATIC_LIBRARY_SUFFIX}"
+  )
+  set(BENCHMARK_MAIN_STATIC_LIB
+      "${BENCHMARK_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}benchmark_main${CMAKE_STATIC_LIBRARY_SUFFIX}"
+  )
+
+  # CMAKE_INSTALL_LIBDIR is pinned so the hard-coded "lib" paths above hold no
+  # matter what EP_COMMON_CMAKE_ARGS carries. -Werror is switched off so that
+  # warnings from a newer compiler cannot fail the bundled build.
+  set(BENCHMARK_CMAKE_ARGS
+      ${EP_COMMON_CMAKE_ARGS}
+      "-DCMAKE_INSTALL_PREFIX=${BENCHMARK_PREFIX}"
+      -DCMAKE_INSTALL_LIBDIR=lib
+      -DBENCHMARK_ENABLE_TESTING=OFF
+      -DBENCHMARK_ENABLE_GTEST_TESTS=OFF
+      -DBENCHMARK_ENABLE_WERROR=OFF
+      -DBENCHMARK_DOWNLOAD_DEPENDENCIES=OFF)
+
+  externalproject_add(benchmark_ep
+                      URL ${BENCHMARK_SOURCE_URL}
+                      URL_HASH "SHA256=${PAIMON_BENCHMARK_BUILD_SHA256_CHECKSUM}"
+                      CMAKE_ARGS ${BENCHMARK_CMAKE_ARGS}
+                      BUILD_BYPRODUCTS "${BENCHMARK_STATIC_LIB}"
+                                       "${BENCHMARK_MAIN_STATIC_LIB}")
+
+  # The include directory must exist at configure time; CMake rejects a missing
+  # path in INTERFACE_INCLUDE_DIRECTORIES of an imported target.
+  file(MAKE_DIRECTORY "${BENCHMARK_INCLUDE_DIR}")
+
+  # Consumers of the static archive also have to link the thread library.
+  find_package(Threads REQUIRED)
+
+  add_library(benchmark::benchmark STATIC IMPORTED)
+  set_target_properties(benchmark::benchmark
+                        PROPERTIES IMPORTED_LOCATION "${BENCHMARK_STATIC_LIB}"
+                                   INTERFACE_INCLUDE_DIRECTORIES
+                                   "${BENCHMARK_INCLUDE_DIR}"
+                                   INTERFACE_LINK_LIBRARIES Threads::Threads)
+  add_dependencies(benchmark::benchmark benchmark_ep)
+
+  # benchmark_main only supplies main(); it needs the core library behind it.
+  add_library(benchmark::benchmark_main STATIC IMPORTED)
+  set_target_properties(benchmark::benchmark_main
+                        PROPERTIES IMPORTED_LOCATION "${BENCHMARK_MAIN_STATIC_LIB}"
+                                   INTERFACE_INCLUDE_DIRECTORIES
+                                   "${BENCHMARK_INCLUDE_DIR}"
+                                   INTERFACE_LINK_LIBRARIES benchmark::benchmark)
+  add_dependencies(benchmark::benchmark_main benchmark_ep)
+endmacro()
+
 macro(build_glog)
   message(STATUS "Building glog from source")
   set(GLOG_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/glog_ep-install")
@@ -1810,6 +1886,9 @@ if(PAIMON_ENABLE_ORC)
   resolve_dependency(Protobuf)
   resolve_dependency(ORC)
 endif()
+if(PAIMON_BUILD_BENCHMARKS)
+  resolve_dependency(benchmark)
+endif()
 if(PAIMON_ENABLE_JINDO)
   build_jindosdk_c()
   build_jindosdk_nextarch()
diff --git a/docs/source/examples/benchmark.rst b/docs/source/examples/benchmark.rst
new file mode 100644
index 000000000..2bd06cbe9
--- /dev/null
+++ b/docs/source/examples/benchmark.rst
@@ -0,0 +1,85 @@
+.. Copyright 2026-present Alibaba Inc.
+
+.. Licensed under the Apache License, Version 2.0 (the "License");
+.. you may not use this file except in compliance with the License.
+.. You may obtain a copy of the License at
+
+..     http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing, software
+.. distributed under the License is distributed on an "AS IS" BASIS,
+.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. See the License for the specific language governing permissions and
+.. limitations under the License.
================
Benchmark Usage
================

Paimon C++ provides Google Benchmark-based cases for the append-table write/read
paths and the primary-key table write and merge-on-read (MOR) read paths.
Benchmarks are disabled by default.

Build
=====

Enable benchmarks when configuring CMake::

    cmake -S . -B build -DPAIMON_BUILD_BENCHMARKS=ON
    cmake --build build --target paimon-read-write-benchmark

Run all benchmark cases through CTest::

    cmake --build build --target benchmark

Custom Options
==============

``paimon-read-write-benchmark`` accepts the standard Google Benchmark options
plus the Paimon-specific options below:

``--paimon_source_data_file=<path>``
    Source data file used to build benchmark data. Currently only Parquet source
    files are supported.

``--paimon_source_table_path=<path>``
    Read directly from an existing table path for ``BM_Read`` and ``BM_MOR_Read``.
    When set, the source loading and pre-write stages are skipped.

``--paimon_pk_columns=<columns>``
    Primary key columns for ``BM_PK_Write`` and ``BM_MOR_Read``. These cases
    explicitly use ``bucket=1`` because benchmark batches are written to bucket 0.

``--paimon_option=<key>:<value>;<key>:<value>``
    Repeatable table options passed through to Paimon. The default table file
    format is ``parquet``; use ``--paimon_option file.format:<format>`` to
    override it. For ``BM_PK_Write`` and ``BM_MOR_Read``, ``bucket`` is forced to
    ``1``.
+ +Examples +======== + +Append table write:: + + paimon-read-write-benchmark \ + --paimon_source_data_file /path/data.parquet \ + --benchmark_filter=BM_Write + +Append table read with four prefetch workers:: + + paimon-read-write-benchmark \ + --paimon_source_data_file /path/data.parquet \ + --benchmark_filter=BM_Read/4 + +Primary-key table write:: + + paimon-read-write-benchmark \ + --paimon_source_data_file /path/data.parquet \ + --paimon_pk_columns=id \ + --benchmark_filter=BM_PK_Write + +MOR read from an existing table:: + + paimon-read-write-benchmark \ + --paimon_source_table_path /path/table \ + --paimon_pk_columns=id \ + --benchmark_filter=BM_MOR_Read/4 diff --git a/docs/source/examples/index.rst b/docs/source/examples/index.rst index 0ba3b318a..b3ec8c536 100644 --- a/docs/source/examples/index.rst +++ b/docs/source/examples/index.rst @@ -20,3 +20,4 @@ Examples write_commit_scan_read clean + benchmark diff --git a/src/paimon/testing/utils/CMakeLists.txt b/src/paimon/testing/utils/CMakeLists.txt index b8fe36296..ee18db9ea 100644 --- a/src/paimon/testing/utils/CMakeLists.txt +++ b/src/paimon/testing/utils/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-if(PAIMON_BUILD_TESTS) +if(PAIMON_BUILD_TESTS OR PAIMON_BUILD_BENCHMARKS) set(PAIMON_TEST_UTILS testharness.cpp data_generator.cpp) @@ -25,6 +25,9 @@ if(PAIMON_BUILD_TESTS) STATIC_LINK_LIBS paimon_static ${GTEST_LINK_TOOLCHAIN}) +endif() + +if(PAIMON_BUILD_TESTS) add_paimon_test(test_utils_test SOURCES diff --git a/third_party/versions.txt b/third_party/versions.txt index c27599997..89624f5c0 100644 --- a/third_party/versions.txt +++ b/third_party/versions.txt @@ -60,6 +60,10 @@ PAIMON_GTEST_BUILD_VERSION=1.11.0 PAIMON_GTEST_BUILD_SHA256_CHECKSUM=b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5 PAIMON_GTEST_PKG_NAME=gtest-${PAIMON_GTEST_BUILD_VERSION}.tar.gz +PAIMON_BENCHMARK_BUILD_VERSION=1.9.1 +PAIMON_BENCHMARK_BUILD_SHA256_CHECKSUM=32131c08ee31eeff2c8968d7e874f3cb648034377dfc32a4c377fa8796d84981 +PAIMON_BENCHMARK_PKG_NAME=benchmark-${PAIMON_BENCHMARK_BUILD_VERSION}.tar.gz + PAIMON_ARROW_BUILD_VERSION=17.0.0 PAIMON_ARROW_BUILD_SHA256_CHECKSUM=9d280d8042e7cf526f8c28d170d93bfab65e50f94569f6a790982a878d8d898d PAIMON_ARROW_PKG_NAME=apache-arrow-${PAIMON_ARROW_BUILD_VERSION}.tar.gz @@ -124,6 +128,7 @@ DEPENDENCIES=( "PAIMON_TBB_URL ${PAIMON_TBB_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/uxlfoundation/oneTBB/archive/refs/tags/${PAIMON_TBB_BUILD_VERSION}.tar.gz" "PAIMON_ORC_URL ${PAIMON_ORC_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/apache/orc/archive/refs/tags/${PAIMON_ORC_BUILD_VERSION}.tar.gz" "PAIMON_GTEST_URL ${PAIMON_GTEST_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/google/googletest/archive/release-${PAIMON_GTEST_BUILD_VERSION}.tar.gz" + "PAIMON_BENCHMARK_URL ${PAIMON_BENCHMARK_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/google/benchmark/archive/refs/tags/v${PAIMON_BENCHMARK_BUILD_VERSION}.tar.gz" "PAIMON_ARROW_URL ${PAIMON_ARROW_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/apache/arrow/releases/download/apache-arrow-${PAIMON_ARROW_BUILD_VERSION}/apache-arrow-${PAIMON_ARROW_BUILD_VERSION}.tar.gz" 
"PAIMON_AVRO_URL ${PAIMON_AVRO_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/apache/avro/archive/${PAIMON_AVRO_BUILD_VERSION}.tar.gz" "PAIMON_FMT_URL ${PAIMON_FMT_PKG_NAME} ${THIRDPARTY_MIRROR_URL}https://github.com/fmtlib/fmt/archive/refs/tags/${PAIMON_FMT_BUILD_VERSION}.tar.gz"