# Split the comma-separated `engine:format` targets into per-engine format lists.
# Each pipeline puts one target per line first, so the `^engine:` anchors are valid.
df_formats=$(echo "$targets" | tr ',' '\n' | (grep '^datafusion:' | grep -v ':lance$' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
ddb_formats=$(echo "$targets" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
has_lance=$(echo "$targets" | grep -q 'datafusion:lance' && echo "true" || echo "false")

# `$targets` is ONE comma-separated line, so `grep '^clickhouse:'` on it only matches
# when ClickHouse happens to be the first target (in CI it is the last one).
# Match on list-element boundaries instead: prefixing a comma makes every element,
# including the first, start with ",". Using a bash pattern also avoids a pipeline,
# so `pipefail` cannot turn a SIGPIPE from `grep -q` into a false negative.
if [[ ",$targets" == *",clickhouse:"* ]]; then
  has_clickhouse="true"
else
  has_clickhouse="false"
fi
$is_remote && [[ "$has_lance" == "true" ]] && [[ -f "target/release_debug/l cat lance-results.json >> results.json fi + +# ClickHouse-bench only runs for local benchmarks (clickhouse-local reads local files). +if ! $is_remote && [[ "$has_clickhouse" == "true" ]] && [[ -f "target/release_debug/clickhouse-bench" ]]; then + # shellcheck disable=SC2086 + target/release_debug/clickhouse-bench "$subcommand" \ + -d gh-json \ + $opts \ + -o ch-results.json + + cat ch-results.json >> results.json +fi diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 9743a554eae..36ebbcfe4b7 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -121,7 +121,7 @@ jobs: "id": "clickbench-nvme", "subcommand": "clickbench", "name": "Clickbench on NVME", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", + "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb,clickhouse:parquet", "build_lance": true }, { diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index f98e7ef87d4..424d16066aa 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -21,7 +21,7 @@ on: "id": "clickbench-nvme", "subcommand": "clickbench", "name": "Clickbench on NVME", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb" + "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb,clickhouse:parquet" }, { "id": "tpch-nvme", @@ -127,6 +127,16 @@ jobs: - uses: ./.github/actions/system-info + - name: Install ClickHouse + if: contains(matrix.targets, 'clickhouse:') + env: + CLICKHOUSE_VERSION: "25.8.18.1" + run: | + wget -qO- 
"https://github.com/ClickHouse/ClickHouse/releases/download/v${CLICKHOUSE_VERSION}-lts/clickhouse-common-static-${CLICKHOUSE_VERSION}-amd64.tgz" | tar xz + cp clickhouse-common-static-${CLICKHOUSE_VERSION}/usr/bin/clickhouse . + chmod +x clickhouse + echo "CLICKHOUSE_BINARY=$PWD/clickhouse" >> $GITHUB_ENV + - name: Build binaries shell: bash env: @@ -136,6 +146,9 @@ jobs: if [ "${{ matrix.build_lance }}" = "true" ]; then packages="$packages --bin lance-bench" fi + if echo "${{ matrix.targets }}" | grep -q 'clickhouse:'; then + packages="$packages --bin clickhouse-bench" + fi cargo build $packages --profile release_debug - name: Generate data diff --git a/Cargo.lock b/Cargo.lock index f330414d9c1..96ddd27fc9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1195,6 +1195,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" +[[package]] +name = "clickhouse-bench" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "tokio", + "tracing", + "vortex-bench", +] + [[package]] name = "codespan-reporting" version = "0.13.1" diff --git a/Cargo.toml b/Cargo.toml index b5490594dd7..0219f565f93 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ members = [ "encodings/zstd", "encodings/bytebool", # Benchmarks + "benchmarks/clickhouse-bench", "benchmarks/lance-bench", "benchmarks/compress-bench", "benchmarks/datafusion-bench", diff --git a/benchmarks/clickhouse-bench/Cargo.toml b/benchmarks/clickhouse-bench/Cargo.toml new file mode 100644 index 00000000000..7b26ae12053 --- /dev/null +++ b/benchmarks/clickhouse-bench/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "clickhouse-bench" +description = "ClickHouse (clickhouse-local) benchmark runner for Vortex" +authors.workspace = true +edition.workspace = true +homepage.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true 
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

/// Binary name used when `CLICKHOUSE_BINARY` is not configured; it is resolved
/// from `$PATH` when the benchmark runs.
const DEFAULT_CLICKHOUSE_BINARY: &str = "clickhouse";

/// Build script that exports the ClickHouse binary path to the crate as the
/// compile-time env var `CLICKHOUSE_BINARY`.
///
/// Resolution order:
/// 1. `CLICKHOUSE_BINARY` env var, when set to a non-blank value — used as-is.
/// 2. Otherwise `"clickhouse"` (i.e., resolved from `$PATH` at runtime).
///
/// Users must install ClickHouse themselves for local runs.
/// In CI, it is installed via the workflow before the benchmark step.
fn main() {
    // Rebuild whenever the configured path changes so the baked-in value stays current.
    println!("cargo:rerun-if-env-changed=CLICKHOUSE_BINARY");

    let binary = resolve_binary(std::env::var("CLICKHOUSE_BINARY").ok());
    println!("cargo:rustc-env=CLICKHOUSE_BINARY={binary}");
}

/// Picks the binary path to bake into the crate.
///
/// A missing, empty, or whitespace-only value falls back to
/// [`DEFAULT_CLICKHOUSE_BINARY`]; baking in an empty path would only surface
/// later as a confusing "binary not found" error at benchmark time.
fn resolve_binary(configured: Option<String>) -> String {
    configured
        .filter(|path| !path.trim().is_empty())
        .unwrap_or_else(|| DEFAULT_CLICKHOUSE_BINARY.to_string())
}
or download from ). +//! In CI, it is installed by the workflow before the benchmark step. + +use std::io::Write; +use std::path::PathBuf; +use std::process::Command; +use std::process::Stdio; +use std::time::Duration; +use std::time::Instant; + +use anyhow::Context; +use anyhow::Result; +use tracing::trace; +use vortex_bench::Benchmark; +use vortex_bench::Format; + +/// Path to the ClickHouse binary, set by build.rs at compile time. +/// +/// This is either the value of the `CLICKHOUSE_BINARY` env var at build time, +/// or `"clickhouse"` (resolved from `$PATH` at runtime). +const CLICKHOUSE_BINARY: &str = env!("CLICKHOUSE_BINARY"); + +/// A client that wraps `clickhouse-local` for running SQL benchmarks. +pub struct ClickHouseClient { + /// The path to the `clickhouse` binary. + binary: PathBuf, + /// SQL statements to run before each query (CREATE VIEW statements). + setup_sql: Vec, +} + +impl ClickHouseClient { + /// Create a new client. Only Parquet format is supported. + /// + /// The ClickHouse binary is resolved from (in order): + /// 1. `CLICKHOUSE_BINARY` env var at build time + /// 2. `"clickhouse"` on `$PATH` + pub fn new(benchmark: &dyn Benchmark, format: Format) -> Result { + if format != Format::Parquet { + anyhow::bail!("clickhouse-bench only supports Parquet format, got {format}"); + } + + let binary = PathBuf::from(CLICKHOUSE_BINARY); + + // Verify the binary is usable (either absolute path exists, or resolvable via PATH). + Self::verify_binary(&binary)?; + + tracing::info!(binary = %binary.display(), "Using clickhouse-local"); + + let mut client = Self { + binary, + setup_sql: Vec::new(), + }; + client.register_tables(benchmark, format)?; + Ok(client) + } + + /// Check that the ClickHouse binary is available. + /// + /// For absolute paths, checks that the file exists on disk. + /// For bare names (e.g., `"clickhouse"`), tries to invoke it to verify it's resolvable. 
+ fn verify_binary(binary: &PathBuf) -> Result<()> { + if binary.is_absolute() { + anyhow::ensure!( + binary.exists(), + "ClickHouse binary not found at '{path}'. \ + Set CLICKHOUSE_BINARY env var to the correct path, or install ClickHouse \ + and ensure it is on $PATH.", + path = binary.display() + ); + } + + // Verify the binary is actually usable by running `clickhouse local --version`. + let output = Command::new(binary.as_os_str()) + .args(["local", "--version"]) + .output() + .with_context(|| { + format!( + "ClickHouse binary '{name}' not found on $PATH. \ + Install ClickHouse (https://clickhouse.com/docs/en/install) or set \ + CLICKHOUSE_BINARY env var to an absolute path before building.", + name = binary.display() + ) + })?; + + anyhow::ensure!( + output.status.success(), + "ClickHouse binary at '{name}' failed to run: {stderr}", + name = binary.display(), + stderr = String::from_utf8_lossy(&output.stderr) + ); + + let version = String::from_utf8_lossy(&output.stdout); + tracing::debug!(version = version.trim(), "Verified clickhouse binary"); + + Ok(()) + } + + /// Generate `CREATE VIEW ... AS SELECT * FROM file(...)` statements. + /// + /// We use a VIEW over the `file()` table function rather than `CREATE TABLE ... ENGINE = File()` + /// because the `file()` function handles glob patterns (e.g., `*.parquet`) more reliably across + /// ClickHouse versions. + fn register_tables(&mut self, benchmark: &dyn Benchmark, format: Format) -> Result<()> { + let data_url = benchmark.data_url(); + let base_dir = if data_url.scheme() == "file" { + data_url + .to_file_path() + .map_err(|_| anyhow::anyhow!("Invalid file URL: {data_url}"))? + } else { + anyhow::bail!("clickhouse-bench only supports local file:// data URLs"); + }; + + let format_dir = base_dir.join(format.name()); + if !format_dir.exists() { + anyhow::bail!( + "Data directory does not exist: {}. 
Run data generation first.", + format_dir.display() + ); + } + + for table_spec in benchmark.table_specs() { + let name = table_spec.name; + let pattern = benchmark + .pattern(name, format) + .map(|p| p.to_string()) + .unwrap_or_else(|| format!("*.{}", format.ext())); + + let data_path = format!("{}/{}", format_dir.display(), pattern); + + tracing::info!( + table = name, + path = %data_path, + "Registering ClickHouse table" + ); + + let create_sql = format!( + "CREATE VIEW IF NOT EXISTS {name} AS \ + SELECT * FROM file('{data_path}', Parquet);" + ); + self.setup_sql.push(create_sql); + } + + Ok(()) + } + + /// Execute a SQL query via `clickhouse-local`, returning `(row_count, timing)`. + /// + /// The approach: + /// 1. Prepend all CREATE VIEW statements + /// 2. Append the benchmark query + /// 3. Pipe the combined SQL into `clickhouse local` via stdin + /// 4. Parse stdout to count result rows + pub fn execute_query(&self, query: &str) -> Result<(usize, Option)> { + trace!("execute clickhouse query: {query}"); + + // Build the full SQL: setup views + the actual query + let mut full_sql = String::new(); + for stmt in &self.setup_sql { + full_sql.push_str(stmt); + full_sql.push('\n'); + } + full_sql.push_str(query); + // Ensure we have a trailing semicolon + if !query.trim_end().ends_with(';') { + full_sql.push(';'); + } + + let time_instant = Instant::now(); + + // The `clickhouse` binary is a multi-tool; invoke it as `clickhouse local`. 
+ let mut child = Command::new(&self.binary) + .args(["local", "--format", "TabSeparated"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn clickhouse-local")?; + + // Write SQL to stdin + { + let stdin = child + .stdin + .as_mut() + .context("Failed to open clickhouse-local stdin")?; + stdin + .write_all(full_sql.as_bytes()) + .context("Failed to write SQL to clickhouse-local stdin")?; + } + + let output = child + .wait_with_output() + .context("Failed to wait for clickhouse-local")?; + + let query_time = time_instant.elapsed(); + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!( + "clickhouse-local failed (exit {}): {stderr}", + output.status.code().unwrap_or(-1) + ); + } + + // Count non-empty lines in stdout as row count + let stdout = String::from_utf8_lossy(&output.stdout); + let row_count = stdout.lines().filter(|line| !line.is_empty()).count(); + + Ok((row_count, Some(query_time))) + } +} diff --git a/benchmarks/clickhouse-bench/src/main.rs b/benchmarks/clickhouse-bench/src/main.rs new file mode 100644 index 00000000000..bd8e7a7666d --- /dev/null +++ b/benchmarks/clickhouse-bench/src/main.rs @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::path::PathBuf; + +use clap::Parser; +use clickhouse_bench::ClickHouseClient; +use tokio::runtime::Runtime; +use vortex_bench::BenchmarkArg; +use vortex_bench::Engine; +use vortex_bench::Format; +use vortex_bench::Opt; +use vortex_bench::Opts; +use vortex_bench::create_benchmark; +use vortex_bench::create_output_writer; +use vortex_bench::display::DisplayFormat; +use vortex_bench::runner::SqlBenchmarkRunner; +use vortex_bench::runner::filter_queries; +use vortex_bench::setup_logging_and_tracing; + +/// ClickHouse (clickhouse-local) benchmark runner. 
+/// +/// Runs queries against Parquet data using clickhouse-local as a performance baseline. +/// This allows comparing ClickHouse's native Parquet reading performance against other engines +/// (DuckDB, DataFusion) on the same hardware and dataset. +#[derive(Parser)] +struct Args { + #[arg(value_enum)] + benchmark: BenchmarkArg, + + #[arg(short, long, default_value_t = 5)] + iterations: usize, + + #[arg(short, long)] + verbose: bool, + + #[arg(long)] + tracing: bool, + + #[arg(short, long, default_value_t, value_enum)] + display_format: DisplayFormat, + + #[arg(short, long, value_delimiter = ',')] + queries: Option>, + + #[arg(short, long, value_delimiter = ',')] + exclude_queries: Option>, + + #[arg(short)] + output_path: Option, + + #[arg(long, default_value_t = false)] + track_memory: bool, + + #[arg(long, default_value_t = false)] + hide_progress_bar: bool, + + #[arg(long = "opt", value_delimiter = ',', value_parser = clap::value_parser!(Opt))] + options: Vec, +} + +fn main() -> anyhow::Result<()> { + let args = Args::parse(); + let opts = Opts::from(args.options); + + setup_logging_and_tracing(args.verbose, args.tracing)?; + + let benchmark = create_benchmark(args.benchmark, &opts)?; + + let filtered_queries = filter_queries( + benchmark.queries()?, + args.queries.as_ref(), + args.exclude_queries.as_ref(), + ); + + // Generate base Parquet data if needed. 
+ if benchmark.data_url().scheme() == "file" { + let runtime = Runtime::new()?; + runtime.block_on(async { benchmark.generate_base_data().await })?; + } + + let formats = vec![Format::Parquet]; + + let mut runner = SqlBenchmarkRunner::new( + benchmark.as_ref(), + Engine::ClickHouse, + formats, + args.track_memory, + args.hide_progress_bar, + )?; + + runner.run_all( + &filtered_queries, + args.iterations, + |format| ClickHouseClient::new(benchmark.as_ref(), format), + |ctx, _query_idx, _format, query| ctx.execute_query(query), + )?; + + let benchmark_id = format!("clickhouse-{}", benchmark.dataset_name()); + let writer = create_output_writer(&args.display_format, args.output_path, &benchmark_id)?; + runner.export_to(&args.display_format, writer)?; + + Ok(()) +} diff --git a/vortex-bench/src/clickbench/benchmark.rs b/vortex-bench/src/clickbench/benchmark.rs index 5e14cbcf40e..cece28c6b6b 100644 --- a/vortex-bench/src/clickbench/benchmark.rs +++ b/vortex-bench/src/clickbench/benchmark.rs @@ -1,17 +1,16 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::env; use std::fs; -use std::path::Path; +use std::path::PathBuf; use anyhow::Result; use reqwest::Client; use url::Url; -use vortex::error::VortexExpect; use crate::Benchmark; use crate::BenchmarkDataset; +use crate::Engine; use crate::IdempotentPath; use crate::TableSpec; use crate::clickbench::*; @@ -21,6 +20,8 @@ pub struct ClickBenchBenchmark { pub flavor: Flavor, pub queries_file: Option, pub data_url: Url, + /// Override the engine to select engine-specific query files. + pub engine: Option, } impl ClickBenchBenchmark { @@ -34,17 +35,46 @@ impl ClickBenchBenchmark { flavor, queries_file, data_url: url, + engine: None, }) } + /// Set the engine to select engine-specific query files. + pub fn with_engine(mut self, engine: Engine) -> Self { + self.engine = Some(engine); + self + } + + /// Returns the path to the queries file. 
+ fn queries_file_path(&self) -> PathBuf { + if let Some(file) = &self.queries_file { + return file.into(); + } + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + manifest_dir.join("clickbench_queries.sql") + } + + /// Returns true if the engine requires unquoted column names. + fn uses_unquoted_identifiers(&self) -> bool { + matches!(self.engine, Some(Engine::ClickHouse)) + } + + /// Strips double quotes only from simple SQL identifiers for engines like + /// ClickHouse that don't require quoted column names. + fn normalize_query(&self, query: &str) -> String { + if !self.uses_unquoted_identifiers() { + return query.to_string(); + } + + strip_simple_identifier_quotes(query) + } + fn create_data_url(remote_data_dir: &Option, flavor: Flavor) -> Result { match remote_data_dir { None => { let basepath = format!("clickbench_{flavor}").to_data_path(); - Ok(Url::parse(&format!( - "file:{}/", - basepath.to_str().vortex_expect("path should be utf8") - ))?) + Url::from_directory_path(basepath) + .map_err(|_| anyhow::anyhow!("Failed to convert ClickBench data path to URL")) } Some(remote_data_dir) => { if !remote_data_dir.ends_with("/") { @@ -66,19 +96,74 @@ impl ClickBenchBenchmark { } } +fn strip_simple_identifier_quotes(query: &str) -> String { + let bytes = query.as_bytes(); + let mut out = String::with_capacity(query.len()); + let mut i = 0; + + while i < query.len() { + let rel = match query[i..].find('"') { + Some(pos) => pos, + None => { + out.push_str(&query[i..]); + break; + } + }; + + let start = i + rel; + out.push_str(&query[i..start]); + + let mut end = start + 1; + while end < bytes.len() { + if bytes[end] == b'"' { + if end + 1 < bytes.len() && bytes[end + 1] == b'"' { + end += 2; + } else { + break; + } + } else { + end += 1; + } + } + + if end >= bytes.len() { + out.push_str(&query[start..]); + break; + } + + let inner = &query[start + 1..end]; + if is_simple_identifier(inner) { + out.push_str(inner); + } else { + 
out.push_str(&query[start..=end]); + } + + i = end + 1; + } + + out +} + +fn is_simple_identifier(s: &str) -> bool { + let mut chars = s.chars(); + let Some(first) = chars.next() else { + return false; + }; + + (first.is_ascii_alphabetic() || first == '_') + && chars.all(|c| c.is_ascii_alphanumeric() || c == '_') +} + #[async_trait::async_trait] impl Benchmark for ClickBenchBenchmark { fn queries(&self) -> Result> { - let queries_filepath = match &self.queries_file { - Some(file) => file.into(), - None => Path::new(env!("CARGO_MANIFEST_DIR")).join("clickbench_queries.sql"), - }; + let queries_filepath = self.queries_file_path(); Ok(fs::read_to_string(queries_filepath)? .split(';') .map(|s| s.trim()) .filter(|s| !s.is_empty()) - .map(|s| s.to_string()) + .map(|s| self.normalize_query(s)) .enumerate() .collect()) } diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index 6dad0f0f6a1..8be4c6bcea8 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -206,6 +206,9 @@ pub enum Engine { #[clap(name = "duckdb")] #[serde(rename = "duckdb")] DuckDB, + #[clap(name = "clickhouse")] + #[serde(rename = "clickhouse")] + ClickHouse, } impl Display for Engine { @@ -213,6 +216,7 @@ impl Display for Engine { match self { Engine::DataFusion => write!(f, "datafusion"), Engine::DuckDB => write!(f, "duckdb"), + Engine::ClickHouse => write!(f, "clickhouse"), Engine::Vortex => write!(f, "vortex"), Engine::Arrow => write!(f, "arrow"), }