From baab377ba938f4d7b66890f62246af6c7c5bc4f0 Mon Sep 17 00:00:00 2001 From: mprammer Date: Mon, 18 May 2026 17:19:03 -0400 Subject: [PATCH 1/4] bench: add Appian benchmark to the SQL bench matrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires DuckDB's in-tree appian_benchmarks suite into vortex-bench so the same 8 join-heavy queries (~5M rows across 9 LEFT-OUTER-joined views) get the datafusion+duckdb × parquet/vortex/vortex-compact/duckdb treatment that clickbench/tpch/fineweb already get. The workload exercises wide CTE aggregations that the other suites don't. AppianBenchmark::generate_base_data downloads the upstream .duckdb blob and shells out to duckdb to materialize 9 lowercased Parquet shards, mirroring how realnest/gharchive and public_bi handle their own non-Parquet sources. The conversion lowercases column names at COPY time so DataFusion's default enable_ident_normalization=true resolves the verbatim camelCase Appian queries (orderItem_quantity, FROM CustomerView, ...) against the schema without per-engine special-casing or query rewriting — keeping upstream query strings byte-identical so future q09.sql etc. drop in unchanged. CI matrix entry runs appian-nvme at PR time on 5 core engine×format combos (datafusion+duckdb × parquet/vortex plus duckdb:duckdb), with develop fanning out to add vortex-compact for both engines. 
Co-Authored-By: Claude Signed-off-by: mprammer --- .github/workflows/sql-benchmarks.yml | 23 + bench-orchestrator/README.md | 2 +- .../bench_orchestrator/config.py | 1 + vortex-bench/src/appian/mod.rs | 583 ++++++++++++++++++ vortex-bench/src/datasets/mod.rs | 17 + vortex-bench/src/lib.rs | 9 + vortex-bench/src/v3.rs | 3 + 7 files changed, 637 insertions(+), 1 deletion(-) create mode 100644 vortex-bench/src/appian/mod.rs diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index a1809b4ac92..71d55465494 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -252,6 +252,29 @@ on: {"engine": "datafusion", "format": "vortex"} ], "scale_factor": "1" + }, + { + "id": "appian-nvme", + "subcommand": "appian", + "name": "Appian on NVME", + "data_formats": ["parquet", "vortex", "vortex-compact", "duckdb"], + "pr_targets": [ + {"engine": "datafusion", "format": "parquet"}, + {"engine": "datafusion", "format": "vortex"}, + {"engine": "duckdb", "format": "parquet"}, + {"engine": "duckdb", "format": "vortex"}, + {"engine": "duckdb", "format": "duckdb"} + ], + "develop_targets": [ + {"engine": "datafusion", "format": "parquet"}, + {"engine": "datafusion", "format": "vortex"}, + {"engine": "datafusion", "format": "vortex-compact"}, + {"engine": "duckdb", "format": "parquet"}, + {"engine": "duckdb", "format": "vortex"}, + {"engine": "duckdb", "format": "vortex-compact"}, + {"engine": "duckdb", "format": "duckdb"} + ], + "iterations": "10" } ] diff --git a/bench-orchestrator/README.md b/bench-orchestrator/README.md index 23a927b96d3..0b267008a85 100644 --- a/bench-orchestrator/README.md +++ b/bench-orchestrator/README.md @@ -41,7 +41,7 @@ vx-bench run [options] **Arguments:** -- `benchmark`: Benchmark suite to run (`tpch`, `tpcds`, `clickbench`, `fineweb`, `gh-archive`, `polarsignals`, `public-bi`, `statpopgen`) +- `benchmark`: Benchmark suite to run (`appian`, `tpch`, `tpcds`, `clickbench`, `fineweb`, 
`gh-archive`, `polarsignals`, `public-bi`, `statpopgen`) **Options:** diff --git a/bench-orchestrator/bench_orchestrator/config.py b/bench-orchestrator/bench_orchestrator/config.py index bd81ce64fb2..fb90ac8bd6a 100644 --- a/bench-orchestrator/bench_orchestrator/config.py +++ b/bench-orchestrator/bench_orchestrator/config.py @@ -42,6 +42,7 @@ class Format(Enum): class Benchmark(Enum): """Available benchmark suites.""" + APPIAN = "appian" TPCH = "tpch" TPCDS = "tpcds" CLICKBENCH = "clickbench" diff --git a/vortex-bench/src/appian/mod.rs b/vortex-bench/src/appian/mod.rs new file mode 100644 index 00000000000..56d991ee98a --- /dev/null +++ b/vortex-bench/src/appian/mod.rs @@ -0,0 +1,583 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Appian benchmark. +//! +//! Mirrors the queries from DuckDB's in-tree `benchmark/appian_benchmarks` suite. Upstream +//! ships the data as a single `.duckdb` blob (~593 MB); we download it once and shell out +//! to the `duckdb` CLI to project each table into Parquet, lowercasing column names along +//! the way. `data-gen` then handles every other format from those Parquet files. +//! +//! ## Identifier case +//! +//! The upstream `.duckdb` blob preserves camelCase column names (`orderItem_quantity`, +//! `address_customerId`, ...) and capitalized table names (`CustomerView`). The Appian +//! queries reference those identifiers unquoted, which would break under DataFusion's +//! default `enable_ident_normalization=true` (parser lowercases identifier references +//! while the Parquet schema and registered table names preserve case → field-not-found). +//! +//! The conversion below lowercases every column at COPY time, and the table names in +//! [`TABLES`] are already lowercase. Both engines then resolve the verbatim camelCase +//! queries the same way: DataFusion lowercases the query identifiers and matches them +//! 
against the lowercased Parquet schema, while DuckDB's case-insensitive unquoted +//! identifier resolution makes the original case irrelevant. + +use std::path::PathBuf; +use std::process::Command; + +use anyhow::Context; +use anyhow::bail; +use glob::Pattern; +use tracing::info; +use url::Url; + +use crate::Benchmark; +use crate::BenchmarkDataset; +use crate::Format; +use crate::TableSpec; +use crate::datasets::data_downloads::download_data; +use crate::utils::file::resolve_data_url; + +/// Upstream `.duckdb` blob; pinned to the URL hard-coded into DuckDB's +/// `benchmark/appian_benchmarks/appian.benchmark.in`. +const UPSTREAM_BLOB_URL: &str = "https://blobs.duckdb.org/data/appian_benchmark_data.duckdb"; + +/// Table names from DuckDB's `appian.benchmark.in` template in upstream case. Ordering +/// must match [`TABLES`] so each upstream source maps to its lowercased Parquet output. +const UPSTREAM_TABLES: &[&str] = &[ + "AddressView", + "CategoryView", + "CreditCardView", + "CustomerView", + "OrderItemNovelty_Update", + "OrderItemView", + "OrderView", + "ProductView", + "TaxRecordView", +]; + +/// Lowercased table names registered with the query engines. Matches the output Parquet +/// file names produced by [`AppianBenchmark::generate_base_data`]. +const TABLES: &[&str] = &[ + "addressview", + "categoryview", + "creditcardview", + "customerview", + "orderitemnovelty_update", + "orderitemview", + "orderview", + "productview", + "taxrecordview", +]; + +/// Eight join-heavy queries copied verbatim from +/// `duckdb/duckdb:benchmark/appian_benchmarks/queries/q0[1-8].sql`. +const QUERIES: &[&str] = &[ + // q01 — three-way left join, group by state, sum order-item quantities. 
+ "SELECT address_state AS g0, sum(orderItem_quantity) AS p0 +FROM CustomerView c +LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId +LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId +LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId +GROUP BY address_state +ORDER BY address_state +LIMIT 500", + // q02 — eight-CTE breadth-first aggregation across the whole schema. + "SELECT + a.address_state AS g0, + t1rp1 AS g1, + t2rp1 AS g2, + max(t5rp1) AS p0, + avg(t8rp1 * t8rp2) AS p1, + max(t6rp1) AS p2, + count(c.customer_priority) AS p3, + coalesce(avg(t7rp1), 0.0) AS p4 +FROM CustomerView c +LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId +LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId +LEFT OUTER JOIN ( + SELECT sum(creditCard_cvv) AS t1rp1, c.customer_id AS t1pk + FROM CustomerView c + LEFT OUTER JOIN CreditCardView cc ON c.customer_id = cc.creditCard_customerId + GROUP BY c.customer_id + ) t1 ON c.customer_id = t1.t1pk +LEFT OUTER JOIN ( + SELECT min(p.product_likes) AS t2rp1, c.customer_id AS t2pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + WHERE ca.category_seasonal = TRUE + GROUP BY c.customer_id + ) t2 ON c.customer_id = t2.t2pk +LEFT OUTER JOIN ( + SELECT max(o.order_subShipments) AS t5rp1, c.customer_id AS t5pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + GROUP BY c.customer_id + ) t5 ON c.customer_id = t5pk +LEFT OUTER JOIN ( + SELECT max(coalesce(oi.orderItem_weight, 1)) AS t6rp1, c.customer_id AS t6pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN OrderItemView oi ON o.order_id = 
oi.orderItem_orderId + WHERE o.order_serverId IN (1, 3, 5) + GROUP BY c.customer_id + ) t6 ON c.customer_id = t6pk +LEFT OUTER JOIN ( + SELECT count(ca.category_seasonal) AS t7rp1, c.customer_id AS t7pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + WHERE ca.category_perishable = TRUE + GROUP BY c.customer_id + ) t7 ON c.customer_id = t7pk +LEFT OUTER JOIN ( + SELECT + sum(creditCard_zip) AS t8rp1, + sum(creditCard_lastChargeAmount) AS t8rp2, + c.customer_id AS t8pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number + GROUP BY c.customer_id + ) t8 ON c.customer_id = t8pk +WHERE t.taxRecord_value > 149670.0 +GROUP BY a.address_state, t1rp1, t2rp1 +ORDER BY g0, p0, p1 +LIMIT 500", + // q03 — many-way star join with a CASE expression over a date diff. 
+ "SELECT + c.customer_priority AS g0, + t1rp1 AS g1, + t.taxRecord_bracket AS g2, + sum(oi.orderItem_weight) AS p0, + max(ca.category_demandScore) AS p1, + max(ca.category_auditDate) AS p2, + CAST(avg(ca.category_valuation) AS int) AS p3, + sum(t1rp2) AS p4, + sum( + CASE + WHEN p.product_inventoryLastOrderedOn - ca.category_auditDate > 300 THEN 1 + WHEN p.product_inventoryLastOrderedOn - ca.category_auditDate > 150 THEN 10 + WHEN p.product_inventoryLastOrderedOn - ca.category_auditDate > 0 THEN 100 + ELSE 1000 + END +(c.customer_priority * a.address_zone)) AS p5 +FROM OrderItemView oi +LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id +LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id +LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number +LEFT OUTER JOIN CustomerView c ON o.order_customerId = c.customer_id +LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId +LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId +LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name +LEFT OUTER JOIN ( + SELECT + min(cc.creditCard_expirationDate) AS t1rp1, + sum(cc.creditCard_lastChargeAmount) AS t1rp2, + c.customer_id AS t1pk + FROM CustomerView c + LEFT OUTER JOIN CreditCardView cc ON c.customer_id = cc.creditCard_customerId + GROUP BY c.customer_id + ) t1 ON c.customer_id = t1pk +WHERE cc.creditCard_lastChargeAmount > 90.0 AND p.product_price > 34.0 +GROUP BY c.customer_priority, t1rp1, t.taxRecord_bracket +ORDER BY p1, p3, g2 +LIMIT 500", + // q04 — category-rooted fan-out with four parallel sub-aggregations. 
+ "SELECT + t2rp1 AS g0, + t3rp1 AS g1, + t4rp1 AS g2, + CAST(avg(cc.creditCard_lastChargeAmount) AS int) AS p0, + min(cc.creditCard_lastChargeTimestamp) AS p1, + count(DISTINCT (cc.creditCard_holder)) AS p2 +FROM CategoryView ca +LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName +LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId +LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id +LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number +LEFT OUTER JOIN ( + SELECT sum(taxRecord_bracket) AS t1rp1, ca.category_name AS t1pk + FROM CategoryView ca + LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName + LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId + LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id + LEFT OUTER JOIN CustomerView c ON o.order_customerId = c.customer_id + LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId + LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId + GROUP BY ca.category_name + ) t1 ON ca.category_name = t1pk +LEFT OUTER JOIN ( + SELECT max(p.product_likes) AS t2rp1, ca.category_name AS t2pk + FROM CategoryView ca + LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName + GROUP BY ca.category_name + ) t2 ON ca.category_name = t2pk +LEFT OUTER JOIN ( + SELECT sum(oi.orderItem_productGroup) AS t3rp1, ca.category_name AS t3pk + FROM CategoryView ca + LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName + LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId + WHERE oi.orderItem_weight > 15.0 + GROUP BY ca.category_name + ) t3 ON ca.category_name = t3pk +LEFT OUTER JOIN ( + SELECT max(cc.creditCard_zip) AS t4rp1, ca.category_name AS t4pk + FROM CategoryView ca + LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName + LEFT OUTER JOIN OrderItemView oi ON p.product_id = 
oi.orderItem_productId + LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id + LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number + GROUP BY ca.category_name + ) t4 ON ca.category_name = t4pk +WHERE t1rp1 > 6 +GROUP BY t2rp1, t3rp1, t4rp1 +ORDER BY g1, p2 +LIMIT 500", + // q05 — tax-record rooted query with a timestamp-bound subquery. + "SELECT t.taxRecord_rate AS g0, t2rp1 AS g1, min(c.customer_balance) AS p0 +FROM TaxRecordView t +LEFT OUTER JOIN AddressView a ON t.taxRecord_addressId = a.address_id +LEFT OUTER JOIN CustomerView c ON a.address_customerId = c.customer_id +LEFT OUTER JOIN ( + SELECT min(o.order_placedOn) AS t1rp1, t.taxRecord_id AS t1pk + FROM TaxRecordView t + LEFT OUTER JOIN AddressView a ON t.taxRecord_addressId = a.address_id + LEFT OUTER JOIN OrderView o ON a.address_customerId = o.order_customerId + GROUP BY t.taxRecord_id + ) t1 ON t.taxRecord_id = t1pk +LEFT OUTER JOIN ( + SELECT sum(p.product_price * oi.orderItem_quantity) AS t2rp1, t.taxRecord_id AS t2pk + FROM TaxRecordView t + LEFT OUTER JOIN AddressView a ON t.taxRecord_addressId = a.address_id + LEFT OUTER JOIN OrderView o ON a.address_customerId = o.order_customerId + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + GROUP BY t.taxRecord_id + ) t2 ON t.taxRecord_id = t2pk +WHERE t1rp1 > '2020-01-14 12:12:30.0' +GROUP BY t.taxRecord_rate, t2rp1 +ORDER BY p0 +LIMIT 500", + // q06 — product-rooted with cascading CASE buckets over CTE outputs. 
+ "SELECT + t1rp2 AS g0, + sum(t1rp3) / sum(t1rp4) AS p0, + sum( + CASE + WHEN t1rp5 > 1 THEN 1 + WHEN t2rp1 > 20200 THEN 2 + WHEN t1rp6 > 15 THEN 3 + WHEN t3rp1 > 150 THEN 4 + ELSE 5 + END) AS p1 +FROM ProductView p +LEFT OUTER JOIN ( + SELECT + avg(a.address_valuation) AS t1rp1, + sum(a.address_zone) AS t1rp2, + sum(a.address_zone) AS t1rp3, + count(a.address_zone) AS t1rp4, + avg(o.order_serverId) AS t1rp5, + avg(c.customer_balance) AS t1rp6, + p.product_id AS t1pk + FROM ProductView p + LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId + LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id + LEFT OUTER JOIN AddressView a ON o.order_customerId = a.address_customerId + LEFT OUTER JOIN CustomerView c ON o.order_customerId = c.customer_id + GROUP BY p.product_id + ) t1 ON p.product_id = t1pk +LEFT OUTER JOIN ( + SELECT min(a.address_zip) AS t2rp1, p.product_id AS t2pk + FROM ProductView p + LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId + LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id + LEFT OUTER JOIN AddressView a ON o.order_customerId = a.address_customerId + WHERE a.address_state IN ('PA', 'CA', 'VA', 'MA', 'ME', 'MD', 'CO', 'MO') + GROUP BY p.product_id + ) t2 ON p.product_id = t2pk +LEFT OUTER JOIN ( + SELECT ca.category_warehouseSqft AS t3rp1, p.product_id AS t3pk + FROM ProductView p + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + WHERE ca.category_seasonal = TRUE + ) t3 ON p.product_id = t3pk +WHERE t1rp1 > 10000.0 +GROUP BY t1rp2 +ORDER BY p0 +LIMIT 500", + // q07 — customer-rooted with derived divisions and an IN filter on CC fields. 
+ "SELECT + t1rp1 AS g0, + t2rp1 AS g1, + c.customer_age AS g2, + c.customer_balance AS g3, + count(c.customer_name) AS p0, + sum(c.customer_age) AS p1 +FROM CustomerView c +LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId +LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId +LEFT OUTER JOIN ( + SELECT avg(oi.orderItem_weight) AS t1rp1, c.customer_id AS t1pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + WHERE creditCard_cvv IN (113, 115, 117, 119, 121) + GROUP BY c.customer_id + ) t1 ON c.customer_id = t1pk +LEFT OUTER JOIN ( + SELECT + avg((oi.orderItem_quantity * p.product_price) /(oi.orderItem_weight + oi.orderItem_sku)) AS t2rp1, + c.customer_id AS t2pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + WHERE ca.category_name IN ('Pet', 'Food', 'Game', 'Software') + GROUP BY c.customer_id + ) t2 ON c.customer_id = t2pk +WHERE t.taxRecord_bracketThreshold IN (22, 24, 27, 29) +GROUP BY t1rp1, t2rp1, c.customer_age, c.customer_balance +ORDER BY p0, p1 +LIMIT 500", + // q08 — credit-card-rooted query with six parallel sub-aggregations. 
+ "SELECT + t4rp1 AS g0, + t5rp1 AS g1, + sum(creditCard_lastChargeAmount) AS p0, + min(t6rp1) AS p1, + sum(t3rp2) AS p2 +FROM CreditCardView cc +LEFT OUTER JOIN ( + SELECT min(order_id) AS t1rp1, creditCard_number AS t1pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + WHERE order_slaProbability > 0.125 + GROUP BY creditCard_number + ) t1 ON cc.creditCard_number = t1pk +LEFT OUTER JOIN ( + SELECT sum(orderItem_weight) AS t2rp1, creditCard_number AS t2pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + GROUP BY creditCard_number + ) t2 ON cc.creditCard_number = t2pk +LEFT OUTER JOIN ( + SELECT + min(address_zip) AS t3rp1, + sum(taxRecord_bracketThreshold) AS t3rp2, + creditCard_number AS t3pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + LEFT OUTER JOIN CustomerView c ON o.order_customerId = c.customer_id + LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId + LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId + GROUP BY creditCard_number + ) t3 ON cc.creditCard_number = t3pk +LEFT OUTER JOIN ( + SELECT sum(product_price) AS t4rp1, creditCard_number AS t4pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + WHERE orderItem_weight < 25.0 + GROUP BY creditCard_number + ) t4 ON cc.creditCard_number = t4pk +LEFT OUTER JOIN ( + SELECT sum(category_regulationProbability) AS t5rp1, creditCard_number AS t5pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + 
LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + GROUP BY creditCard_number + ) t5 ON cc.creditCard_number = t5pk +LEFT OUTER JOIN ( + SELECT min(product_inventoryLastOrderedOn) AS t6rp1, creditCard_number AS t6pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + WHERE product_price < 200.0 + GROUP BY creditCard_number + ) t6 ON cc.creditCard_number = t6pk +WHERE t1rp1 > 10000 OR t2rp1 > 15 OR t3rp1 > 20200 +GROUP BY t4rp1, t5rp1 +ORDER BY p0, p1, p2 +LIMIT 500", +]; + +/// Benchmark over the [Appian benchmark suite from DuckDB][upstream]. +/// +/// [upstream]: https://github.com/duckdb/duckdb/tree/main/benchmark/appian_benchmarks +pub struct AppianBenchmark { + data_url: Url, +} + +impl AppianBenchmark { + pub fn new(data_url: Url) -> Self { + Self { data_url } + } + + pub fn with_remote_data_dir(use_remote_data_dir: Option) -> anyhow::Result { + let data_url = resolve_data_url(use_remote_data_dir.as_deref(), "appian")?; + Ok(Self { data_url }) + } + + fn base_dir(&self) -> anyhow::Result { + self.data_url + .to_file_path() + .map_err(|_| anyhow::anyhow!( + "Failed to convert data URL to filesystem path - ensure data_url uses 'file://' scheme" + )) + } + + fn parquet_dir(&self) -> anyhow::Result { + Ok(self.base_dir()?.join(Format::Parquet.name())) + } +} + +#[async_trait::async_trait] +impl Benchmark for AppianBenchmark { + fn queries(&self) -> anyhow::Result> { + Ok(QUERIES.iter().map(|s| s.to_string()).enumerate().collect()) + } + + async fn generate_base_data(&self) -> anyhow::Result<()> { + if self.data_url.scheme() != "file" { + return Ok(()); + } + + let parquet_dir = 
self.parquet_dir()?; + std::fs::create_dir_all(&parquet_dir)?; + + // Idempotency: if every target Parquet is already in place, do nothing. + if TABLES + .iter() + .all(|t| parquet_dir.join(format!("{t}.parquet")).exists()) + { + info!( + "appian: {} Parquet shards already present in {}", + TABLES.len(), + parquet_dir.display(), + ); + return Ok(()); + } + + // Download the upstream `.duckdb` blob into the dataset cache directory. + let blob_path = self.base_dir()?.join("appian_benchmark_data.duckdb"); + let blob = download_data(blob_path, UPSTREAM_BLOB_URL).await?; + + // DuckDB SQL can't use a query result as a projection list, so build per-table + // lowercased projections in Rust, then run all nine `COPY`s in a single subprocess. + let projections = discover_projections(&blob)?; + let mut script = format!("ATTACH '{}' AS src (READ_ONLY);\n", blob.display()); + for (i, &upstream) in UPSTREAM_TABLES.iter().enumerate() { + let projection = projections + .iter() + .find(|(t, _)| t == upstream) + .map(|(_, p)| p.as_str()) + .with_context(|| format!("no columns reported for upstream table {upstream}"))?; + let out_path = parquet_dir.join(format!("{}.parquet", TABLES[i])); + script.push_str(&format!( + "COPY (SELECT {projection} FROM src.\"{upstream}\") TO '{}' (FORMAT PARQUET);\n", + out_path.display(), + )); + } + + let output = Command::new("duckdb").arg("-c").arg(&script).output()?; + if !output.status.success() { + bail!( + "duckdb appian COPY failed: stdout={:?} stderr={:?}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr), + ); + } + + info!( + "appian base data generated in {} ({} Parquet shards)", + parquet_dir.display(), + TABLES.len(), + ); + Ok(()) + } + + fn dataset(&self) -> BenchmarkDataset { + BenchmarkDataset::Appian + } + + fn dataset_name(&self) -> &str { + "appian" + } + + fn dataset_display(&self) -> String { + "appian".to_owned() + } + + fn data_url(&self) -> &Url { + &self.data_url + } + + fn 
table_specs(&self) -> Vec { + TABLES + .iter() + .map(|name| TableSpec::new(name, None)) + .collect() + } + + #[expect(clippy::expect_used)] + fn pattern(&self, table_name: &str, format: Format) -> Option { + Some( + format!("{}.{}", table_name, format.ext()) + .parse() + .expect("valid glob pattern"), + ) + } +} + +/// Run a single `duckdb` invocation that returns, for each upstream Appian table, a +/// projection string of the form `"OrigName" AS "origname", ...` so the `COPY` statements +/// below can lowercase every column name without enumerating them by hand. +fn discover_projections(blob: &std::path::Path) -> anyhow::Result> { + // `chr(31)` (unit separator) keeps `table_name` and the projection list distinct in + // the single-column `-list` output without colliding with `|` (list separator) or + // `,` (projection delimiter). + let sql = format!( + "ATTACH '{}' AS src (READ_ONLY); \ + SELECT table_name || chr(31) || \ + string_agg('\"' || column_name || '\" AS \"' || lower(column_name) || '\"', ', ' ORDER BY column_index) \ + FROM duckdb_columns() \ + WHERE database_name = 'src' \ + GROUP BY table_name;", + blob.display(), + ); + let output = Command::new("duckdb") + .arg("-noheader") + .arg("-list") + .arg("-c") + .arg(&sql) + .output()?; + if !output.status.success() { + bail!( + "duckdb column discovery failed: stdout={:?} stderr={:?}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr), + ); + } + let stdout = String::from_utf8(output.stdout)?; + Ok(stdout + .lines() + .filter_map(|line| { + line.split_once('\x1f') + .map(|(t, p)| (t.to_owned(), p.to_owned())) + }) + .collect()) +} diff --git a/vortex-bench/src/datasets/mod.rs b/vortex-bench/src/datasets/mod.rs index d35d3f869e0..3e72ba69e7f 100644 --- a/vortex-bench/src/datasets/mod.rs +++ b/vortex-bench/src/datasets/mod.rs @@ -59,6 +59,8 @@ pub trait Dataset { #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum BenchmarkDataset { + #[serde(rename 
= "appian")] + Appian, #[serde(rename = "tpch")] TpcH { scale_factor: String }, #[serde(rename = "tpcds")] @@ -80,6 +82,7 @@ pub enum BenchmarkDataset { impl BenchmarkDataset { pub fn name(&self) -> &str { match self { + BenchmarkDataset::Appian => "appian", BenchmarkDataset::TpcH { .. } => "tpch", BenchmarkDataset::TpcDS { .. } => "tpcds", BenchmarkDataset::ClickBench { .. } => "clickbench", @@ -95,6 +98,7 @@ impl BenchmarkDataset { impl Display for BenchmarkDataset { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { + BenchmarkDataset::Appian => write!(f, "appian"), BenchmarkDataset::TpcH { scale_factor } => write!(f, "tpch(sf={scale_factor})"), BenchmarkDataset::TpcDS { scale_factor } => write!(f, "tpcds(sf={scale_factor})"), BenchmarkDataset::ClickBench { flavor, .. } => match flavor { @@ -112,9 +116,22 @@ impl Display for BenchmarkDataset { } } +const APPIAN_TABLES: &[&str] = &[ + "addressview", + "categoryview", + "creditcardview", + "customerview", + "orderitemnovelty_update", + "orderitemview", + "orderview", + "productview", + "taxrecordview", +]; + impl BenchmarkDataset { pub fn tables(&self) -> &[&'static str] { match self { + BenchmarkDataset::Appian => APPIAN_TABLES, BenchmarkDataset::TpcDS { .. } => &[ "call_center", "catalog_sales", diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index 77d9da1e235..30ff45c97a8 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -10,6 +10,7 @@ use std::str::FromStr; use std::sync::LazyLock; use anyhow::bail; +use appian::AppianBenchmark; use clap::ValueEnum; use clickbench::ClickBenchBenchmark; use clickbench::Flavor; @@ -33,6 +34,7 @@ use vortex::file::VortexWriteOptions; use vortex::file::WriteStrategyBuilder; use vortex::utils::aliases::hash_map::HashMap; +pub mod appian; pub mod benchmark; pub mod clickbench; pub mod compress; @@ -245,6 +247,8 @@ impl CompactionStrategy { /// CLI argument for selecting which benchmark to run. 
#[derive(clap::ValueEnum, Clone, Copy)] pub enum BenchmarkArg { + #[clap(name = "appian")] + Appian, #[clap(name = "clickbench")] ClickBench, #[clap(name = "tpch")] @@ -272,6 +276,11 @@ const REMOTE_DATA_KEY: &str = "remote-data-dir"; /// Factory function to create a benchmark instance from CLI arguments. pub fn create_benchmark(b: BenchmarkArg, opts: &Opts) -> anyhow::Result> { match b { + BenchmarkArg::Appian => { + let remote_data_dir = opts.get_as::(REMOTE_DATA_KEY); + let benchmark = AppianBenchmark::with_remote_data_dir(remote_data_dir)?; + Ok(Box::new(benchmark) as _) + } BenchmarkArg::ClickBench => { let flavor = opts.get_as::("flavor").unwrap_or_default(); let remote_data_dir = opts.get_as::(REMOTE_DATA_KEY); diff --git a/vortex-bench/src/v3.rs b/vortex-bench/src/v3.rs index 99c85314fbe..efa961bc2ee 100644 --- a/vortex-bench/src/v3.rs +++ b/vortex-bench/src/v3.rs @@ -292,6 +292,7 @@ fn canonical_tpc_scale_factor(scale_factor: &str) -> String { /// | `PolarSignals { n_rows: _ }`| `polarsignals` | `None` | `None` | Same as StatPopGen. | /// | `Fineweb` | `fineweb` | `None` | `None` | | /// | `GhArchive` | `gharchive` | `None` | `None` | | +/// | `Appian` | `appian` | `None` | `None` | Static dataset; no scale factor. | /// | `PublicBi { name }` | `public-bi` | dataset name (e.g. `cms-provider`) | `None` | Sub-dataset name lives in `dataset_variant`. | pub fn benchmark_dataset_dims(d: &BenchmarkDataset) -> (String, Option, Option) { match d { @@ -321,6 +322,7 @@ pub fn benchmark_dataset_dims(d: &BenchmarkDataset) -> (String, Option, BenchmarkDataset::PolarSignals { .. 
} => ("polarsignals".to_string(), None, None), BenchmarkDataset::Fineweb => ("fineweb".to_string(), None, None), BenchmarkDataset::GhArchive => ("gharchive".to_string(), None, None), + BenchmarkDataset::Appian => ("appian".to_string(), None, None), } } @@ -700,6 +702,7 @@ mod tests { ), (BenchmarkDataset::Fineweb, "fineweb"), (BenchmarkDataset::GhArchive, "gharchive"), + (BenchmarkDataset::Appian, "appian"), ] { let (ds, variant, sf) = benchmark_dataset_dims(&case); assert_eq!(ds, expected, "dataset for {case:?}"); From c286abe76dafbc0f63863f8ea611033f7c9ec0f0 Mon Sep 17 00:00:00 2001 From: mprammer Date: Tue, 19 May 2026 10:04:03 -0400 Subject: [PATCH 2/4] bench: extract Appian queries to per-file .sql The eight Appian queries were ~340 lines of embedded string literals in appian/mod.rs, which is awkward to read and diff. Pull each one into its own `queries/qXX.sql` file (mirroring the upstream DuckDB layout) and embed via `include_str!` so it stays compile-time with no runtime fs read. Refreshing from upstream now reduces to dropping new .sql files into `queries/` and adding one line to the `QUERIES` array. 
Co-Authored-By: Claude Signed-off-by: mprammer --- vortex-bench/src/appian/mod.rs | 353 +----------------------- vortex-bench/src/appian/queries/q01.sql | 8 + vortex-bench/src/appian/queries/q02.sql | 66 +++++ vortex-bench/src/appian/queries/q03.sql | 37 +++ vortex-bench/src/appian/queries/q04.sql | 50 ++++ vortex-bench/src/appian/queries/q05.sql | 24 ++ vortex-bench/src/appian/queries/q06.sql | 47 ++++ vortex-bench/src/appian/queries/q07.sql | 35 +++ vortex-bench/src/appian/queries/q08.sql | 65 +++++ 9 files changed, 343 insertions(+), 342 deletions(-) create mode 100644 vortex-bench/src/appian/queries/q01.sql create mode 100644 vortex-bench/src/appian/queries/q02.sql create mode 100644 vortex-bench/src/appian/queries/q03.sql create mode 100644 vortex-bench/src/appian/queries/q04.sql create mode 100644 vortex-bench/src/appian/queries/q05.sql create mode 100644 vortex-bench/src/appian/queries/q06.sql create mode 100644 vortex-bench/src/appian/queries/q07.sql create mode 100644 vortex-bench/src/appian/queries/q08.sql diff --git a/vortex-bench/src/appian/mod.rs b/vortex-bench/src/appian/mod.rs index 56d991ee98a..99a9f29fe80 100644 --- a/vortex-bench/src/appian/mod.rs +++ b/vortex-bench/src/appian/mod.rs @@ -70,349 +70,18 @@ const TABLES: &[&str] = &[ "taxrecordview", ]; -/// Eight join-heavy queries copied verbatim from -/// `duckdb/duckdb:benchmark/appian_benchmarks/queries/q0[1-8].sql`. +/// Eight join-heavy queries from `duckdb/duckdb:benchmark/appian_benchmarks/queries/`. +/// Embedded byte-identically at compile time so upstream refreshes are a pure copy +/// into `queries/`. const QUERIES: &[&str] = &[ - // q01 — three-way left join, group by state, sum order-item quantities. 
- "SELECT address_state AS g0, sum(orderItem_quantity) AS p0 -FROM CustomerView c -LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId -LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId -LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId -GROUP BY address_state -ORDER BY address_state -LIMIT 500", - // q02 — eight-CTE breadth-first aggregation across the whole schema. - "SELECT - a.address_state AS g0, - t1rp1 AS g1, - t2rp1 AS g2, - max(t5rp1) AS p0, - avg(t8rp1 * t8rp2) AS p1, - max(t6rp1) AS p2, - count(c.customer_priority) AS p3, - coalesce(avg(t7rp1), 0.0) AS p4 -FROM CustomerView c -LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId -LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId -LEFT OUTER JOIN ( - SELECT sum(creditCard_cvv) AS t1rp1, c.customer_id AS t1pk - FROM CustomerView c - LEFT OUTER JOIN CreditCardView cc ON c.customer_id = cc.creditCard_customerId - GROUP BY c.customer_id - ) t1 ON c.customer_id = t1.t1pk -LEFT OUTER JOIN ( - SELECT min(p.product_likes) AS t2rp1, c.customer_id AS t2pk - FROM CustomerView c - LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId - LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId - LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id - LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name - WHERE ca.category_seasonal = TRUE - GROUP BY c.customer_id - ) t2 ON c.customer_id = t2.t2pk -LEFT OUTER JOIN ( - SELECT max(o.order_subShipments) AS t5rp1, c.customer_id AS t5pk - FROM CustomerView c - LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId - GROUP BY c.customer_id - ) t5 ON c.customer_id = t5pk -LEFT OUTER JOIN ( - SELECT max(coalesce(oi.orderItem_weight, 1)) AS t6rp1, c.customer_id AS t6pk - FROM CustomerView c - LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId - LEFT OUTER JOIN OrderItemView oi ON o.order_id = 
oi.orderItem_orderId - WHERE o.order_serverId IN (1, 3, 5) - GROUP BY c.customer_id - ) t6 ON c.customer_id = t6pk -LEFT OUTER JOIN ( - SELECT count(ca.category_seasonal) AS t7rp1, c.customer_id AS t7pk - FROM CustomerView c - LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId - LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId - LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id - LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name - WHERE ca.category_perishable = TRUE - GROUP BY c.customer_id - ) t7 ON c.customer_id = t7pk -LEFT OUTER JOIN ( - SELECT - sum(creditCard_zip) AS t8rp1, - sum(creditCard_lastChargeAmount) AS t8rp2, - c.customer_id AS t8pk - FROM CustomerView c - LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId - LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number - GROUP BY c.customer_id - ) t8 ON c.customer_id = t8pk -WHERE t.taxRecord_value > 149670.0 -GROUP BY a.address_state, t1rp1, t2rp1 -ORDER BY g0, p0, p1 -LIMIT 500", - // q03 — many-way star join with a CASE expression over a date diff. 
- "SELECT - c.customer_priority AS g0, - t1rp1 AS g1, - t.taxRecord_bracket AS g2, - sum(oi.orderItem_weight) AS p0, - max(ca.category_demandScore) AS p1, - max(ca.category_auditDate) AS p2, - CAST(avg(ca.category_valuation) AS int) AS p3, - sum(t1rp2) AS p4, - sum( - CASE - WHEN p.product_inventoryLastOrderedOn - ca.category_auditDate > 300 THEN 1 - WHEN p.product_inventoryLastOrderedOn - ca.category_auditDate > 150 THEN 10 - WHEN p.product_inventoryLastOrderedOn - ca.category_auditDate > 0 THEN 100 - ELSE 1000 - END +(c.customer_priority * a.address_zone)) AS p5 -FROM OrderItemView oi -LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id -LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id -LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number -LEFT OUTER JOIN CustomerView c ON o.order_customerId = c.customer_id -LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId -LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId -LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name -LEFT OUTER JOIN ( - SELECT - min(cc.creditCard_expirationDate) AS t1rp1, - sum(cc.creditCard_lastChargeAmount) AS t1rp2, - c.customer_id AS t1pk - FROM CustomerView c - LEFT OUTER JOIN CreditCardView cc ON c.customer_id = cc.creditCard_customerId - GROUP BY c.customer_id - ) t1 ON c.customer_id = t1pk -WHERE cc.creditCard_lastChargeAmount > 90.0 AND p.product_price > 34.0 -GROUP BY c.customer_priority, t1rp1, t.taxRecord_bracket -ORDER BY p1, p3, g2 -LIMIT 500", - // q04 — category-rooted fan-out with four parallel sub-aggregations. 
- "SELECT - t2rp1 AS g0, - t3rp1 AS g1, - t4rp1 AS g2, - CAST(avg(cc.creditCard_lastChargeAmount) AS int) AS p0, - min(cc.creditCard_lastChargeTimestamp) AS p1, - count(DISTINCT (cc.creditCard_holder)) AS p2 -FROM CategoryView ca -LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName -LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId -LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id -LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number -LEFT OUTER JOIN ( - SELECT sum(taxRecord_bracket) AS t1rp1, ca.category_name AS t1pk - FROM CategoryView ca - LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName - LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId - LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id - LEFT OUTER JOIN CustomerView c ON o.order_customerId = c.customer_id - LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId - LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId - GROUP BY ca.category_name - ) t1 ON ca.category_name = t1pk -LEFT OUTER JOIN ( - SELECT max(p.product_likes) AS t2rp1, ca.category_name AS t2pk - FROM CategoryView ca - LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName - GROUP BY ca.category_name - ) t2 ON ca.category_name = t2pk -LEFT OUTER JOIN ( - SELECT sum(oi.orderItem_productGroup) AS t3rp1, ca.category_name AS t3pk - FROM CategoryView ca - LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName - LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId - WHERE oi.orderItem_weight > 15.0 - GROUP BY ca.category_name - ) t3 ON ca.category_name = t3pk -LEFT OUTER JOIN ( - SELECT max(cc.creditCard_zip) AS t4rp1, ca.category_name AS t4pk - FROM CategoryView ca - LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName - LEFT OUTER JOIN OrderItemView oi ON p.product_id = 
oi.orderItem_productId - LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id - LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number - GROUP BY ca.category_name - ) t4 ON ca.category_name = t4pk -WHERE t1rp1 > 6 -GROUP BY t2rp1, t3rp1, t4rp1 -ORDER BY g1, p2 -LIMIT 500", - // q05 — tax-record rooted query with a timestamp-bound subquery. - "SELECT t.taxRecord_rate AS g0, t2rp1 AS g1, min(c.customer_balance) AS p0 -FROM TaxRecordView t -LEFT OUTER JOIN AddressView a ON t.taxRecord_addressId = a.address_id -LEFT OUTER JOIN CustomerView c ON a.address_customerId = c.customer_id -LEFT OUTER JOIN ( - SELECT min(o.order_placedOn) AS t1rp1, t.taxRecord_id AS t1pk - FROM TaxRecordView t - LEFT OUTER JOIN AddressView a ON t.taxRecord_addressId = a.address_id - LEFT OUTER JOIN OrderView o ON a.address_customerId = o.order_customerId - GROUP BY t.taxRecord_id - ) t1 ON t.taxRecord_id = t1pk -LEFT OUTER JOIN ( - SELECT sum(p.product_price * oi.orderItem_quantity) AS t2rp1, t.taxRecord_id AS t2pk - FROM TaxRecordView t - LEFT OUTER JOIN AddressView a ON t.taxRecord_addressId = a.address_id - LEFT OUTER JOIN OrderView o ON a.address_customerId = o.order_customerId - LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId - LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id - GROUP BY t.taxRecord_id - ) t2 ON t.taxRecord_id = t2pk -WHERE t1rp1 > '2020-01-14 12:12:30.0' -GROUP BY t.taxRecord_rate, t2rp1 -ORDER BY p0 -LIMIT 500", - // q06 — product-rooted with cascading CASE buckets over CTE outputs. 
- "SELECT - t1rp2 AS g0, - sum(t1rp3) / sum(t1rp4) AS p0, - sum( - CASE - WHEN t1rp5 > 1 THEN 1 - WHEN t2rp1 > 20200 THEN 2 - WHEN t1rp6 > 15 THEN 3 - WHEN t3rp1 > 150 THEN 4 - ELSE 5 - END) AS p1 -FROM ProductView p -LEFT OUTER JOIN ( - SELECT - avg(a.address_valuation) AS t1rp1, - sum(a.address_zone) AS t1rp2, - sum(a.address_zone) AS t1rp3, - count(a.address_zone) AS t1rp4, - avg(o.order_serverId) AS t1rp5, - avg(c.customer_balance) AS t1rp6, - p.product_id AS t1pk - FROM ProductView p - LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId - LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id - LEFT OUTER JOIN AddressView a ON o.order_customerId = a.address_customerId - LEFT OUTER JOIN CustomerView c ON o.order_customerId = c.customer_id - GROUP BY p.product_id - ) t1 ON p.product_id = t1pk -LEFT OUTER JOIN ( - SELECT min(a.address_zip) AS t2rp1, p.product_id AS t2pk - FROM ProductView p - LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId - LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id - LEFT OUTER JOIN AddressView a ON o.order_customerId = a.address_customerId - WHERE a.address_state IN ('PA', 'CA', 'VA', 'MA', 'ME', 'MD', 'CO', 'MO') - GROUP BY p.product_id - ) t2 ON p.product_id = t2pk -LEFT OUTER JOIN ( - SELECT ca.category_warehouseSqft AS t3rp1, p.product_id AS t3pk - FROM ProductView p - LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name - WHERE ca.category_seasonal = TRUE - ) t3 ON p.product_id = t3pk -WHERE t1rp1 > 10000.0 -GROUP BY t1rp2 -ORDER BY p0 -LIMIT 500", - // q07 — customer-rooted with derived divisions and an IN filter on CC fields. 
- "SELECT - t1rp1 AS g0, - t2rp1 AS g1, - c.customer_age AS g2, - c.customer_balance AS g3, - count(c.customer_name) AS p0, - sum(c.customer_age) AS p1 -FROM CustomerView c -LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId -LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId -LEFT OUTER JOIN ( - SELECT avg(oi.orderItem_weight) AS t1rp1, c.customer_id AS t1pk - FROM CustomerView c - LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId - LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number - LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId - WHERE creditCard_cvv IN (113, 115, 117, 119, 121) - GROUP BY c.customer_id - ) t1 ON c.customer_id = t1pk -LEFT OUTER JOIN ( - SELECT - avg((oi.orderItem_quantity * p.product_price) /(oi.orderItem_weight + oi.orderItem_sku)) AS t2rp1, - c.customer_id AS t2pk - FROM CustomerView c - LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId - LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId - LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id - LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name - WHERE ca.category_name IN ('Pet', 'Food', 'Game', 'Software') - GROUP BY c.customer_id - ) t2 ON c.customer_id = t2pk -WHERE t.taxRecord_bracketThreshold IN (22, 24, 27, 29) -GROUP BY t1rp1, t2rp1, c.customer_age, c.customer_balance -ORDER BY p0, p1 -LIMIT 500", - // q08 — credit-card-rooted query with six parallel sub-aggregations. 
- "SELECT - t4rp1 AS g0, - t5rp1 AS g1, - sum(creditCard_lastChargeAmount) AS p0, - min(t6rp1) AS p1, - sum(t3rp2) AS p2 -FROM CreditCardView cc -LEFT OUTER JOIN ( - SELECT min(order_id) AS t1rp1, creditCard_number AS t1pk - FROM CreditCardView cc - LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber - WHERE order_slaProbability > 0.125 - GROUP BY creditCard_number - ) t1 ON cc.creditCard_number = t1pk -LEFT OUTER JOIN ( - SELECT sum(orderItem_weight) AS t2rp1, creditCard_number AS t2pk - FROM CreditCardView cc - LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber - LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId - GROUP BY creditCard_number - ) t2 ON cc.creditCard_number = t2pk -LEFT OUTER JOIN ( - SELECT - min(address_zip) AS t3rp1, - sum(taxRecord_bracketThreshold) AS t3rp2, - creditCard_number AS t3pk - FROM CreditCardView cc - LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber - LEFT OUTER JOIN CustomerView c ON o.order_customerId = c.customer_id - LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId - LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId - GROUP BY creditCard_number - ) t3 ON cc.creditCard_number = t3pk -LEFT OUTER JOIN ( - SELECT sum(product_price) AS t4rp1, creditCard_number AS t4pk - FROM CreditCardView cc - LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber - LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId - LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id - WHERE orderItem_weight < 25.0 - GROUP BY creditCard_number - ) t4 ON cc.creditCard_number = t4pk -LEFT OUTER JOIN ( - SELECT sum(category_regulationProbability) AS t5rp1, creditCard_number AS t5pk - FROM CreditCardView cc - LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber - LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId - 
LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id - LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name - GROUP BY creditCard_number - ) t5 ON cc.creditCard_number = t5pk -LEFT OUTER JOIN ( - SELECT min(product_inventoryLastOrderedOn) AS t6rp1, creditCard_number AS t6pk - FROM CreditCardView cc - LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber - LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId - LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id - LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name - WHERE product_price < 200.0 - GROUP BY creditCard_number - ) t6 ON cc.creditCard_number = t6pk -WHERE t1rp1 > 10000 OR t2rp1 > 15 OR t3rp1 > 20200 -GROUP BY t4rp1, t5rp1 -ORDER BY p0, p1, p2 -LIMIT 500", + include_str!("queries/q01.sql"), + include_str!("queries/q02.sql"), + include_str!("queries/q03.sql"), + include_str!("queries/q04.sql"), + include_str!("queries/q05.sql"), + include_str!("queries/q06.sql"), + include_str!("queries/q07.sql"), + include_str!("queries/q08.sql"), ]; /// Benchmark over the [Appian benchmark suite from DuckDB][upstream]. 
diff --git a/vortex-bench/src/appian/queries/q01.sql b/vortex-bench/src/appian/queries/q01.sql new file mode 100644 index 00000000000..a182ae4e748 --- /dev/null +++ b/vortex-bench/src/appian/queries/q01.sql @@ -0,0 +1,8 @@ +SELECT address_state AS g0, sum(orderItem_quantity) AS p0 +FROM CustomerView c +LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId +LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId +LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId +GROUP BY address_state +ORDER BY address_state +LIMIT 500; diff --git a/vortex-bench/src/appian/queries/q02.sql b/vortex-bench/src/appian/queries/q02.sql new file mode 100644 index 00000000000..699fc79fa03 --- /dev/null +++ b/vortex-bench/src/appian/queries/q02.sql @@ -0,0 +1,66 @@ +SELECT + a.address_state AS g0, + t1rp1 AS g1, + t2rp1 AS g2, + max(t5rp1) AS p0, + avg(t8rp1 * t8rp2) AS p1, + max(t6rp1) AS p2, + count(c.customer_priority) AS p3, + coalesce(avg(t7rp1), 0.0) AS p4 +FROM CustomerView c +LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId +LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId +LEFT OUTER JOIN ( + SELECT sum(creditCard_cvv) AS t1rp1, c.customer_id AS t1pk + FROM CustomerView c + LEFT OUTER JOIN CreditCardView cc ON c.customer_id = cc.creditCard_customerId + GROUP BY c.customer_id + ) t1 ON c.customer_id = t1.t1pk +LEFT OUTER JOIN ( + SELECT min(p.product_likes) AS t2rp1, c.customer_id AS t2pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + WHERE ca.category_seasonal = TRUE + GROUP BY c.customer_id + ) t2 ON c.customer_id = t2.t2pk +LEFT OUTER JOIN ( + SELECT max(o.order_subShipments) AS t5rp1, c.customer_id AS t5pk + FROM CustomerView 
c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + GROUP BY c.customer_id + ) t5 ON c.customer_id = t5pk +LEFT OUTER JOIN ( + SELECT max(coalesce(oi.orderItem_weight, 1)) AS t6rp1, c.customer_id AS t6pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + WHERE o.order_serverId IN (1, 3, 5) + GROUP BY c.customer_id + ) t6 ON c.customer_id = t6pk +LEFT OUTER JOIN ( + SELECT count(ca.category_seasonal) AS t7rp1, c.customer_id AS t7pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + WHERE ca.category_perishable = TRUE + GROUP BY c.customer_id + ) t7 ON c.customer_id = t7pk +LEFT OUTER JOIN ( + SELECT + sum(creditCard_zip) AS t8rp1, + sum(creditCard_lastChargeAmount) AS t8rp2, + c.customer_id AS t8pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number + GROUP BY c.customer_id + ) t8 ON c.customer_id = t8pk +WHERE t.taxRecord_value > 149670.0 +GROUP BY a.address_state, t1rp1, t2rp1 +ORDER BY g0, p0, p1 +LIMIT 500; diff --git a/vortex-bench/src/appian/queries/q03.sql b/vortex-bench/src/appian/queries/q03.sql new file mode 100644 index 00000000000..a5af31e7c39 --- /dev/null +++ b/vortex-bench/src/appian/queries/q03.sql @@ -0,0 +1,37 @@ +SELECT + c.customer_priority AS g0, + t1rp1 AS g1, + t.taxRecord_bracket AS g2, + sum(oi.orderItem_weight) AS p0, + max(ca.category_demandScore) AS p1, + max(ca.category_auditDate) AS p2, + CAST(avg(ca.category_valuation) AS int) AS p3, + sum(t1rp2) AS p4, + sum( + CASE + WHEN p.product_inventoryLastOrderedOn - 
ca.category_auditDate > 300 THEN 1 + WHEN p.product_inventoryLastOrderedOn - ca.category_auditDate > 150 THEN 10 + WHEN p.product_inventoryLastOrderedOn - ca.category_auditDate > 0 THEN 100 + ELSE 1000 + END +(c.customer_priority * a.address_zone)) AS p5 +FROM OrderItemView oi +LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id +LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id +LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number +LEFT OUTER JOIN CustomerView c ON o.order_customerId = c.customer_id +LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId +LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId +LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name +LEFT OUTER JOIN ( + SELECT + min(cc.creditCard_expirationDate) AS t1rp1, + sum(cc.creditCard_lastChargeAmount) AS t1rp2, + c.customer_id AS t1pk + FROM CustomerView c + LEFT OUTER JOIN CreditCardView cc ON c.customer_id = cc.creditCard_customerId + GROUP BY c.customer_id + ) t1 ON c.customer_id = t1pk +WHERE cc.creditCard_lastChargeAmount > 90.0 AND p.product_price > 34.0 +GROUP BY c.customer_priority, t1rp1, t.taxRecord_bracket +ORDER BY p1, p3, g2 +LIMIT 500; diff --git a/vortex-bench/src/appian/queries/q04.sql b/vortex-bench/src/appian/queries/q04.sql new file mode 100644 index 00000000000..8a3c74e21d7 --- /dev/null +++ b/vortex-bench/src/appian/queries/q04.sql @@ -0,0 +1,50 @@ +SELECT + t2rp1 AS g0, + t3rp1 AS g1, + t4rp1 AS g2, + CAST(avg(cc.creditCard_lastChargeAmount) AS int) AS p0, + min(cc.creditCard_lastChargeTimestamp) AS p1, + count(DISTINCT (cc.creditCard_holder)) AS p2 +FROM CategoryView ca +LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName +LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId +LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id +LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = 
cc.creditCard_number +LEFT OUTER JOIN ( + SELECT sum(taxRecord_bracket) AS t1rp1, ca.category_name AS t1pk + FROM CategoryView ca + LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName + LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId + LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id + LEFT OUTER JOIN CustomerView c ON o.order_customerId = c.customer_id + LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId + LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId + GROUP BY ca.category_name + ) t1 ON ca.category_name = t1pk +LEFT OUTER JOIN ( + SELECT max(p.product_likes) AS t2rp1, ca.category_name AS t2pk + FROM CategoryView ca + LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName + GROUP BY ca.category_name + ) t2 ON ca.category_name = t2pk +LEFT OUTER JOIN ( + SELECT sum(oi.orderItem_productGroup) AS t3rp1, ca.category_name AS t3pk + FROM CategoryView ca + LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName + LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId + WHERE oi.orderItem_weight > 15.0 + GROUP BY ca.category_name + ) t3 ON ca.category_name = t3pk +LEFT OUTER JOIN ( + SELECT max(cc.creditCard_zip) AS t4rp1, ca.category_name AS t4pk + FROM CategoryView ca + LEFT OUTER JOIN ProductView p ON ca.category_name = p.product_categoryName + LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId + LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id + LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number + GROUP BY ca.category_name + ) t4 ON ca.category_name = t4pk +WHERE t1rp1 > 6 +GROUP BY t2rp1, t3rp1, t4rp1 +ORDER BY g1, p2 +LIMIT 500; diff --git a/vortex-bench/src/appian/queries/q05.sql b/vortex-bench/src/appian/queries/q05.sql new file mode 100644 index 00000000000..8707a8e6744 --- /dev/null +++ 
b/vortex-bench/src/appian/queries/q05.sql @@ -0,0 +1,24 @@ +SELECT t.taxRecord_rate AS g0, t2rp1 AS g1, min(c.customer_balance) AS p0 +FROM TaxRecordView t +LEFT OUTER JOIN AddressView a ON t.taxRecord_addressId = a.address_id +LEFT OUTER JOIN CustomerView c ON a.address_customerId = c.customer_id +LEFT OUTER JOIN ( + SELECT min(o.order_placedOn) AS t1rp1, t.taxRecord_id AS t1pk + FROM TaxRecordView t + LEFT OUTER JOIN AddressView a ON t.taxRecord_addressId = a.address_id + LEFT OUTER JOIN OrderView o ON a.address_customerId = o.order_customerId + GROUP BY t.taxRecord_id + ) t1 ON t.taxRecord_id = t1pk +LEFT OUTER JOIN ( + SELECT sum(p.product_price * oi.orderItem_quantity) AS t2rp1, t.taxRecord_id AS t2pk + FROM TaxRecordView t + LEFT OUTER JOIN AddressView a ON t.taxRecord_addressId = a.address_id + LEFT OUTER JOIN OrderView o ON a.address_customerId = o.order_customerId + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + GROUP BY t.taxRecord_id + ) t2 ON t.taxRecord_id = t2pk +WHERE t1rp1 > '2020-01-14 12:12:30.0' +GROUP BY t.taxRecord_rate, t2rp1 +ORDER BY p0 +LIMIT 500; diff --git a/vortex-bench/src/appian/queries/q06.sql b/vortex-bench/src/appian/queries/q06.sql new file mode 100644 index 00000000000..71d558781c0 --- /dev/null +++ b/vortex-bench/src/appian/queries/q06.sql @@ -0,0 +1,47 @@ +SELECT + t1rp2 AS g0, + sum(t1rp3) / sum(t1rp4) AS p0, + sum( + CASE + WHEN t1rp5 > 1 THEN 1 + WHEN t2rp1 > 20200 THEN 2 + WHEN t1rp6 > 15 THEN 3 + WHEN t3rp1 > 150 THEN 4 + ELSE 5 + END) AS p1 +FROM ProductView p +LEFT OUTER JOIN ( + SELECT + avg(a.address_valuation) AS t1rp1, + sum(a.address_zone) AS t1rp2, + sum(a.address_zone) AS t1rp3, + count(a.address_zone) AS t1rp4, + avg(o.order_serverId) AS t1rp5, + avg(c.customer_balance) AS t1rp6, + p.product_id AS t1pk + FROM ProductView p + LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId + LEFT OUTER JOIN 
OrderView o ON oi.orderItem_orderId = o.order_id + LEFT OUTER JOIN AddressView a ON o.order_customerId = a.address_customerId + LEFT OUTER JOIN CustomerView c ON o.order_customerId = c.customer_id + GROUP BY p.product_id + ) t1 ON p.product_id = t1pk +LEFT OUTER JOIN ( + SELECT min(a.address_zip) AS t2rp1, p.product_id AS t2pk + FROM ProductView p + LEFT OUTER JOIN OrderItemView oi ON p.product_id = oi.orderItem_productId + LEFT OUTER JOIN OrderView o ON oi.orderItem_orderId = o.order_id + LEFT OUTER JOIN AddressView a ON o.order_customerId = a.address_customerId + WHERE a.address_state IN ('PA', 'CA', 'VA', 'MA', 'ME', 'MD', 'CO', 'MO') + GROUP BY p.product_id + ) t2 ON p.product_id = t2pk +LEFT OUTER JOIN ( + SELECT ca.category_warehouseSqft AS t3rp1, p.product_id AS t3pk + FROM ProductView p + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + WHERE ca.category_seasonal = TRUE + ) t3 ON p.product_id = t3pk +WHERE t1rp1 > 10000.0 +GROUP BY t1rp2 +ORDER BY p0 +LIMIT 500; diff --git a/vortex-bench/src/appian/queries/q07.sql b/vortex-bench/src/appian/queries/q07.sql new file mode 100644 index 00000000000..a3165f267bb --- /dev/null +++ b/vortex-bench/src/appian/queries/q07.sql @@ -0,0 +1,35 @@ +SELECT + t1rp1 AS g0, + t2rp1 AS g1, + c.customer_age AS g2, + c.customer_balance AS g3, + count(c.customer_name) AS p0, + sum(c.customer_age) AS p1 +FROM CustomerView c +LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId +LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId +LEFT OUTER JOIN ( + SELECT avg(oi.orderItem_weight) AS t1rp1, c.customer_id AS t1pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN CreditCardView cc ON o.order_creditCardNumber = cc.creditCard_number + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + WHERE creditCard_cvv IN (113, 115, 117, 119, 121) + GROUP BY c.customer_id + ) t1 ON c.customer_id = t1pk 
+LEFT OUTER JOIN ( + SELECT + avg((oi.orderItem_quantity * p.product_price) /(oi.orderItem_weight + oi.orderItem_sku)) AS t2rp1, + c.customer_id AS t2pk + FROM CustomerView c + LEFT OUTER JOIN OrderView o ON c.customer_id = o.order_customerId + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + WHERE ca.category_name IN ('Pet', 'Food', 'Game', 'Software') + GROUP BY c.customer_id + ) t2 ON c.customer_id = t2pk +WHERE t.taxRecord_bracketThreshold IN (22, 24, 27, 29) +GROUP BY t1rp1, t2rp1, c.customer_age, c.customer_balance +ORDER BY p0, p1 +LIMIT 500; diff --git a/vortex-bench/src/appian/queries/q08.sql b/vortex-bench/src/appian/queries/q08.sql new file mode 100644 index 00000000000..e32f5f242f1 --- /dev/null +++ b/vortex-bench/src/appian/queries/q08.sql @@ -0,0 +1,65 @@ +SELECT + t4rp1 AS g0, + t5rp1 AS g1, + sum(creditCard_lastChargeAmount) AS p0, + min(t6rp1) AS p1, + sum(t3rp2) AS p2 +FROM CreditCardView cc +LEFT OUTER JOIN ( + SELECT min(order_id) AS t1rp1, creditCard_number AS t1pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + WHERE order_slaProbability > 0.125 + GROUP BY creditCard_number + ) t1 ON cc.creditCard_number = t1pk +LEFT OUTER JOIN ( + SELECT sum(orderItem_weight) AS t2rp1, creditCard_number AS t2pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + GROUP BY creditCard_number + ) t2 ON cc.creditCard_number = t2pk +LEFT OUTER JOIN ( + SELECT + min(address_zip) AS t3rp1, + sum(taxRecord_bracketThreshold) AS t3rp2, + creditCard_number AS t3pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + LEFT OUTER JOIN CustomerView c ON 
o.order_customerId = c.customer_id + LEFT OUTER JOIN AddressView a ON c.customer_id = a.address_customerId + LEFT OUTER JOIN TaxRecordView t ON a.address_id = t.taxRecord_addressId + GROUP BY creditCard_number + ) t3 ON cc.creditCard_number = t3pk +LEFT OUTER JOIN ( + SELECT sum(product_price) AS t4rp1, creditCard_number AS t4pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + WHERE orderItem_weight < 25.0 + GROUP BY creditCard_number + ) t4 ON cc.creditCard_number = t4pk +LEFT OUTER JOIN ( + SELECT sum(category_regulationProbability) AS t5rp1, creditCard_number AS t5pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + GROUP BY creditCard_number + ) t5 ON cc.creditCard_number = t5pk +LEFT OUTER JOIN ( + SELECT min(product_inventoryLastOrderedOn) AS t6rp1, creditCard_number AS t6pk + FROM CreditCardView cc + LEFT OUTER JOIN OrderView o ON cc.creditCard_number = o.order_creditCardNumber + LEFT OUTER JOIN OrderItemView oi ON o.order_id = oi.orderItem_orderId + LEFT OUTER JOIN ProductView p ON oi.orderItem_productId = p.product_id + LEFT OUTER JOIN CategoryView ca ON p.product_categoryName = ca.category_name + WHERE product_price < 200.0 + GROUP BY creditCard_number + ) t6 ON cc.creditCard_number = t6pk +WHERE t1rp1 > 10000 OR t2rp1 > 15 OR t3rp1 > 20200 +GROUP BY t4rp1, t5rp1 +ORDER BY p0, p1, p2 +LIMIT 500; From acc9d1452919bac4878007617ad7e503d04a5772 Mon Sep 17 00:00:00 2001 From: mprammer Date: Tue, 19 May 2026 10:26:34 -0400 Subject: [PATCH 3/4] bench: align Appian query layout with 
TPC-H Move the Appian .sql files from `vortex-bench/src/appian/queries/q0N.sql` to `vortex-bench/appian/qN.sql` and load them at runtime through `appian_queries()`, mirroring `tpch_queries()` and `tpcds_queries()`. The prior `include_str!` setup was a workspace-novel pattern; this matches the existing TPC-H convention so reviewers don't have to learn a new one. Side effect: query indices in bench output are now 1-based (q1..q8) like TPC-H, instead of the 0-based numbering the old `enumerate()` produced. No historical Appian results to break since this is a new benchmark. Co-Authored-By: Claude Signed-off-by: mprammer --- .../appian/queries/q01.sql => appian/q1.sql} | 0 .../appian/queries/q02.sql => appian/q2.sql} | 0 .../appian/queries/q03.sql => appian/q3.sql} | 0 .../appian/queries/q04.sql => appian/q4.sql} | 0 .../appian/queries/q05.sql => appian/q5.sql} | 0 .../appian/queries/q06.sql => appian/q6.sql} | 0 .../appian/queries/q07.sql => appian/q7.sql} | 0 .../appian/queries/q08.sql => appian/q8.sql} | 0 vortex-bench/src/appian/mod.rs | 36 ++++++++++--------- 9 files changed, 20 insertions(+), 16 deletions(-) rename vortex-bench/{src/appian/queries/q01.sql => appian/q1.sql} (100%) rename vortex-bench/{src/appian/queries/q02.sql => appian/q2.sql} (100%) rename vortex-bench/{src/appian/queries/q03.sql => appian/q3.sql} (100%) rename vortex-bench/{src/appian/queries/q04.sql => appian/q4.sql} (100%) rename vortex-bench/{src/appian/queries/q05.sql => appian/q5.sql} (100%) rename vortex-bench/{src/appian/queries/q06.sql => appian/q6.sql} (100%) rename vortex-bench/{src/appian/queries/q07.sql => appian/q7.sql} (100%) rename vortex-bench/{src/appian/queries/q08.sql => appian/q8.sql} (100%) diff --git a/vortex-bench/src/appian/queries/q01.sql b/vortex-bench/appian/q1.sql similarity index 100% rename from vortex-bench/src/appian/queries/q01.sql rename to vortex-bench/appian/q1.sql diff --git a/vortex-bench/src/appian/queries/q02.sql b/vortex-bench/appian/q2.sql similarity 
index 100% rename from vortex-bench/src/appian/queries/q02.sql rename to vortex-bench/appian/q2.sql diff --git a/vortex-bench/src/appian/queries/q03.sql b/vortex-bench/appian/q3.sql similarity index 100% rename from vortex-bench/src/appian/queries/q03.sql rename to vortex-bench/appian/q3.sql diff --git a/vortex-bench/src/appian/queries/q04.sql b/vortex-bench/appian/q4.sql similarity index 100% rename from vortex-bench/src/appian/queries/q04.sql rename to vortex-bench/appian/q4.sql diff --git a/vortex-bench/src/appian/queries/q05.sql b/vortex-bench/appian/q5.sql similarity index 100% rename from vortex-bench/src/appian/queries/q05.sql rename to vortex-bench/appian/q5.sql diff --git a/vortex-bench/src/appian/queries/q06.sql b/vortex-bench/appian/q6.sql similarity index 100% rename from vortex-bench/src/appian/queries/q06.sql rename to vortex-bench/appian/q6.sql diff --git a/vortex-bench/src/appian/queries/q07.sql b/vortex-bench/appian/q7.sql similarity index 100% rename from vortex-bench/src/appian/queries/q07.sql rename to vortex-bench/appian/q7.sql diff --git a/vortex-bench/src/appian/queries/q08.sql b/vortex-bench/appian/q8.sql similarity index 100% rename from vortex-bench/src/appian/queries/q08.sql rename to vortex-bench/appian/q8.sql diff --git a/vortex-bench/src/appian/mod.rs b/vortex-bench/src/appian/mod.rs index 99a9f29fe80..44f8815f9fb 100644 --- a/vortex-bench/src/appian/mod.rs +++ b/vortex-bench/src/appian/mod.rs @@ -22,6 +22,8 @@ //! against the lowercased Parquet schema, while DuckDB's case-insensitive unquoted //! identifier resolution makes the original case irrelevant. 
+use std::fs; +use std::path::Path; use std::path::PathBuf; use std::process::Command; @@ -30,6 +32,7 @@ use anyhow::bail; use glob::Pattern; use tracing::info; use url::Url; +use vortex::error::VortexExpect; use crate::Benchmark; use crate::BenchmarkDataset; @@ -70,19 +73,20 @@ const TABLES: &[&str] = &[ "taxrecordview", ]; -/// Eight join-heavy queries from `duckdb/duckdb:benchmark/appian_benchmarks/queries/`. -/// Embedded byte-identically at compile time so upstream refreshes are a pure copy -/// into `queries/`. -const QUERIES: &[&str] = &[ - include_str!("queries/q01.sql"), - include_str!("queries/q02.sql"), - include_str!("queries/q03.sql"), - include_str!("queries/q04.sql"), - include_str!("queries/q05.sql"), - include_str!("queries/q06.sql"), - include_str!("queries/q07.sql"), - include_str!("queries/q08.sql"), -]; +/// Eight join-heavy queries from `duckdb/duckdb:benchmark/appian_benchmarks/queries/`, +/// stored byte-identically under `vortex-bench/appian/q{1..8}.sql` (sibling of the TPC-H +/// `tpch/q*.sql` layout). Upstream refreshes are a pure copy into that directory. +pub fn appian_queries() -> impl Iterator<Item = (usize, String)> { + (1..=8).map(|q| (q, appian_query(q))) +} + +fn appian_query(query_idx: usize) -> String { + let path = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("appian") + .join(format!("q{query_idx}")) + .with_extension("sql"); + fs::read_to_string(path).vortex_expect("cannot load appian query from file") +} /// Benchmark over the [Appian benchmark suite from DuckDB][upstream]. 
/// @@ -117,7 +121,7 @@ impl AppianBenchmark { #[async_trait::async_trait] impl Benchmark for AppianBenchmark { fn queries(&self) -> anyhow::Result> { - Ok(QUERIES.iter().map(|s| s.to_string()).enumerate().collect()) + Ok(appian_queries().collect()) } async fn generate_base_data(&self) -> anyhow::Result<()> { @@ -126,7 +130,7 @@ impl Benchmark for AppianBenchmark { } let parquet_dir = self.parquet_dir()?; - std::fs::create_dir_all(&parquet_dir)?; + fs::create_dir_all(&parquet_dir)?; // Idempotency: if every target Parquet is already in place, do nothing. if TABLES @@ -215,7 +219,7 @@ impl Benchmark for AppianBenchmark { /// Run a single `duckdb` invocation that returns, for each upstream Appian table, a /// projection string of the form `"OrigName" AS "origname", ...` so the `COPY` statements /// below can lowercase every column name without enumerating them by hand. -fn discover_projections(blob: &std::path::Path) -> anyhow::Result> { +fn discover_projections(blob: &Path) -> anyhow::Result> { // `chr(31)` (unit separator) keeps `table_name` and the projection list distinct in // the single-column `-list` output without colliding with `|` (list separator) or // `,` (projection delimiter). From 9f6b3157a74429e271835035c1ac0f2b7ea98bbe Mon Sep 17 00:00:00 2001 From: mprammer Date: Tue, 19 May 2026 11:03:10 -0400 Subject: [PATCH 4/4] bench: drop private intra-doc link in appian module docs The module-level `//!` doc had `[`TABLES`]` as an intra-doc link to a private const, which `Rust (docs)` CI flags via `-D warnings` -> `rustdoc::private-intra-doc-links`. Replace the bracketed link with plain code formatting; the reference is in the same file, so a reader can find it without the navigation aid. 
Co-Authored-By: Claude Signed-off-by: mprammer --- vortex-bench/src/appian/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-bench/src/appian/mod.rs b/vortex-bench/src/appian/mod.rs index 44f8815f9fb..3f22bf9c07f 100644 --- a/vortex-bench/src/appian/mod.rs +++ b/vortex-bench/src/appian/mod.rs @@ -17,7 +17,7 @@ //! while the Parquet schema and registered table names preserve case → field-not-found). //! //! The conversion below lowercases every column at COPY time, and the table names in -//! [`TABLES`] are already lowercase. Both engines then resolve the verbatim camelCase +//! `TABLES` are already lowercase. Both engines then resolve the verbatim camelCase //! queries the same way: DataFusion lowercases the query identifiers and matches them //! against the lowercased Parquet schema, while DuckDB's case-insensitive unquoted //! identifier resolution makes the original case irrelevant.