Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions rust/lance-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,16 @@ log.workspace = true
libc = { version = "0.2" }

[dev-dependencies]
criterion.workspace = true
proptest.workspace = true
rstest.workspace = true

[features]
datafusion = ["dep:datafusion-common", "dep:datafusion-sql"]

[[bench]]
name = "row_addr_mask"
harness = false

[lints]
workspace = true
292 changes: 292 additions & 0 deletions rust/lance-core/benches/row_addr_mask.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,292 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Benchmarks for `RowAddrMask` / `RowAddrTreeMap`.
//!
//! These benchmarks are deliberately structured to expose the row-cardinality
//! scaling weakness of the current per-row bitmap representation. Producers
//! (e.g. scalar-index `search` implementations) and consumers (e.g.
//! `mask_to_offset_ranges`) are frequently range-shaped, but every operation
//! must round-trip through `Partial(RoaringBitmap)` and therefore costs O(N)
//! in the number of rows, not O(R) in the number of distinct ranges.
//!
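//! Most benchmarks vary the number of rows while keeping the number of
//! ranges fixed at 1. A range-aware representation should make these
//! near-constant time; on the per-row bitmap path they are linear in N.
//! The `*_constant_cardinality` groups instead hold the total row count
//! fixed at 1M and vary the number of runs.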
//!
//! Run with `cargo bench -p lance-core --bench row_addr_mask`.

use std::ops::Range;

use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
use lance_core::utils::mask::{RowAddrMask, RowAddrTreeMap};

/// Row counts swept by the per-row-count benchmark groups.
///
/// Chosen to cover the realistic range of matches a zonemap produces for an
/// `IS NULL`-like predicate on a single fragment: from ten thousand rows up
/// to ten million, in powers of ten. Every value fits in a `u32`, which the
/// benchmarks that cast a row count to `u32` rely on.
const ROW_COUNTS: &[u64] = &[10_000, 100_000, 1_000_000, 10_000_000];

/// Builds a `RowAddrTreeMap` holding the single contiguous run `0..num_rows`.
///
/// Per the benchmark's motivation, this is the shape a scalar-index search
/// yields when a contiguous chunk of zones matches. For the row counts used
/// in this file the whole run lies in fragment 0.
fn make_range_mask(num_rows: u64) -> RowAddrTreeMap {
    let mut mask = RowAddrTreeMap::new();
    mask.insert_range(0..num_rows);
    mask
}

/// Producer side: time spent constructing a mask from one contiguous range.
///
/// Group `insert_range_single_run`, one benchmark per entry of `ROW_COUNTS`,
/// with throughput reported in rows. Each iteration creates an empty
/// `RowAddrTreeMap` and calls `insert_range(0..n)` on it.
///
/// The motivation stated for this benchmark is that `insert_range` is O(N)
/// on the bitmap representation, whereas a range-aware representation could
/// record the run in O(1).
fn bench_insert_range(c: &mut Criterion) {
    let mut group = c.benchmark_group("insert_range_single_run");
    for num_rows in ROW_COUNTS.iter().copied() {
        group.throughput(Throughput::Elements(num_rows));
        let id = BenchmarkId::from_parameter(num_rows);
        group.bench_with_input(id, &num_rows, |bencher, &end| {
            bencher.iter(|| {
                // Construction is the measured work, so it happens inside the
                // timed closure.
                let mut mask = RowAddrTreeMap::new();
                mask.insert_range(0..end);
                std::hint::black_box(mask);
            });
        });
    }
    group.finish();
}

/// Consumer cost: iterating every row address in a dense mask.
///
/// `into_addr_iter` walks set bits one at a time. For a contiguous run
/// of N rows this is O(N) — even though the rows are trivially
/// representable as a single Range. This is what `mask_to_offset_ranges`
/// does after intersecting with a source segment: it pays per-row
/// iteration cost only to immediately collapse the addresses back into
/// ranges via `GroupingIterator`.
fn bench_iter_addrs(c: &mut Criterion) {
let mut group = c.benchmark_group("into_addr_iter_single_run");
for &n in ROW_COUNTS {
let map = make_range_mask(n);
group.throughput(Throughput::Elements(n));
group.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, _| {
b.iter(|| {
// SAFETY: the map only contains Partial selections; no Full entries.
let count: u64 = unsafe { map.clone().into_addr_iter() }.count() as u64;
std::hint::black_box(count);
});
});
}
group.finish();
}

/// Reference point: run-wise iteration over the same data.
///
/// Group `next_range_iter_single_run`, one benchmark per entry of
/// `ROW_COUNTS`, throughput reported in rows. Each iteration creates a fresh
/// iterator over a prebuilt `RoaringBitmap` and counts how many times
/// `next_range` yields a value.
///
/// The stated intent is that `Iter::next_range` walks run containers in
/// O(num_runs), so a single contiguous run should take roughly constant
/// time. The public `RowAddrMask` API is described as not exposing this, so
/// comparing against `into_addr_iter_single_run` estimates the speedup a
/// range-aware representation could give consumers.
fn bench_iter_runs(c: &mut Criterion) {
    let mut group = c.benchmark_group("next_range_iter_single_run");
    for num_rows in ROW_COUNTS.iter().copied() {
        // One contiguous run, intended to mirror what `make_range_mask`
        // stores internally. The cast is lossless: every entry of
        // `ROW_COUNTS` is at most 10_000_000.
        let upper = num_rows as u32;
        let mut bitmap = roaring::RoaringBitmap::new();
        bitmap.insert_range(0..upper);
        group.throughput(Throughput::Elements(num_rows));
        let id = BenchmarkId::from_parameter(num_rows);
        group.bench_with_input(id, &num_rows, |bencher, _| {
            bencher.iter(|| {
                let mut cursor = bitmap.iter();
                // Pull ranges until the iterator is exhausted and count them.
                let runs = std::iter::from_fn(|| cursor.next_range()).count() as u64;
                std::hint::black_box(runs);
            });
        });
    }
    group.finish();
}

/// Set intersection of two range-shaped masks.
///
/// Both inputs are single contiguous runs that overlap in their middle
/// half (so the output is itself a single contiguous run). With per-row
/// bitmaps this is O(N) — the entire bitmap participates in the AND.
/// With ranges it would be O(1).
fn bench_intersect_ranges(c: &mut Criterion) {
let mut group = c.benchmark_group("intersect_two_runs");
for &n in ROW_COUNTS {
let lhs = make_range_mask(n);
let rhs_range = (n / 4)..(3 * n / 4);
let mut rhs = RowAddrTreeMap::new();
rhs.insert_range(rhs_range);
group.throughput(Throughput::Elements(n));
group.bench_with_input(BenchmarkId::from_parameter(n), &n, |b, _| {
b.iter(|| {
let mut tmp = lhs.clone();
tmp &= &rhs;
std::hint::black_box(tmp);
});
});
}
group.finish();
}

/// End-to-end round trip: materialize a source range, AND it with a mask,
/// then visit every surviving row address.
///
/// Group `mask_to_offset_ranges_inner_loop`, one benchmark per entry `n` of
/// `ROW_COUNTS`. The source segment is the whole `0..2n` fragment and the
/// allow-list selects its back half `n..2n`, so throughput is reported as
/// `n` rows.
///
/// This is meant to reproduce the slow path of `mask_to_offset_ranges` in
/// `lance-table/src/rowids.rs`. Profiling of a 10M-row zonemap `IS NULL`
/// query reportedly attributed ~55% of hot-loop time (~495 ms of 889 ms) to
/// it; isolating it here lets that cost be tracked separately from the rest
/// of the scan pipeline.
fn bench_range_to_ranges_round_trip(c: &mut Criterion) {
    let mut group = c.benchmark_group("mask_to_offset_ranges_inner_loop");
    for num_rows in ROW_COUNTS.iter().copied() {
        let fragment_rows = 2 * num_rows;
        // Allow-list covering the back half of the fragment.
        let allow = RowAddrMask::AllowList(RowAddrTreeMap::from(num_rows..fragment_rows));
        // Source segment spanning the entire fragment.
        let segment: Range<u64> = 0..fragment_rows;
        group.throughput(Throughput::Elements(num_rows));
        let id = BenchmarkId::from_parameter(num_rows);
        group.bench_with_input(id, &num_rows, |bencher, _| {
            bencher.iter(|| {
                // Everything below is the measured slow path, including the
                // materialization of the source range.
                let mut survivors = RowAddrTreeMap::from(segment.clone());
                survivors.mask(&allow);
                // Counting stands in for whatever the real consumer does
                // with the addresses (e.g. GroupingIterator).
                // SAFETY: NOTE(review) — presumably sound because a map built
                // from a sub-fragment range holds no Full entries; confirm
                // against `into_addr_iter`'s documented contract.
                let surviving_rows = unsafe { survivors.into_addr_iter() }.count();
                std::hint::black_box(surviving_rows);
            });
        });
    }
    group.finish();
}

/// One large run versus many small runs of equal total cardinality, built
/// with `insert_range`.
///
/// Group `insert_runs_constant_cardinality`. The total is fixed at 1M rows
/// and reported as the throughput. `single_run_1M` inserts `0..1M` in one
/// call; each `{k}_runs_1M_total` case inserts `k` runs of `1M / k` rows.
///
/// The stated expectation: a range-aware representation would cost
/// O(num_runs), making the single-run case roughly K times cheaper than the
/// K-run case, whereas the bitmap path costs about the same for all of them
/// because it is driven by row count.
fn bench_runs_vs_rows(c: &mut Criterion) {
    let total_rows: u64 = 1_000_000;
    let mut group = c.benchmark_group("insert_runs_constant_cardinality");

    group.throughput(Throughput::Elements(total_rows));
    group.bench_function("single_run_1M", |bencher| {
        bencher.iter(|| {
            let mut mask = RowAddrTreeMap::new();
            mask.insert_range(0..total_rows);
            std::hint::black_box(mask);
        });
    });

    for k in [10u64, 100, 1_000, 10_000] {
        let run_size = total_rows / k;
        // Runs start every 2 * run_size rows, leaving a gap as long as the
        // run itself, so the covered span is half full.
        let stride = run_size * 2;
        group.bench_function(format!("{k}_runs_1M_total"), |bencher| {
            bencher.iter(|| {
                let mut mask = RowAddrTreeMap::new();
                (0..k).map(|i| i * stride).for_each(|start| {
                    mask.insert_range(start..(start + run_size));
                });
                std::hint::black_box(mask);
            });
        });
    }
    group.finish();
}

/// Producer side of the new API: building a mask with a single `insert_run`.
///
/// Group `insert_run_single_run`, one benchmark per entry of `ROW_COUNTS`,
/// throughput reported in rows. Each iteration creates an empty
/// `RowAddrTreeMap` and inserts the inclusive run `0..=n-1` under key `0`
/// (presumably the fragment id — confirm against `insert_run`'s signature).
///
/// `insert_run` is intended to store the run as-is rather than expanding it
/// into bits; compare with `insert_range_single_run` to see the producer
/// savings.
fn bench_insert_run(c: &mut Criterion) {
    let mut group = c.benchmark_group("insert_run_single_run");
    for num_rows in ROW_COUNTS.iter().copied() {
        // Inclusive upper bound of the run. The cast is lossless: every
        // entry of `ROW_COUNTS` is at most 10_000_000.
        let last_row = (num_rows - 1) as u32;
        group.throughput(Throughput::Elements(num_rows));
        let id = BenchmarkId::from_parameter(num_rows);
        group.bench_with_input(id, &num_rows, |bencher, _| {
            bencher.iter(|| {
                let mut mask = RowAddrTreeMap::new();
                mask.insert_run(0, 0..=last_row);
                std::hint::black_box(mask);
            });
        });
    }
    group.finish();
}

/// Consumer side of the new API: walking a mask run by run with `iter_runs`.
///
/// Group `iter_runs_single_run`, one benchmark per entry of `ROW_COUNTS`,
/// throughput reported in rows. The map is built once, outside the timed
/// closure, with a single `insert_run`; each iteration counts the items
/// `iter_runs` yields.
///
/// As described for this API, maps built via `insert_run` iterate in
/// O(num_runs), while maps built via `insert_range` fall back to roaring's
/// `Iter::next_range`, which is O(num_run_containers). Compare with
/// `into_addr_iter_single_run` for the speedup callers see.
fn bench_iter_runs_consumer(c: &mut Criterion) {
    let mut group = c.benchmark_group("iter_runs_single_run");
    for num_rows in ROW_COUNTS.iter().copied() {
        let last_row = (num_rows - 1) as u32;
        let mut mask = RowAddrTreeMap::new();
        mask.insert_run(0, 0..=last_row);
        group.throughput(Throughput::Elements(num_rows));
        let id = BenchmarkId::from_parameter(num_rows);
        group.bench_with_input(id, &num_rows, |bencher, _| {
            bencher.iter(|| {
                let mut run_count: u64 = 0;
                // SAFETY: NOTE(review) — `iter_runs`'s contract is not
                // visible here; presumably it mirrors `into_addr_iter` and
                // requires that the map hold no Full entries, which is the
                // case for a map built from one `insert_run`. Confirm.
                for _run in unsafe { mask.iter_runs() } {
                    run_count += 1;
                }
                std::hint::black_box(run_count);
            });
        });
    }
    group.finish();
}

/// One large run versus many small runs of equal total cardinality, built
/// with `insert_run`.
///
/// Group `insert_run_constant_cardinality`, with the same layout and case
/// names as `insert_runs_constant_cardinality`: 1M total rows, either as one
/// run or as `k` runs of `1M / k` rows whose starts are `2 * run_size`
/// apart. All runs are inserted under key `0`.
///
/// The stated expectation is that `insert_run` scales with the number of
/// runs alone, while `insert_range` pays a per-bit cost.
fn bench_insert_run_many(c: &mut Criterion) {
    let total_rows: u64 = 1_000_000;
    let mut group = c.benchmark_group("insert_run_constant_cardinality");

    group.throughput(Throughput::Elements(total_rows));
    group.bench_function("single_run_1M", |bencher| {
        bencher.iter(|| {
            let mut mask = RowAddrTreeMap::new();
            mask.insert_run(0, 0..=(total_rows as u32 - 1));
            std::hint::black_box(mask);
        });
    });

    for k in [10u64, 100, 1_000, 10_000] {
        let run_size = total_rows / k;
        let stride = run_size * 2;
        group.bench_function(format!("{k}_runs_1M_total"), |bencher| {
            bencher.iter(|| {
                let mut mask = RowAddrTreeMap::new();
                (0..k).for_each(|i| {
                    // The largest start is below 2M, so the casts to u32
                    // cannot truncate.
                    let first = (i * stride) as u32;
                    let last = first + run_size as u32 - 1;
                    mask.insert_run(0, first..=last);
                });
                std::hint::black_box(mask);
            });
        });
    }
    group.finish();
}

// Registers every benchmark function in this file under the `benches` group
// and generates the `main` entry point. `harness = false` is set for this
// bench target in Cargo.toml, so Criterion supplies `main`.
criterion_group!(
benches,
bench_insert_range,
bench_insert_run,
bench_iter_addrs,
bench_iter_runs,
bench_iter_runs_consumer,
bench_intersect_ranges,
bench_range_to_ranges_round_trip,
bench_runs_vs_rows,
bench_insert_run_many,
);
criterion_main!(benches);
Loading
Loading