diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml new file mode 100644 index 0000000..db5d163 --- /dev/null +++ b/.github/workflows/Benchmark.yml @@ -0,0 +1,69 @@ +name: Benchmarks + +on: + pull_request: + paths: + - "src/**" + - "bench/**" + - "Project.toml" + - ".github/workflows/Benchmark.yml" + workflow_dispatch: + +# Only one benchmark run per PR at a time; cancel stale ones. +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + pull-requests: write + +jobs: + benchmark: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 # we need both PR head and base branch + - uses: julia-actions/setup-julia@v3 + with: + version: "1.11" + - uses: julia-actions/cache@v2 + - name: Install AirspeedVelocity + run: | + julia -e 'using Pkg; Pkg.add(PackageSpec(name="AirspeedVelocity", version="0.6"))' + # Resolve commit shas explicitly so AirspeedVelocity sees stable refs. + - name: Resolve refs + id: resolve + run: | + PR_SHA=$(git rev-parse HEAD) + BASE_SHA=$(git rev-parse origin/${{ github.base_ref }}) + echo "pr_sha=$PR_SHA" >> "$GITHUB_OUTPUT" + echo "base_sha=$BASE_SHA" >> "$GITHUB_OUTPUT" + - name: Run benchmark comparison + run: | + export JULIA_NUM_THREADS=1 + ~/.julia/bin/benchpkg FindFirstFunctions \ + --rev=${{ steps.resolve.outputs.base_sha }},${{ steps.resolve.outputs.pr_sha }} \ + --bench-on=${{ steps.resolve.outputs.pr_sha }} \ + --output-dir=results/ \ + --tune + - name: Render comparison table + run: | + ~/.julia/bin/benchpkgtable FindFirstFunctions \ + --rev=${{ steps.resolve.outputs.base_sha }},${{ steps.resolve.outputs.pr_sha }} \ + --input-dir=results/ \ + > benchmark_table.md + cat benchmark_table.md + - name: Post benchmark comment + if: github.event_name == 'pull_request' + uses: peter-evans/create-or-update-comment@v4 + with: + issue-number: ${{ github.event.pull_request.number }} + body-path: benchmark_table.md + edit-mode: replace + - name: Upload raw results + uses: actions/upload-artifact@v4 + with: + name: benchmark-results + path: results/ diff --git a/.gitignore b/.gitignore index 58780b8..821cab1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /Manifest.toml +bench/Manifest.toml docs/build docs/Manifest.toml docs/src/assets/Manifest.toml diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..53cef6a --- /dev/null +++ b/NEWS.md @@ -0,0 +1,464 @@ +# FindFirstFunctions.jl NEWS + +## 2.0.0 + +This is a major rewrite of the sorted-search API. The 1.x surface — a +collection of single-purpose, hint-flavoured function names — has been +replaced by a single strategy-dispatched API where the algorithm is chosen +at the call site (or by `Auto`) rather than baked into the function name. + +### Breaking changes — removed names + +The following 1.x functions are gone in 2.0. Each one has a single +canonical 2.x replacement: + +| 1.x | 2.x | +| --- | --- | +| `searchsortedfirstcorrelated(v, x, guess::Integer)` | `searchsortedfirst(BracketGallop(), v, x, guess)` | +| `searchsortedlastcorrelated(v, x, guess::Integer)` | `searchsortedlast(BracketGallop(), v, x, guess)` | +| `searchsortedfirstcorrelated(v, x, g::Guesser)` | `searchsortedfirst(GuesserHint(g), v, x)` | +| `searchsortedlastcorrelated(v, x, g::Guesser)` | `searchsortedlast(GuesserHint(g), v, x)` | +| `searchsortedfirstvec(v, qs)` | `searchsortedfirst!(buf, v, qs)` (caller-owned `buf`) | +| `searchsortedlastvec(v, qs)` | `searchsortedlast!(buf, v, qs)` (caller-owned `buf`) | + +The `*vec` migration shifts buffer ownership to the caller. The in-place +form lets callers reuse buffers across calls, which the allocating form +couldn't. For one-shot use the caller-side allocation is a single +`Vector{Int}(undef, length(qs))` per call site. + +### Breaking changes — made internal + +These helpers backed 1.x public names. In 2.0 they remain in the module as +implementation details of the strategy dispatch, but are no longer +documented or part of the public API: + + - `searchsortedfirstexp` — now backs the `ExpFromLeft` strategy. Use + `searchsortedfirst(ExpFromLeft(), v, x, lo)` instead. + - `bracketstrictlymontonic` — now backs the `BracketGallop` strategy. + Callers wanting a bracket-then-binary-search should use + `searchsortedlast(BracketGallop(), v, x, hint)` / + `searchsortedfirst(BracketGallop(), v, x, hint)`. + +### New: strategy-dispatched search API + +A single pair of generic functions covers every sorted-search algorithm in +the package: + +```julia +searchsortedfirst(strategy, v, x[, hint]; order = Base.Order.Forward) +searchsortedlast(strategy, v, x[, hint]; order = Base.Order.Forward) +``` + +`strategy` is a concrete subtype of `SearchStrategy`. The shipped +strategies are: + + - **`LinearScan`** — walks ±1 from the hint. Cheapest when the hint is + close to the answer; O(n) worst case. + - **`SIMDLinearScan`** — `LinearScan` with the forward walk lowered to + 8-wide SIMD chunks via custom LLVM IR. Specialized for + `DenseVector{Int64}` and `DenseVector{Float64}`; falls back to scalar + `LinearScan` for any other element type. Opt-in only — `Auto` does not + pick it. See `strategies.md` for the NaN / element-type caveats. + - **`BracketGallop`** — bidirectional exponential bracket around the + hint, then bounded binary search. Workhorse hinted strategy. O(1) when + the hint is close, never worse than ~2 log₂ n. + - **`ExpFromLeft`** — exponential search forward from a left-bound hint. + Five linear probes, then doubling, then bounded binary search. Default + `Auto` choice in the sparse-batched path. + - **`InterpolationSearch`** — linear-extrapolation guess refined with + binary search. O(1) per query on uniformly-spaced numeric data, + O(log n) otherwise. + - **`BinaryBracket`** — plain `Base.searchsortedlast` / + `Base.searchsortedfirst`. Used as the no-hint fallback by every other + strategy. + - **`GuesserHint(g::Guesser)`** — `BracketGallop` driven by a + `Guesser`'s integer guess, with the result cached back into the + `Guesser`. + - **`Auto`** — heuristic dispatcher; see "New: `Auto` heuristic" below. + +Strategies are zero-field singletons (except `GuesserHint`, which wraps a +`Guesser`, and `Auto`, which optionally carries a `SearchProperties` +cache). The dispatch is type-stable; pinning a strategy at a call site +costs nothing at runtime. + +### New: batched in-place API + +```julia +searchsortedfirst!(idx_out, v, queries; strategy = Auto(), order = ...) +searchsortedlast!(idx_out, v, queries; strategy = Auto(), order = ...) +``` + +Writes one index per element of `queries` into `idx_out` (which must be +the same length). If `queries` is sorted under `order`, each query's hint +is the previous result, so the total cost for sorted batches is +O(length(v) + length(queries)) under the typical `Auto` choice. If +`queries` is not sorted, the call falls back to per-element +`Base.searchsortedlast` / `Base.searchsortedfirst` with no hint. + +These methods are the replacement for the removed `searchsortedfirstvec` / +`searchsortedlastvec`. The caller owns the output buffer and is free to +reuse it across calls. + +### New: `Auto` heuristic + +`Auto` picks a strategy based on the calling context: + +**Per-query** (`searchsortedlast(Auto(), v, x[, hint])`): + + - No hint, or hint out of range → `BinaryBracket`. + - Hint in range, `length(v) ≤ 16` → `LinearScan`. + - Hint in range, `length(v) > 16` → `BracketGallop`. + +**Batched sorted** (`searchsortedlast!(out, v, queries; strategy = Auto())`) +chooses by the expected average gap in `v`'s index space between +consecutive query results. For numeric data the gap is estimated from the +span ratio `(queries[end] - queries[1]) / (v[end] - v[1])`, so dense-burst +queries clustered inside one segment of `v` are correctly recognized as +gap ≈ 0: + + - `gap ≤ 4` → `LinearScan`. + - `gap ≥ 8`, `length(v) ≥ 1024`, `length(queries) ≥ 2`, not skewed, and + a sampled-linearity probe accepts → `InterpolationSearch`. + - otherwise → `ExpFromLeft`. + +The sampled-linearity probe reads 11 elements (~25 ns) and accepts when +every interior point sits within 0.1% of the straight line through `v[1]` +and `v[end]`. The 0.1% tolerance is tight by design: at large `n` the +order-statistic variance of random-sorted data is small enough that a 5% +threshold would falsely pass on irregular data. + +Skew detection on the query distribution adds an additional gate: if the +median query is more than 20% off the midpoint of the query span (and +`m ≥ 10` so the median is meaningful), `Auto` picks `ExpFromLeft` even +on linear `v`, because consecutive queries land in the same region and +the previous-result hint is worth more than the linear-extrapolation +guess. Skew detection is gated on `m ≥ 10` to avoid the median sampling +variance dominating for small batches. + +The crossover constants (`_AUTO_LINEAR_THRESHOLD = 16`, +`_AUTO_BATCH_LINEAR_GAP = 4`, `_AUTO_INTERP_MIN_GAP = 8`, +`_AUTO_INTERP_MIN_N = 1024`, `_AUTO_INTERP_MIN_M = 2`, +`_AUTO_LINEAR_REL_TOLERANCE = 1e-3`) were tuned empirically by a regime +grid covering uniform / jittered / log-spaced / two-scale / random `v` +patterns crossed with dense / sparse / clustered / sorted-random query +patterns at vector lengths from 64 to 65536 and batch sizes from 1 to +4096. Across that grid `Auto` is within 1.2× of the per-cell optimum in +90% of cells. + +### New: `SearchProperties` cache for `Auto` + +For callers issuing many short batches against the same sorted vector +(interpolation-segment lookups being the obvious case), `Auto`'s per-call +linearity probe is redundant. The new `SearchProperties` struct caches +the probe result and `Auto(props)` consumes it instead of re-probing: + +```julia +v = collect(0.0:0.001:100.0) +props = SearchProperties(v) # run probes once: ~25 ns + (Float-only) O(n) NaN scan +strat = Auto(props) # Auto holding the cached facts + +queries = sort!(rand(8) .* 100.0) +out = Vector{Int}(undef, length(queries)) +searchsortedlast!(out, v, queries; strategy = strat) +``` + +`SearchProperties` is `isbits` — it travels in registers and copies are +free. `Auto(props)` is itself zero-allocation; the resulting `Auto` is a +single concrete struct, not a parametric type. + +Currently consumed: `props.is_linear` (replaces `_sampled_looks_linear` +in the batched dispatch). The `has_props` and `has_nan` fields are +populated by `SearchProperties(v)` for forward compatibility; the latter +will unlock `SIMDLinearScan` participation in `Auto` once the eligibility +gate is wired in. + +The cache is not invalidated automatically — the caller must reconstruct +`SearchProperties(v)` if `v` mutates. A stale cache is correctness- +preserving (the chosen `InterpolationSearch` falls through to +`BracketGallop` from a bad guess — slow but still O(log n)), so the +worst case is a performance regression, not a wrong answer. + +### New: `SIMDLinearScan` + +A SIMD variant of `LinearScan` that lowers the forward walk past the hint +to 8-wide SIMD chunks via custom LLVM IR. The same scaffolding that backs +`_findfirstequal` (load 8 lanes, vector compare, bitmask, `cttz` on the +mask) is reused for the four predicates needed by positional search: + + - `_simd_first_gt` / `_simd_first_ge` for `Int64` (using `icmp sgt` / + `icmp sge`). + - `_simd_first_gt` / `_simd_first_ge` for `Float64` (using `fcmp ogt` / + `fcmp oge`). + +The IR is generated from a shared template `_simd_scan_ir(t, pred)` +parameterised on LLVM element type and compare predicate. + +Caveats (documented in detail in `strategies.md`): + + - **Element types**: `DenseVector{Int64}` and `DenseVector{Float64}` + only. Other element types (including `Int32`, `UInt64`, `Float32`, + `Date`, `String`) hit the scalar `LinearScan` fallback path. The + dispatch is static, so the fallback costs nothing per call. + - **NaN**: ordered float compares (`fcmp o*`) return false for NaN + operands, so a NaN in `v` is silently skipped by the SIMD scan. + Sorted-Float64 with NaN isn't well-defined under any total order + anyway, so this is consistent with `Base.searchsortedlast` on such + vectors. + - **Forward order only**: non-`Forward` orderings fall back to scalar + `LinearScan`. + - **No hint**: falls back to `BinaryBracket`. + +`Auto` does **not** pick `SIMDLinearScan`. It is opt-in: the regime where +it strictly beats `LinearScan` (long forward walks on Int64/Float64 +DenseVectors) overlaps with the regime where `Auto` already prefers +`BracketGallop` or `ExpFromLeft`. Pin it explicitly when you have a +workload that wants a long forward scan and you know the element type. + +### Documentation restructure + +The single `index.md` from 1.x has been split into five topical pages: + + - **Home** (`index.md`): overview and quick example. + - **Interface and extension rules** (`interface.md`): the public API + surface, the contract a `SearchStrategy` subtype must satisfy, and + how to add a new one with a correctness-check pattern. Notes that + `Auto`'s decision tree is not externally extensible — new strategies + do not auto-register with `Auto`. + - **Search strategies** (`strategies.md`): catalog of every shipped + strategy with a chooser table (best case / worst case / hint usage), + per-strategy notes, the `SIMDLinearScan` caveats, and the + "Equality search" appendix linking to `findfirstequal` / + `findfirstsortedequal` (which are a deliberately-separate API + because their return type differs from positional search). + - **Guessers** (`guessers.md`): the `Guesser` type, its + linear-extrapolation vs. cached-previous-result behaviour, the + `GuesserHint` strategy adapter, and explicit guidance on when *not* + to use a `Guesser`. + - **Auto: heuristics and benchmarks** (`auto.md`): full `Auto` + decision tree for both per-query and batched callers, every + crossover constant with justification, the `SearchProperties` cache + integration, and a self-contained benchmark script that reproduces + the regime-grid comparison. + +### Internal: shared SIMD scan scaffolding + +The LLVM IR pattern used by `_findfirstequal` (load 8 lanes, vector +compare, `cttz` on the bitmask) is now generated by a shared template +`_simd_scan_ir(t, pred)`. `FFE_IR` (equality scan, used by +`findfirstequal` and `findfirstsortedequal`) and the four +`_SIMD_*_IR`s (positional compares for `SIMDLinearScan`) all flow from +that template. Adding a new predicate is a one-line change. + +### New: equality search through the strategy framework + +`findequal(strategy, v, x[, hint])` builds an equality variant on top of +the strategy dispatch. The return type is `Int` (not `Union{Int, Nothing}`); +"not found" is signalled by the sentinel `firstindex(v) - 1` (= `0` on +1-based vectors), matching the convention `Base.searchsortedlast` already +uses for "x precedes all of v". + + - Most strategies are handled generically: + `findequal(strategy, v, x[, hint])` runs + `searchsortedfirst(strategy, v, x[, hint])` and checks whether the + candidate index actually equals `x`. This means + `findequal(BracketGallop(), v, x, hint)`, + `findequal(SIMDLinearScan(), v, x, hint)`, + `findequal(GuesserHint(g), v, x)`, `findequal(Auto(), v, x)`, and + `findequal(BinaryBracket(), v, x)` all work without per-strategy + glue. + - `BisectThenSIMD <: SearchStrategy` is a new strategy that, for + `DenseVector{Int64}`, dispatches `findequal` straight into the + bisect-then-SIMD-equality-scan algorithm that backs + `findfirstsortedequal`. For other element types, falls back to + `BinaryBracket + post-check`. In positional dispatch + (`searchsortedfirst` / `searchsortedlast`) it delegates to + `BinaryBracket` — the bisect-then-equality-scan algorithm can't + answer the positional "where would `x` insert?" question. + +### Bug fix: `BracketGallop`/`InterpolationSearch`/`ExpFromLeft`/`findfirstsortedequal` with duplicates + +Four pre-existing functions returned the wrong index when `v` contained +duplicates of the queried value and the hint or bisection midpoint +landed inside a run of duplicates. All four are fixed in 2.0: + + - `searchsortedfirst(BracketGallop(), v, x, hint)` previously galloped + rightward when `v[hint] == x`, returning the rightmost duplicate + instead of the first. Fixed by adding the companion + `bracketstrictlymontonic_first` that gallops leftward when + `v[hint] >= x`. + - `searchsortedfirst(InterpolationSearch(), v, x)` chains into + `BracketGallop`, so the same bug propagated and is fixed by the + above. + - `searchsortedfirst(ExpFromLeft(), v, x, hint)` previously + exponential-searched from `hint` when `v[hint] == x`, missing + earlier duplicates. Fixed by falling back to a full search whenever + `v[hint] >= x`. + - `findfirstsortedequal(var, vars)` bisected with the predicate + `vars[mid] <= var`, which walked the offset past the first + duplicate when `vars[mid] == var`. Fixed by tightening the predicate + to `<` and updating the window-shrink rule to include the midpoint + when the comparison is false. The fast-path LLVM IR branch is + replaced by plain `ifelse` (Julia compiles it to the same + branchless `select` modulo the `!unpredictable` metadata, which had + minimal observable effect). + +The fix is exercised by the new `findequal` strategy-parity tests on +randomized vectors with frequent duplicates (Int64 in [-50, 50] over +vectors up to length 256, 2000 trials per strategy across all shipped +strategies). + +### Equality search (`findfirstequal`, `findfirstsortedequal`) + +Both names continue to exist in 2.0, returning `Union{Int, Nothing}` as +before. Docstrings refreshed to point at the new +[`findequal`](@ref FindFirstFunctions.findequal) wrapper as the +strategy-framework-compatible alternative. Documentation moved out of +`strategies.md` into a dedicated `equality.md` page since these functions +do not match the strategy-dispatch contract (their return type and +question are different). + +### Exports + +2.0 exports the public API surface (previously the package exported +nothing, requiring `FindFirstFunctions.LinearScan()` qualification): + + - `SearchStrategy`, every concrete strategy + (`LinearScan`, `SIMDLinearScan`, `BracketGallop`, `ExpFromLeft`, + `InterpolationSearch`, `BinaryBracket`, `BisectThenSIMD`, + `GuesserHint`, `Auto`), and the `SearchProperties` cache. + - `Guesser` and `looks_linear`. + - The batched FFF-defined names `searchsortedfirst!` and + `searchsortedlast!` (the non-bang `searchsortedfirst` / + `searchsortedlast` are `Base` extensions, available via `Base`). + - The equality routines `findequal`, `findfirstequal`, + `findfirstsortedequal`. + +`using FindFirstFunctions` is now sufficient to access the full public +API. Downstream code that previously qualified every call (most of the +SciML ecosystem) continues to work — the qualified names still resolve. + +### Auto retuning with SIMDLinearScan integration + +`Auto`'s batched decision tree has been retuned based on a 1080-cell +benchmark sweep covering 5 `v` patterns × 4 query patterns × 5 `n` sizes × +6 `m` sizes × 2 element types. The previous tree fell out of the bench +sweep with median 1.18× / p95 2.09× / max 2.78× slack against the per-cell +optimum; the retuned tree comes in at median 1.04× / p95 1.38× / +max 2.18×. + +New branches and constants: + + - `SIMDLinearScan` is now dispatched by `Auto` in the medium-gap regime + (`gap ∈ (4, _auto_simd_gap_max(v)]`) when `v` is `DenseVector{Int64}` + or `DenseVector{Float64}`. `_auto_simd_gap_max` is 64 for both eltypes. + For Float64 the dispatch consults `SearchProperties.has_nan` if + available; otherwise no-NaN is assumed, consistent with how + `Base.searchsortedlast` already trusts the input is sorted. + - `BracketGallop` is preferred over `ExpFromLeft` at `gap ≥ 16` (new + constant `_AUTO_GALLOP_GAP_MIN`). The five up-front linear probes of + `ExpFromLeft` are guaranteed to miss once the answer is more than five + elements past the previous-result hint, so the doubling-from-`hint` + walk of `BracketGallop` is strictly faster at large gaps. + - Tiered linearity probe for `InterpolationSearch`. The strict + `_AUTO_LINEAR_REL_TOLERANCE = 1e-3` still gates the + `_AUTO_INTERP_MIN_GAP ≤ gap < _AUTO_INTERP_LOOSE_GAP` (8 to 256) range + — only truly uniform data passes. For `gap ≥ _AUTO_INTERP_LOOSE_GAP` + (256), the loose `_AUTO_LINEAR_LOOSE_TOLERANCE = 5e-2` applies, which + accepts approximately linear data (sorted random, jittered) where the + O(√n)/n order-statistic deviation is well below 5 %. `InterpolationSearch` + still loses on log-spaced and two-scale at any gap, and the strict tier + catches those. + +Bug fix: `_estimate_avg_gap` no longer falls back to `n ÷ m` when the +skew flag is set. The fallback caused `SIMDLinearScan` to be picked for +tightly-clustered queries (span_q ≈ 0) where `LinearScan`'s scalar walk +is 5× faster. The skew flag now serves its intended purpose as a binary +InterpolationSearch-unsuitability signal, while the actual gap value is +always the span-based estimate. + +Reproducibility: the full sweep is checked in at `bench/auto_sweep.jl` +with an analysis helper at `bench/analyze.jl`. See `auto.md` for the +decision tree, the per-regime winner distribution, and how to run the +sweep locally. + +### New (opt-in): BitInterpolationSearch for log-spaced Float64 data + +`BitInterpolationSearch` is a variant of `InterpolationSearch` that +reinterprets a positive `DenseVector{Float64}` as `DenseVector{UInt64}` +before computing the extrapolation guess. The IEEE bit pattern is +monotonically increasing with the float value (for positive Float64) and +approximately linear in array index when the underlying data is +log-spaced (geometric). On such data the bit-domain guess can be far +better than the float-domain guess that `InterpolationSearch` would +compute. + +A targeted bench sweep at `bench/bitinterp_sweep.jl` covers 9 v patterns +× 4 q patterns × 6 n sizes (up to 2²⁰) × 7 m sizes, with v patterns +specifically chosen to probe BitInterp's regime: `logspaced` (1 to 10⁶), +`logspaced_wide` (10⁻³ to 10¹⁵), `geometric_dense` (geometric spanning +10⁶), `geometric_sparse` (geometric spanning 10¹²), `power2`, `sqrt`, +`two_decade`, `jittered_log`, and `uniform` (as the negative control). + +Result over 1404 cells: BitInterp wins outright in 59 cells (4.2%) and +sits within 10% of the per-cell best in 75 cells (5.3%). The wins +concentrate in `logspaced_wide` / `logspaced` / `geometric_*` / +`jittered_log` at small m (= 4) and large n (≥ 16384). Margins range +from 1.0× (tie) to 1.43× (`logspaced_wide log_grid n=2²⁰ m=4`). On +non-log-spaced data (`uniform`, `power2`, `sqrt`, `two_decade`) +BitInterp loses — the bit-pattern guess is worse than the float-domain +guess when the data isn't geometric. + +**`Auto` does not pick BitInterp.** The wins are real but narrow +(small batches, very large n, true log-spacing), and adding the dispatch +overhead to Auto's hot path would penalize the much larger set of +non-log-spaced workloads. The strategy is exported as `BitInterpolationSearch` +for callers who know their data is log-spaced and want to pin it. + +Constraints: + - `DenseVector{Float64}` only; non-Float64 dense eltypes fall back to + plain `InterpolationSearch`. + - Requires `v[1] > 0`, `x > 0`, and both finite. Subnormal / + non-finite Float64 bit patterns are not monotonic with float value + under raw reinterpret, so the strategy falls back to `BinaryBracket` + in those cases. + - Forward ordering only. + +### Cleanup: typo fix, FFE_IR unification, tolerance kwarg + + - Internal helper `bracketstrictlymontonic` renamed to + `bracketstrictlymonotonic` (and the companion `_first` variant). + Internal-only — no downstream impact. + - The `FFE_IR` SIMD-equality IR literal is now generated by + `_simd_scan_ir("i64", "eq")` instead of duplicating ~60 lines of + inline LLVM IR. The same template produces the four `>`/`>=` + variants for `SIMDLinearScan`, so all five SIMD primitives share a + single source of truth. + - `SearchProperties(v; linear_tolerance = 1e-3)` exposes the + sampled-linearity probe's tolerance as a kwarg, matching `Guesser`'s + `looks_linear_threshold`. Loosen (e.g. `1e-2`) to widen the regime + where `Auto(props)` picks `InterpolationSearch`; tighten (e.g. + `1e-4`) to be more conservative. Default unchanged at `1e-3`. + +`findequal`'s docstring now explicitly documents the sentinel value +`firstindex(v) - 1` and its behaviour on OffsetArrays. + +### Compatibility + + - **Julia compat**: unchanged from 1.x — `julia = "1.10"`. + - **Downstream PRs**: SciML packages using the removed names need + companion PRs. The first one + [SciML/DataInterpolations.jl#529](https://github.com/SciML/DataInterpolations.jl/pull/529) + is the migration template: drop the legacy imports, route the + `Integer`-hint path through `searchsortedfirst(BracketGallop(), …)` + and the `Guesser` path through `searchsortedfirst(GuesserHint(g), …)`. + +### Test coverage + +47100 tests pass across the strategy dispatch, the batched API, the +`Auto` heuristic on a regime sweep, `SIMDLinearScan` randomized fuzz +(10000 Int64 + 10000 Float64 cases against `Base`), edge cases (empty, +single-element, duplicates, out-of-range hints, x outside the vector +range), fallback paths (Int32, Float32, String, no-hint, reverse +order), and `SearchProperties` cache correctness (output equivalence +against the un-cached path, `isbits` guarantee, behaviour under lying +cache). diff --git a/Project.toml b/Project.toml index e6437c0..655dfb1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "FindFirstFunctions" uuid = "64ca27bc-2ba2-4a57-88aa-44e436879224" -version = "1.8.0" +version = "2.0.0" authors = ["Chris Elrod and contributors"] [deps] @@ -10,13 +10,15 @@ PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" Pkg = "1.10" PrecompileTools = "1" SafeTestsets = "0.1" +StableRNGs = "1" Test = "1.10" julia = "1.10" [extras] Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Pkg", "SafeTestsets", "Test"] +test = ["Pkg", "SafeTestsets", "StableRNGs", "Test"] diff --git a/bench/.gitignore b/bench/.gitignore new file mode 100644 index 0000000..99b8e25 --- /dev/null +++ b/bench/.gitignore @@ -0,0 +1,2 @@ +results.csv +Manifest.toml diff --git a/bench/Project.toml b/bench/Project.toml new file mode 100644 index 0000000..db3a12b --- /dev/null +++ b/bench/Project.toml @@ -0,0 +1,6 @@ +[deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +FindFirstFunctions = "64ca27bc-2ba2-4a57-88aa-44e436879224" +PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/bench/analyze.jl b/bench/analyze.jl new file mode 100644 index 0000000..b050678 --- /dev/null +++ b/bench/analyze.jl @@ -0,0 +1,146 @@ +using DelimitedFiles, Statistics, Printf + +const CSV = joinpath(@__DIR__, "results.csv") + +# Read raw data +raw, header = readdlm(CSV, ','; header = true) +header = vec(string.(header)) +function col(name) + j = findfirst(==(name), header) + j === nothing && error("no column $name") + return j +end + +const STRATS = ["LinearScan", "SIMDLinearScan", "BracketGallop", "ExpFromLeft", "InterpolationSearch"] + +function row_winner(row) + times = Float64[parse(Float64, string(row[col(s)])) for s in STRATS] + j = argmin(times) + return STRATS[j], times[j] +end + +function ratio_to(row, strat_name) + times = Float64[parse(Float64, string(row[col(s)])) for s in STRATS] + best = minimum(times) + return parse(Float64, string(row[col(strat_name)])) / best +end + +println("==> Where does SIMDLinearScan win?") +simd_wins = [] +for i in axes(raw, 1) + local row = raw[i, :] + local winner, _ = row_winner(row) + if winner == "SIMDLinearScan" + push!( + simd_wins, ( + eltype = string(row[col("eltype")]), + v = string(row[col("v_kind")]), + q = string(row[col("q_kind")]), + n = parse(Int, string(row[col("n")])), + m = parse(Int, string(row[col("m")])), + ) + ) + end +end +n_simd_wins = length(simd_wins) +println(" SIMDLinearScan wins in $n_simd_wins cells") +println() + +# Tabulate where SIMD wins by m (gap proxy) +println("==> SIMDLinearScan wins by m (proxy for batched-gap regime):") +by_m = Dict{Int, Int}() +for c in simd_wins + by_m[c.m] = get(by_m, c.m, 0) + 1 +end +for m in sort(collect(keys(by_m))) + @printf(" m=%5d: %d cells\n", m, by_m[m]) +end +println() + +# By eltype +println("==> SIMDLinearScan wins by eltype:") +by_eltype = Dict{String, Int}() +for c in simd_wins + by_eltype[c.eltype] = get(by_eltype, c.eltype, 0) + 1 +end +for k in sort(collect(keys(by_eltype))) + println(" $k: $(by_eltype[k]) cells") +end +println() + +# Now show: for sorted-batched cells where SIMDLinearScan wins, what's the +# ratio of LinearScan/ExpFromLeft to SIMDLinearScan? This tells us the +# magnitude of the improvement. +println("==> SIMDLinearScan win margin over LinearScan and ExpFromLeft:") +println(" (Higher = bigger SIMD speedup over the second-best Auto candidate)") +margins = [] +for i in axes(raw, 1) + row = raw[i, :] + winner, best = row_winner(row) + if winner == "SIMDLinearScan" + t_simd = best + t_lin = parse(Float64, string(row[col("LinearScan")])) + t_exp = parse(Float64, string(row[col("ExpFromLeft")])) + push!( + margins, ( + ratio_lin = t_lin / t_simd, + ratio_exp = t_exp / t_simd, + n = parse(Int, string(row[col("n")])), + m = parse(Int, string(row[col("m")])), + ) + ) + end +end +println(" SIMD vs LinearScan: median $(median(m.ratio_lin for m in margins))x, max $(maximum(m.ratio_lin for m in margins))x") +println(" SIMD vs ExpFromLeft: median $(median(m.ratio_exp for m in margins))x, max $(maximum(m.ratio_exp for m in margins))x") +println() + +# What does Auto pick in the cells where SIMDLinearScan would win? +# Find the m, n, gap regime where SIMDLinearScan beats LinearScan AND ExpFromLeft. +println("==> Best regime for SIMDLinearScan (cells where SIMD wins by >20%):") +significant_simd_wins = filter(c -> c.ratio_lin > 1.2 && c.ratio_exp > 1.2, margins) +println(" $(length(significant_simd_wins)) cells where SIMD beats both LinearScan and ExpFromLeft by >20%") +n_by_m_n = Dict{Tuple{Int, Int}, Int}() +for c in significant_simd_wins + n_by_m_n[(c.n, c.m)] = get(n_by_m_n, (c.n, c.m), 0) + 1 +end +for (n, m) in sort(collect(keys(n_by_m_n))) + @printf(" n=%5d m=%5d: %d cells\n", n, m, n_by_m_n[(n, m)]) +end +println() + +# Compute: in each cell, the GAP. gap = n * span(queries)/span(v) / m +# (the same heuristic Auto uses). Use n/m as a rough proxy since exact gap +# depends on the query distribution which isn't recoverable from CSV alone. +println("==> n/m ratio for SIMD-winning cells (rough gap proxy):") +nm_buckets = Dict{Int, Int}() +for c in simd_wins + bucket = c.n ÷ c.m + # Round to a power-of-2 bucket + log_bucket = bucket == 0 ? 0 : floor(Int, log2(bucket)) + nm_buckets[log_bucket] = get(nm_buckets, log_bucket, 0) + 1 +end +for b in sort(collect(keys(nm_buckets))) + @printf(" n/m in [2^%d, 2^%d): %d cells\n", b, b + 1, nm_buckets[b]) +end +println() + +println("==> What strategy does Auto pick in cells where SIMD would win?") +auto_picks_when_simd = Dict{String, Int}() +for i in axes(raw, 1) + row = raw[i, :] + winner, _ = row_winner(row) + if winner == "SIMDLinearScan" + # We don't know Auto's pick from the CSV, but we know Auto's time. + # The closest strategy time to Auto's time tells us the pick. + t_auto = parse(Float64, string(row[col("Auto")])) + candidates = [(s, parse(Float64, string(row[col(s)]))) for s in STRATS] + # Pick the strategy whose time is closest to Auto's + # (within 20% — heuristic). + closest = argmin(c -> abs(c[2] - t_auto), candidates) + auto_picks_when_simd[closest[1]] = get(auto_picks_when_simd, closest[1], 0) + 1 + end +end +for (s, n) in sort(collect(pairs(auto_picks_when_simd)); by = x -> -x[2]) + println(" Auto -> $s: $n cells (out of $(length(simd_wins)))") +end diff --git a/bench/auto_sweep.jl b/bench/auto_sweep.jl new file mode 100644 index 0000000..dcd41af --- /dev/null +++ b/bench/auto_sweep.jl @@ -0,0 +1,279 @@ +using FindFirstFunctions, StableRNGs, BenchmarkTools, Statistics, Printf + +const F = FindFirstFunctions + +# -------------------------------------------------------------------------- +# Strategy registry — every strategy that competes in the sorted-batched +# regime. BinaryBracket is excluded from the per-cell winner pool because it +# never wins under any reasonable workload (no hint = full binary search +# per query); it's only useful as a fallback. Auto's actual choices are +# LinearScan / SIMDLinearScan / ExpFromLeft / InterpolationSearch. +# -------------------------------------------------------------------------- +const STRATS = [ + ("LinearScan", F.LinearScan()), + ("SIMDLinearScan", F.SIMDLinearScan()), + ("BracketGallop", F.BracketGallop()), + ("ExpFromLeft", F.ExpFromLeft()), + ("InterpolationSearch", F.InterpolationSearch()), +] + +# -------------------------------------------------------------------------- +# v patterns (sorted vectors of length n) +# -------------------------------------------------------------------------- +function make_v(::Val{:uniform}, ::Type{T}, n, seed) where {T <: AbstractFloat} + return collect(T, range(zero(T), T(n); length = n)) +end +function make_v(::Val{:uniform}, ::Type{T}, n, seed) where {T <: Integer} + return collect(T, T(1):T(n)) +end +function make_v(::Val{:jittered}, ::Type{T}, n, seed) where {T <: AbstractFloat} + rng = StableRNG(seed) + base = collect(range(0.0, Float64(n); length = n)) + return convert(Vector{T}, sort!(base .+ 0.1 .* (rand(rng, n) .- 0.5))) +end +function make_v(::Val{:jittered}, ::Type{T}, n, seed) where {T <: Integer} + rng = StableRNG(seed) + raw = sort!(unique!(rand(rng, T(1):T(2n), n))) + return convert(Vector{T}, raw) +end +function make_v(::Val{:logspaced}, ::Type{T}, n, seed) where {T <: AbstractFloat} + return collect(T, exp.(range(0.0, log(1.0e6); length = n))) +end +function make_v(::Val{:logspaced}, ::Type{T}, n, seed) where {T <: Integer} + raw = sort!(unique!(round.(T, exp.(range(0.0, log(Float64(10n)); length = n))))) + return convert(Vector{T}, raw) +end +function make_v(::Val{:twoscale}, ::Type{T}, n, seed) where {T <: AbstractFloat} + h = n ÷ 2 + return convert( + Vector{T}, + sort!(vcat(range(0.0, 0.1n; length = h), range(0.1n, Float64(n); length = n - h))) + ) +end +function make_v(::Val{:twoscale}, ::Type{T}, n, seed) where {T <: Integer} + h = n ÷ 2 + a = round.(T, range(1.0, 0.1n; length = h)) + b = round.(T, range(0.1n + 1, Float64(n); length = n - h)) + return convert(Vector{T}, sort!(vcat(a, b))) +end +function make_v(::Val{:random_sorted}, ::Type{T}, n, seed) where {T <: AbstractFloat} + return convert(Vector{T}, sort!(rand(StableRNG(seed), n) .* n)) +end +function make_v(::Val{:random_sorted}, ::Type{T}, n, seed) where {T <: Integer} + return convert(Vector{T}, sort!(rand(StableRNG(seed), T(1):T(10n), n))) +end + +# -------------------------------------------------------------------------- +# query patterns +# -------------------------------------------------------------------------- +function _to_eltype(v, xs) + T = eltype(v) + return T <: Integer ? convert(Vector{T}, round.(T, xs)) : convert(Vector{T}, xs) +end +function make_q(::Val{:dense}, v, m, seed) + raw = collect(range(Float64(first(v)), Float64(last(v)); length = m)) + return _to_eltype(v, raw) +end +function make_q(::Val{:sparse}, v, m, seed) + rng = StableRNG(seed) + span = Float64(last(v)) - Float64(first(v)) + raw = sort!(Float64(first(v)) .+ rand(rng, m) .* span) + return _to_eltype(v, raw) +end +function make_q(::Val{:clustered}, v, m, seed) + rng = StableRNG(seed) + j = max(1, length(v) ÷ 4) + lo = Float64(v[j]) + hi = Float64(v[min(j + 1, length(v))]) + span = hi - lo + raw = sort!(lo .+ rand(rng, m) .* max(span, 1.0)) + return _to_eltype(v, raw) +end +function make_q(::Val{:sorted_random}, v, m, seed) + rng = StableRNG(seed) + span = Float64(last(v)) - Float64(first(v)) + raw = sort!(Float64(first(v)) .+ rand(rng, m) .* span) + return _to_eltype(v, raw) +end + +# -------------------------------------------------------------------------- +# Bench a single (v, queries, strategy) cell. Returns time in nanoseconds. +# Uses a fixed-repeat loop to avoid BenchmarkTools per-call overhead, which +# dominates fast strategies on small m. +# -------------------------------------------------------------------------- +function bench_one(v, queries, strat, out, reps::Int = 5) + # Warm up + F.searchsortedlast!(out, v, queries; strategy = strat) + # Measure + best = typemax(Float64) + for _ in 1:reps + t = @elapsed F.searchsortedlast!(out, v, queries; strategy = strat) + best = min(best, t) + end + return best * 1.0e9 / length(queries) # ns per query +end + +# -------------------------------------------------------------------------- +# Full sweep +# -------------------------------------------------------------------------- +function run_sweep(; + v_kinds = (:uniform, :jittered, :logspaced, :twoscale, :random_sorted), + q_kinds = (:dense, :sparse, :clustered, :sorted_random), + ns = (256, 1024, 4096, 16_384, 65_536), + ms = (4, 16, 64, 256, 1024, 4096), + eltypes = (Int64, Float64), + seed = 2026, + ) + rows = [] + cell_idx = 0 + n_cells = length(v_kinds) * length(q_kinds) * length(ns) * length(ms) * length(eltypes) + + for T in eltypes, v_kind in v_kinds, q_kind in q_kinds, n in ns, m in ms + m > n && continue + cell_idx += 1 + v = make_v(Val(v_kind), T, n, seed) + q = make_q(Val(q_kind), v, m, seed + 1) + out = Vector{Int}(undef, m) + + # Build SearchProperties for the cached-Auto comparison + props = F.SearchProperties(v) + + times = Dict{String, Float64}() + for (name, strat) in STRATS + t = bench_one(v, q, strat, out) + times[name] = t + end + # Auto (un-cached) and Auto (cached) + times["Auto"] = bench_one(v, q, F.Auto(), out) + times["Auto+props"] = bench_one(v, q, F.Auto(props), out) + + push!( + rows, ( + eltype = string(T), + v_kind = string(v_kind), + q_kind = string(q_kind), + n = n, + m = m, + times = times, + ) + ) + + if cell_idx % 25 == 0 + best_explicit = minimum(times[s] for s in first.(STRATS)) + @printf( + " [%4d/%4d] %s/%s/%s n=%d m=%d best=%.0f ns/q Auto=%.0f Auto+p=%.0f\n", + cell_idx, n_cells, T, v_kind, q_kind, n, m, + best_explicit, times["Auto"], times["Auto+props"] + ) + end + end + return rows +end + +# -------------------------------------------------------------------------- +# Analyze: per-cell winner, Auto slack vs best +# -------------------------------------------------------------------------- +function analyze(rows) + strat_names = first.(STRATS) + summary = Dict{String, Int}() + auto_slacks = Float64[] + auto_cached_slacks = Float64[] + + for r in rows + # Find the per-cell winner among the explicit strategies. + per_cell = [(s, r.times[s]) for s in strat_names] + best_t = minimum(p[2] for p in per_cell) + best_name = first(per_cell[argmin([p[2] for p in per_cell])]) + summary[best_name] = get(summary, best_name, 0) + 1 + push!(auto_slacks, r.times["Auto"] / best_t) + push!(auto_cached_slacks, r.times["Auto+props"] / best_t) + end + + println() + println("Per-cell winners (across $(length(rows)) cells):") + for name in strat_names + n = get(summary, name, 0) + @printf(" %-22s %5d cells (%5.1f%%)\n", name, n, 100 * n / length(rows)) + end + + println() + @printf("Auto slack (Auto-time / per-cell-best-time):\n") + @printf( + " median %.2fx, mean %.2fx, p90 %.2fx, p95 %.2fx, max %.2fx\n", + median(auto_slacks), mean(auto_slacks), + quantile(auto_slacks, 0.9), quantile(auto_slacks, 0.95), + maximum(auto_slacks) + ) + @printf("Auto(props) slack:\n") + @printf( + " median %.2fx, mean %.2fx, p90 %.2fx, p95 %.2fx, max %.2fx\n", + median(auto_cached_slacks), mean(auto_cached_slacks), + quantile(auto_cached_slacks, 0.9), quantile(auto_cached_slacks, 0.95), + maximum(auto_cached_slacks) + ) + + return summary, auto_slacks, auto_cached_slacks +end + +function print_worst_cells(rows, k = 20) + strat_names = first.(STRATS) + println() + println("Worst Auto-slack cells (current Auto picks strategy that's much slower than optimal):") + @printf( + " %-7s %-15s %-15s %6s %6s %-20s slack(Auto) slack(Auto+p)\n", + "eltype", "v_kind", "q_kind", "n", "m", "best (time)" + ) + scored = [ + (r, r.times["Auto"] / minimum(r.times[s] for s in strat_names)) + for r in rows + ] + sort!(scored; by = x -> -x[2]) + for (r, slack) in scored[1:min(k, length(scored))] + best_t = minimum(r.times[s] for s in strat_names) + best_name = strat_names[argmin([r.times[s] for s in strat_names])] + slack_cached = r.times["Auto+props"] / best_t + @printf( + " %-7s %-15s %-15s %6d %6d %-15s %.0fns %5.2fx %5.2fx\n", + r.eltype, r.v_kind, r.q_kind, r.n, r.m, best_name, best_t, + slack, slack_cached + ) + end + return +end + +println("FindFirstFunctions Auto algorithm benchmark sweep") +println("="^70) +println("Strategies:") +for (name, _) in STRATS + println(" ", name) +end +println("Plus: Auto(), Auto(SearchProperties(v))") +println() +println("Starting sweep...") +println() + +@time rows = run_sweep() + +println() +println("Sweep complete.") +println() + +analyze(rows) +print_worst_cells(rows, 30) + +open(joinpath(@__DIR__, "results.csv"), "w") do io + header = ["eltype", "v_kind", "q_kind", "n", "m"] + append!(header, first.(STRATS)) + push!(header, "Auto", "Auto+props") + println(io, join(header, ",")) + for r in rows + cols = String[r.eltype, r.v_kind, r.q_kind, string(r.n), string(r.m)] + for s in first.(STRATS) + push!(cols, string(r.times[s])) + end + push!(cols, string(r.times["Auto"])) + push!(cols, string(r.times["Auto+props"])) + println(io, join(cols, ",")) + end +end +println("\nRaw data: bench/results.csv") diff --git a/bench/bitinterp_sweep.jl b/bench/bitinterp_sweep.jl new file mode 100644 index 0000000..234bb0f --- /dev/null +++ b/bench/bitinterp_sweep.jl @@ -0,0 +1,226 @@ +using FindFirstFunctions, StableRNGs, BenchmarkTools, Statistics, Printf + +const F = FindFirstFunctions + +# Strategies under test. BitInterp is the one we're vetting; the others are +# the alternatives `Auto` might pick or that a user might pin. +const STRATS = [ + ("BitInterp", F.BitInterpolationSearch()), + ("Interp", F.InterpolationSearch()), + ("Bracket", F.BracketGallop()), + ("Exp", F.ExpFromLeft()), + ("SIMD", F.SIMDLinearScan()), + ("Auto", F.Auto()), +] + +# Data shapes covering the regimes BitInterp could conceivably win. +function build_v(::Val{:uniform}, n, seed) + return collect(range(1.0, Float64(n); length = n)) +end +function build_v(::Val{:logspaced}, n, seed) + return collect(exp.(range(0.0, log(1.0e6); length = n))) +end +function build_v(::Val{:logspaced_wide}, n, seed) + # 10⁻³ to 10¹⁵ — ~18 decades, like a particle-physics interpolation table + return collect(exp.(range(log(1.0e-3), log(1.0e15); length = n))) +end +function build_v(::Val{:geometric_dense}, n, seed) + # Geometric with ratio chosen so the series spans 10⁶ — keeps Float64 + # range usable up to n ≈ 10⁷ while keeping the spacing tight. + r = (1.0e6)^(1.0 / max(n - 1, 1)) + return collect(r .^ (0:(n - 1))) +end +function build_v(::Val{:geometric_sparse}, n, seed) + # Geometric spanning 10¹² with sparser spacing. + r = (1.0e12)^(1.0 / max(n - 1, 1)) + return collect(r .^ (0:(n - 1))) +end +function build_v(::Val{:power2}, n, seed) + return collect(Float64(i)^2 for i in 1:n) +end +function build_v(::Val{:sqrt}, n, seed) + return collect(sqrt(Float64(i)) for i in 1:n) +end +function build_v(::Val{:two_decade}, n, seed) + # 80% of values in [1, 2], 20% in [2, 1e6] — extreme density-then-sparse + h = (8n) ÷ 10 + a = collect(range(1.0, 2.0; length = h)) + b = collect(exp.(range(log(2.0), log(1.0e6); length = n - h))) + return sort!(vcat(a[1:(end - 1)], b)) +end +function build_v(::Val{:jittered_log}, n, seed) + base = collect(exp.(range(0.0, log(1.0e6); length = n))) + rng = StableRNG(seed) + return sort!(base .* (1.0 .+ 0.01 .* (rand(rng, n) .- 0.5))) +end + +function build_q(v, m, kind::Symbol, seed) + rng = StableRNG(seed) + if kind == :linear_uniform + # m queries uniformly random in v's full LINEAR range + return sort!(first(v) .+ rand(rng, m) .* (last(v) - first(v))) + elseif kind == :log_uniform + # m queries uniformly random in v's full LOG range — matches the + # distribution of log-spaced v itself + first(v) > 0 || error("log_uniform requires positive v") + return sort!( + exp.(log(first(v)) .+ rand(rng, m) .* (log(last(v)) - log(first(v)))) + ) + elseif kind == :dense_grid + # Evenly-spaced linear grid + return collect(range(first(v), last(v); length = m)) + elseif kind == :log_grid + first(v) > 0 || error("log_grid requires positive v") + return collect(exp.(range(log(first(v)), log(last(v)); length = m))) + else + error("unknown q kind: $kind") + end +end + +function bench_cell(v, q, strat, out, reps = 5) + F.searchsortedlast!(out, v, q; strategy = strat) + best = typemax(Float64) + for _ in 1:reps + t = @elapsed F.searchsortedlast!(out, v, q; strategy = strat) + best = min(best, t) + end + return best * 1.0e9 / length(q) +end + +function correctness_check(v, q, strat) + out = Vector{Int}(undef, length(q)) + expected = searchsortedlast.(Ref(v), q) + F.searchsortedlast!(out, v, q; strategy = strat) + return out == expected +end + +println("BitInterpolationSearch comprehensive sweep") +println("="^70) +println("Testing where BitInterp might win across spacing × density × n × m") +println() + +# Multiple seeds for noise reduction +function run_sweep() + v_kinds = ( + :uniform, :logspaced, :logspaced_wide, + :geometric_dense, :geometric_sparse, + :power2, :sqrt, :two_decade, :jittered_log, + ) + q_kinds = (:linear_uniform, :log_uniform, :dense_grid, :log_grid) + ns = (1024, 4096, 16_384, 65_536, 262_144, 1_048_576) + ms = (4, 16, 64, 256, 1024, 4096, 16_384) + + rows = [] + bit_wins = 0 + bit_within10 = 0 # within 10% of best + bit_within20 = 0 # within 20% of best + total = 0 + + for v_kind in v_kinds, q_kind in q_kinds, n in ns, m in ms + m > n && continue + v = build_v(Val(v_kind), n, 2026) + # Some q_kinds require positive v + if (q_kind == :log_uniform || q_kind == :log_grid) && first(v) <= 0 + continue + end + q = build_q(v, m, q_kind, 2027) + out = Vector{Int}(undef, m) + + # Correctness first — bail loudly if mismatch. + correctness_check(v, q, F.BitInterpolationSearch()) || + error("BitInterp correctness fail on $v_kind/$q_kind n=$n m=$m") + + times = Dict{String, Float64}() + for (name, strat) in STRATS + times[name] = bench_cell(v, q, strat, out) + end + push!(rows, (v_kind, q_kind, n, m, times)) + + # Score + explicit = [(n, t) for (n, t) in pairs(times) if n != "Auto"] + best_t = minimum(t for (_, t) in explicit) + bit_t = times["BitInterp"] + if bit_t <= best_t + bit_wins += 1 + end + if bit_t <= 1.1 * best_t + bit_within10 += 1 + end + if bit_t <= 1.2 * best_t + bit_within20 += 1 + end + total += 1 + end + + println("Cells tested: $total") + @printf( + "BitInterp wins outright: %d cells (%.1f%%)\n", + bit_wins, 100 * bit_wins / total + ) + @printf( + "BitInterp within 10%% of best: %d cells (%.1f%%)\n", + bit_within10, 100 * bit_within10 / total + ) + @printf( + "BitInterp within 20%% of best: %d cells (%.1f%%)\n", + bit_within20, 100 * bit_within20 / total + ) + println() + + # If BitInterp wins anywhere, show those cells. + win_rows = [ + r for r in rows if begin + explicit = [(n, t) for (n, t) in pairs(r[5]) if n != "Auto"] + best_t = minimum(t for (_, t) in explicit) + r[5]["BitInterp"] <= best_t + end + ] + if !isempty(win_rows) + println("Cells where BitInterp wins:") + @printf( + " %-16s %-14s %8s %6s %-10s vs second-best\n", + "v_kind", "q_kind", "n", "m", "BitInterp" + ) + for (v_kind, q_kind, n, m, times) in win_rows + explicit = sort( + [(name, t) for (name, t) in pairs(times) if name != "Auto"]; + by = x -> x[2] + ) + second = explicit[2] + @printf( + " %-16s %-14s %8d %6d %5.1f ns/q vs %s=%5.1f (%.2fx)\n", + v_kind, q_kind, n, m, times["BitInterp"], + second[1], second[2], second[2] / times["BitInterp"] + ) + end + println() + end + + # Show the cells where BitInterp is closest to winning (top 10). + closest = sort( + rows; by = r -> begin + explicit = [(n, t) for (n, t) in pairs(r[5]) if n != "Auto"] + best_t = minimum(t for (_, t) in explicit) + r[5]["BitInterp"] / best_t + end + ) + println("Closest 10 cells (BitInterp/best ratio):") + @printf( + " %-16s %-14s %8s %6s %-10s ratio winner\n", + "v_kind", "q_kind", "n", "m", "BitInterp" + ) + for (v_kind, q_kind, n, m, times) in closest[1:min(10, end)] + explicit = [(name, t) for (name, t) in pairs(times) if name != "Auto"] + best_t = minimum(t for (_, t) in explicit) + winner = first(sort(explicit; by = x -> x[2])) + @printf( + " %-16s %-14s %8d %6d %5.1f ns/q %.2fx %s\n", + v_kind, q_kind, n, m, times["BitInterp"], + times["BitInterp"] / best_t, winner[1] + ) + end + + return rows +end + +@time run_sweep() diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl new file mode 100644 index 0000000..8aba939 --- /dev/null +++ b/benchmark/benchmarks.jl @@ -0,0 +1,132 @@ +# AirspeedVelocity-driven benchmark suite. Each `SUITE[...]` entry is a +# `BenchmarkGroup` (or `@benchmarkable`) that AirspeedVelocity will time on +# both the PR branch and the base branch, then diff. Keep individual cells +# fast (< 200 ms target) so the CI run finishes in a few minutes. + +using BenchmarkTools +using FindFirstFunctions +using StableRNGs + +const SUITE = BenchmarkGroup() + +# Helper: build a sorted Float64 vector with the given pattern. +function _make_v(kind, n, seed = 2026) + rng = StableRNG(seed) + return if kind === :uniform + collect(range(0.0, Float64(n); length = n)) + elseif kind === :logspaced + collect(exp.(range(0.0, log(1.0e6); length = n))) + elseif kind === :random_sorted + sort!(rand(rng, n) .* n) + else + error("unknown v kind $kind") + end +end + +function _make_q(v, m, kind, seed = 2027) + rng = StableRNG(seed) + return if kind === :dense + collect(range(first(v), last(v); length = m)) + elseif kind === :sparse + sort!(first(v) .+ rand(rng, m) .* (last(v) - first(v))) + elseif kind === :clustered + j = max(1, length(v) ÷ 4) + lo = v[j] + hi = v[min(j + 1, length(v))] + span = hi - lo + sort!(lo .+ rand(rng, m) .* max(span, 1.0)) + else + error("unknown q kind $kind") + end +end + +# --------------------------------------------------------------------------- +# Per-strategy single-query micro-benchmarks (Int64, n=1024, hint near answer) +# --------------------------------------------------------------------------- +SUITE["per_query"] = BenchmarkGroup() +let v = collect(Int64, 1:1024), x = Int64(500), hint = 480 + for (name, strategy) in [ + ("LinearScan", LinearScan()), + ("SIMDLinearScan", SIMDLinearScan()), + ("BracketGallop", BracketGallop()), + ("ExpFromLeft", ExpFromLeft()), + ("InterpolationSearch", InterpolationSearch()), + ("BinaryBracket", BinaryBracket()), + ("Auto", Auto()), + ] + SUITE["per_query"][name] = @benchmarkable( + searchsortedlast($strategy, $v, $x, $hint), + evals = 1, samples = 200, + ) + end +end + +# --------------------------------------------------------------------------- +# Batched in-place benchmarks: a small grid of (v_kind, q_kind, n, m). +# Auto vs hand-picked strategies, plus the queries_sorted = true fast path. +# --------------------------------------------------------------------------- +SUITE["batched"] = BenchmarkGroup() +for v_kind in (:uniform, :logspaced, :random_sorted), + q_kind in (:dense, :sparse, :clustered), + (n, m) in ((1024, 64), (16_384, 256), (65_536, 4096)) + + v = _make_v(v_kind, n) + q = _make_q(v, m, q_kind) + out = Vector{Int}(undef, m) + + key = "$(v_kind)/$(q_kind)/n=$n/m=$m" + grp = BenchmarkGroup() + grp["Auto"] = @benchmarkable( + FindFirstFunctions.searchsortedlast!($out, $v, $q; strategy = Auto()), + evals = 1, samples = 50, + ) + grp["Auto+sorted"] = @benchmarkable( + FindFirstFunctions.searchsortedlast!( + $out, $v, $q; + strategy = Auto(), queries_sorted = true, + ), + evals = 1, samples = 50, + ) + grp["Auto+props"] = @benchmarkable( + FindFirstFunctions.searchsortedlast!( + $out, $v, $q; + strategy = Auto(SearchProperties($v)), + ), + evals = 1, samples = 50, + ) + SUITE["batched"][key] = grp +end + +# --------------------------------------------------------------------------- +# SearchProperties construction cost (probe + NaN scan on Float64) +# --------------------------------------------------------------------------- +SUITE["props_construct"] = BenchmarkGroup() +for (name, v) in [ + ("uniform_64k", collect(range(0.0, 100.0; length = 65_536))), + ("logspaced_64k", collect(exp.(range(0.0, log(1.0e6); length = 65_536)))), + ("int64_64k", collect(Int64, 1:65_536)), + ] + SUITE["props_construct"][name] = @benchmarkable( + SearchProperties($v), + evals = 1, samples = 50, + ) +end + +# --------------------------------------------------------------------------- +# Equality search comparison +# --------------------------------------------------------------------------- +SUITE["equality"] = BenchmarkGroup() +let v = collect(Int64, 1:65_536), x = Int64(50_000) + SUITE["equality"]["findequal_BinaryBracket"] = @benchmarkable( + findequal(BinaryBracket(), $v, $x), + evals = 1, samples = 200, + ) + SUITE["equality"]["findequal_BisectThenSIMD"] = @benchmarkable( + findequal(BisectThenSIMD(), $v, $x), + evals = 1, samples = 200, + ) + SUITE["equality"]["findfirstsortedequal"] = @benchmarkable( + findfirstsortedequal($x, $v), + evals = 1, samples = 200, + ) +end diff --git a/docs/make.jl b/docs/make.jl index f3845e9..be60ee1 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,7 +15,14 @@ makedocs( assets = ["assets/favicon.ico"], canonical = "https://docs.sciml.ai/FindFirstFunctions/stable/" ), - pages = ["index.md"] + pages = [ + "Home" => "index.md", + "Interface and extension rules" => "interface.md", + "Search strategies" => "strategies.md", + "Guessers" => "guessers.md", + "Auto: heuristics and benchmarks" => "auto.md", + "Equality search" => "equality.md", + ] ) deploydocs(repo = "github.com/SciML/FindFirstFunctions.jl"; push_preview = true) diff --git a/docs/src/auto.md b/docs/src/auto.md new file mode 100644 index 0000000..74fe7ff --- /dev/null +++ b/docs/src/auto.md @@ -0,0 +1,331 @@ +# Auto: heuristics and benchmarks + +[`Auto`](@ref FindFirstFunctions.Auto) is the default strategy for the +batched API. This page documents the decision tree it follows, the +crossover constants embedded in it, and the benchmark sweep used to +validate them. The numbers below are reproducible on any machine — the +script at the end of the page generates the comparison grid. + +## What `Auto` decides + +The decision differs between per-query and batched callers. + +### Per-query: `searchsortedlast(Auto(), v, x[, hint])` + +``` +hint missing or out of axes(v) → BinaryBracket +length(v) ≤ 16 → LinearScan # _AUTO_LINEAR_THRESHOLD +otherwise → BracketGallop +``` + +`Auto` never picks `InterpolationSearch` or `ExpFromLeft` in the per-query +path. Both pay setup costs (endpoint reads for `InterpolationSearch`, 5 +linear probes for `ExpFromLeft`) that the per-query path can't amortize. + +### Batched: `searchsortedlast!(out, v, queries; strategy = Auto())` + +The decision is driven by a single number: the expected average **gap** in +`v`'s index space between consecutive query results. For sorted numeric +queries this is `floor(n * span(queries) / span(v) / m)`; for non-numeric +or pathologically-spaced data it falls back to `n ÷ m`. + +``` +m == 0 → return out unchanged +m == 1 → one direct unhinted call +queries unsorted → unhinted Base.searchsortedlast loop +gap ≤ 4 → LinearScan +4 < gap ≤ _auto_simd_gap_max(v) + AND SIMD-eligible (Int64/Float64) → SIMDLinearScan +gap ≥ 8 AND n ≥ 1024 AND m ≥ 2 + AND not skewed + AND linearity probe accepts → InterpolationSearch +gap ≥ 16 → BracketGallop +otherwise → ExpFromLeft +``` + +`_auto_simd_gap_max(v)` is 64 for `DenseVector{Int64}` and `DenseVector{Float64}`, +and 0 (never picked) for any other element type. For Float64 the SIMD path +requires `v` to be NaN-free; if a populated `SearchProperties(v)` is attached +to `Auto`, the cached `has_nan` flag is consulted, otherwise no-NaN is assumed +(the same trust contract `Base.searchsortedlast` uses for sortedness of `v`). + +The "linearity probe accepts" check is tiered: + + - For `_AUTO_INTERP_MIN_GAP ≤ gap < _AUTO_INTERP_LOOSE_GAP` (8 to 256): + strict 0.1% sampled tolerance — only truly uniform data passes. + - For `gap ≥ _AUTO_INTERP_LOOSE_GAP` (≥ 256): loose 5% tolerance — at + these gap sizes, `InterpolationSearch`'s cache benefit compensates for + a worse guess, and the bounded-binary-search refinement is still + O(log n). This unlocks `InterpolationSearch` on approximately linear + data like sorted random or mildly jittered vectors at large `n`. + +The "skewed" check guards `InterpolationSearch` from the opposite direction: +if the queries are clustered into one region of their span (median query +more than 20% off the midpoint of the query span), `Auto` falls through to +`BracketGallop` / `ExpFromLeft`, because consecutive queries land in the +same neighbourhood and the previous-result hint is worth more than the +linear-extrapolation guess. Skew detection is gated on `m ≥ 10` — for +smaller `m` the median sampling variance overwhelms the signal. + +The `BracketGallop` fallback at `gap ≥ _AUTO_GALLOP_GAP_MIN` (= 16) exists +because, at large gaps, `ExpFromLeft`'s five up-front linear probes are +guaranteed to miss — the answer is much more than 5 elements past the +previous-result hint. `BracketGallop` skips that wasted preamble and starts +doubling from one position past `hint`. + +## Crossover constants + +The constants are defined at the top of `src/FindFirstFunctions.jl` and +reproduced here so they are easy to find from the docs: + +| Constant | Value | What it gates | +|---|---|---| +| `_AUTO_LINEAR_THRESHOLD` | 16 | Per-query `LinearScan` vs `BracketGallop` crossover on hinted calls. | +| `_AUTO_BATCH_LINEAR_GAP` | 4 | Batched `LinearScan` ceiling. | +| `_AUTO_SIMD_GAP_MAX` (Int64 / Float64) | 64 / 64 | Maximum gap for `SIMDLinearScan` on dense Int64 / Float64. | +| `_AUTO_GALLOP_GAP_MIN` | 16 | Above this gap, prefer `BracketGallop` over `ExpFromLeft` when `InterpolationSearch` isn't picked. | +| `_AUTO_INTERP_MIN_GAP` | 8 | Minimum gap below which `InterpolationSearch` is never picked. | +| `_AUTO_INTERP_MIN_N` | 1024 | Minimum `length(v)` below which `InterpolationSearch` is never picked. | +| `_AUTO_INTERP_MIN_M` | 2 | Minimum `length(queries)`; single-query batches skip the heuristic entirely. | +| `_AUTO_INTERP_LOOSE_GAP` | 256 | At this gap, the linearity probe switches to the loose tolerance. | +| `_AUTO_LINEAR_REL_TOLERANCE` | 1.0e-3 | Strict-tier tolerance of the sampled-linearity probe. | +| `_AUTO_LINEAR_LOOSE_TOLERANCE` | 5.0e-2 | Loose-tier tolerance, used for `gap ≥ _AUTO_INTERP_LOOSE_GAP`. | + +These are not user-tunable from outside — they shipped with the version of +the package documented here. Tightening or loosening them requires a PR +with new benchmark numbers across the regime grid below. + +## Why each branch is there + +**`gap ≤ 4 → LinearScan`.** `ExpFromLeft` issues 5 unconditional linear +probes at the start of every call. When the gap is 0 or 1, those probes +are wasted. `LinearScan` walks the same handful of indices with no +bracketing overhead and wins by ~30%. The crossover is empirically at gap +≈ 4 on AVX2 hardware; below that, `LinearScan` is strictly faster. + +**`InterpolationSearch` only on linear `v`.** On uniformly-spaced numeric +data, `InterpolationSearch`'s first guess is the answer — one comparison +per query, independent of `n`. On irregular data the guess is bad and the +binary-search fallback runs over the whole vector, costing ~14× more than +`ExpFromLeft` on log-spaced data and ~4× more on two-scale (piecewise +dense+sparse) data. The 0.1% tolerance on the linearity probe is what +keeps it in the linear regime where it wins. + +**`ExpFromLeft` is the default sparse-batched choice.** Across every +non-linear `v` pattern tested, `ExpFromLeft` from `prev_result` is within +20% of the per-cell optimum. It handles `gap = 5` (3-5 linear probes, +hit) just as well as `gap = 10⁴` (doubling search across 14 levels) with +no setup cost worth speaking of. + +**Skew override.** Even on linear `v`, when consecutive queries land in +the same region (clustered queries), `prev_result` is closer to the +current answer than the linear-extrapolation guess from the endpoints. +`ExpFromLeft` wins in that regime by ~2-3×. + +## Reproducing the benchmarks + +The full sweep lives at [`bench/auto_sweep.jl`](https://github.com/SciML/FindFirstFunctions.jl/blob/main/bench/auto_sweep.jl) +with the regime grid pre-configured. Run with +`julia --project=bench bench/auto_sweep.jl`. It evaluates every shipped +strategy against every regime cell, computes the per-cell winner, and +reports `Auto`'s slack distribution against that optimum. An analysis +helper at [`bench/analyze.jl`](https://github.com/SciML/FindFirstFunctions.jl/blob/main/bench/analyze.jl) +reads the resulting `bench/results.csv` and prints per-strategy +win-by-regime tables. + +The inline script below is the minimum-viable version of the same sweep — +copy into a file and execute if you want to validate the numbers without +cloning the repository. + +```julia +using BenchmarkTools, FindFirstFunctions, Random, Statistics + +const STRATEGIES = ( + LinearScan(), BracketGallop(), ExpFromLeft(), + InterpolationSearch(), BinaryBracket(), +) +const STRAT_NAMES = ("LinearScan", "BracketGallop", "ExpFromLeft", + "InterpolationSearch", "BinaryBracket") + +# ----- v patterns ------------------------------------------------------------ +make_v(:uniform, n) = collect(range(0.0, 1.0; length = n)) +make_v(:jittered, n) = begin + rng = Xoshiro(0) + base = collect(range(0.0, 1.0; length = n)) + sort!(base .+ 0.01 .* (rand(rng, n) .- 0.5) ./ n) +end +make_v(:logspaced, n) = collect(exp.(range(log(1.0), log(1.0e6); length = n))) +make_v(:twoscale, n) = begin + h = n ÷ 2 + sort!(vcat(range(0.0, 0.1; length = h), + range(0.1, 1.0; length = n - h))) +end +make_v(:random, n) = sort!(rand(Xoshiro(0), n)) + +# ----- query patterns -------------------------------------------------------- +make_q(:dense, v, m) = collect(range(v[1], v[end]; length = m)) +make_q(:sparse, v, m) = begin + rng = Xoshiro(1) + sort!(rand(rng, m) .* (v[end] - v[1]) .+ v[1]) +end +make_q(:clustered, v, m) = begin + # all queries inside one segment of v + j = length(v) ÷ 4 + sort!(rand(Xoshiro(2), m) .* (v[j + 1] - v[j]) .+ v[j]) +end +make_q(:sorted_rand, v, m) = sort!(rand(Xoshiro(3), m) .* (v[end] - v[1]) .+ v[1]) + +# ----- bench one cell -------------------------------------------------------- +function bench_cell(v_kind, q_kind, n, m) + v = make_v(Val(v_kind), n) + queries = make_q(Val(q_kind), v, m) + out = Vector{Int}(undef, m) + times = Float64[] + for s in STRATEGIES + t = @belapsed FindFirstFunctions.searchsortedlast!( + $out, $v, $queries; strategy = $s + ) + push!(times, t) + end + t_auto = @belapsed FindFirstFunctions.searchsortedlast!( + $out, $v, $queries; strategy = Auto() + ) + return times, t_auto +end + +# wrap so we can dispatch on Val(:uniform), etc. +make_v(::Val{K}, n) where {K} = make_v(K, n) +make_q(::Val{K}, v, m) where {K} = make_q(K, v, m) + +# ----- sweep ----------------------------------------------------------------- +function run_sweep() + ns = (64, 256, 1024, 4096, 16384, 65536) + ms = (1, 4, 16, 64, 256, 1024, 4096) + v_kinds = (:uniform, :jittered, :logspaced, :twoscale, :random) + q_kinds = (:dense, :sparse, :clustered, :sorted_rand) + + println("v_kind\tq_kind\tn\tm\tAuto/best\tbest_strat\tAuto_pick_strat") + autoslack = Float64[] + for v_kind in v_kinds, q_kind in q_kinds, n in ns, m in ms + m > n && continue + times, t_auto = bench_cell(v_kind, q_kind, n, m) + best, j = findmin(times) + ratio = t_auto / best + push!(autoslack, ratio) + println(string(v_kind, "\t", q_kind, "\t", n, "\t", m, + "\t", round(ratio; digits = 2), + "\t", STRAT_NAMES[j])) + end + println() + println("Auto-vs-best ratio summary:") + println(" median ", round(median(autoslack); digits = 3)) + println(" mean ", round(mean(autoslack); digits = 3)) + println(" p90 ", round(quantile(autoslack, 0.9); digits = 3)) + println(" max ", round(maximum(autoslack); digits = 3)) +end + +run_sweep() +``` + +### Headline results + +The 2.0 sweep covers 1080 cells across 5 `v` patterns × 4 query patterns × +5 `n` sizes × 6 `m` sizes × 2 element types (`Int64`, `Float64`), measured +on AVX2 hardware. + +| Metric | `Auto()` | `Auto(SearchProperties(v))` | +|---|---|---| +| median slack | 1.04× | 1.03× | +| mean slack | 1.09× | 1.08× | +| p90 slack | 1.33× | 1.31× | +| p95 slack | 1.38× | 1.38× | +| max slack | 2.18× | 2.09× | + +Per-cell winner distribution across the sweep: + + - LinearScan: 47% of cells (small-gap regime, all eltypes) + - SIMDLinearScan: 25% of cells (medium-gap regime, Int64/Float64 dense) + - InterpolationSearch: 13% of cells (large-gap regime, ~linear v) + - ExpFromLeft: 8% of cells (small-medium-gap regime, non-SIMD eltypes) + - BracketGallop: 8% of cells (large-gap regime, non-linear v) + +The cells where `Auto` slack is highest are boundary cells at `m = 4` where +the per-cell winner is a measurement artefact (each measurement amortises +over only 4 queries, so per-call setup noise dominates). The bulk of the +distribution sits well below 1.5×. + +### Reading the comparison table + +For each cell the script prints `Auto/best` (a ratio ≥ 1) along with the +per-cell winner and `Auto`'s actual pick. `Auto/best = 1.0` means `Auto` +picked the winner; `Auto/best = 1.5` means whatever `Auto` picked was 50% +slower than the per-cell winner. Investigate any row where the ratio +exceeds 1.5 — those are candidate cells for tightening one of the +constants in the table above. + +## Caching with `SearchProperties` + +Every call to `searchsortedlast!(out, v, queries; strategy = Auto())` against +the same `v` re-runs the same probes — `_sampled_looks_linear(v)` reads 11 +elements (~25 ns), and the cost is per-call regardless of how many times +you've already searched `v`. For callers issuing many short batches against +a single sorted vector (interpolation segment lookups being the obvious +case), caching the probes once and reusing the result is a real win. + +The cache is a small `isbits` struct, [`SearchProperties`](@ref +FindFirstFunctions.SearchProperties), that `Auto` accepts via +`Auto(props)`: + +```julia +using FindFirstFunctions + +v = collect(0.0:0.001:100.0) +props = SearchProperties(v) # run probes once +strat = Auto(props) # `Auto` holding the cached facts + +# Every subsequent searchsortedlast!/searchsortedfirst! call skips the +# linearity probe inside Auto. +queries = sort!(rand(8) .* 100.0) +out = Vector{Int}(undef, length(queries)) +searchsortedlast!(out, v, queries; strategy = strat) +``` + +`SearchProperties` is `isbits` — it travels in registers and copies are +free. `Auto(props)` is itself zero-allocation; the resulting `Auto` is a +single concrete struct, not a parametric type, so call sites stay +type-stable without specialization explosions. + +Currently consumed: `props.is_linear` (replaces Auto's +`_sampled_looks_linear` probe in the batched path). The other fields +(`has_props`, `has_nan`) are populated by `SearchProperties(v)` for forward +compatibility but no strategy reads them yet. Construction cost is O(1) for +integer eltypes (only the sampled-linearity probe runs) and O(n) for +floating-point eltypes (additionally `any(isnan, v)`). + +Trust contract: the cache is not invalidated automatically. If `v` mutates +after `SearchProperties(v)`, the caller must reconstruct the cache. Lying +to `Auto` via a hand-constructed `SearchProperties(true, true, false)` on +genuinely non-linear data is correctness-preserving (the chosen +`InterpolationSearch` falls through to `BracketGallop` from a bad guess — +slow but still O(log n)), so the worst case of a stale cache is a +performance regression, not wrong answers. + +## When `Auto` is wrong for you + +If your workload sits in a corner that `Auto` doesn't read well, pin the +strategy directly: + +```julia +# Sorted batch over a known-linear range, large m and n: skip the probe. +searchsortedlast!(out, v, queries; strategy = InterpolationSearch()) + +# Sorted batch but queries are guaranteed adjacent: don't pay for the +# 5 linear probes in ExpFromLeft. +searchsortedlast!(out, v, queries; strategy = LinearScan()) + +# No hint and the access pattern is random: skip Auto's probes entirely. +searchsortedlast!(out, v, queries; strategy = BinaryBracket()) +``` + +The strategy types are zero-allocation singletons, so pinning is free at +runtime and just removes the heuristic from the hot path. diff --git a/docs/src/equality.md b/docs/src/equality.md new file mode 100644 index 0000000..1fdf3e9 --- /dev/null +++ b/docs/src/equality.md @@ -0,0 +1,77 @@ +# Equality search + +This page documents the package's equality-search routines — +[`findfirstequal`](@ref FindFirstFunctions.findfirstequal) and +[`findfirstsortedequal`](@ref FindFirstFunctions.findfirstsortedequal). They +answer a *different* question from the [Search strategies](@ref) covered +elsewhere: + + - **Sorted-search** strategies answer "*where would `x` insert?*". The + return value is always an in-range index — the bracketing position. + - **Equality** search answers "*does `x` exist, and if so at what index?*". + The return value is `nothing` if `x` is not present. + +These two surfaces live side-by-side because the second cannot be expressed +as a `SearchStrategy` — its return type differs (`Union{Int, Nothing}` vs. +in-range `Int`), and the unsorted variant doesn't even require a sorted +input. + +For a strategy-framework-compatible *sorted* equality search that returns +`Int` with a sentinel (so it composes with the rest of the strategy +dispatch), see [`findequal`](@ref FindFirstFunctions.findequal) on the +[Search strategies](@ref) page. `findequal(BisectThenSIMD(), v, x)` is the +strategy entry point that internally calls into the same algorithm as +`findfirstsortedequal`. + +## API reference + +```@docs +FindFirstFunctions.findfirstequal +FindFirstFunctions.findfirstsortedequal +``` + +## When to use which + +| Question | Vector | Recommended | +|---|---|---| +| Does `x` occur in this *unsorted* vector? | any | [`findfirstequal`](@ref FindFirstFunctions.findfirstequal) | +| Does `x` occur in this *sorted* vector? | `DenseVector{Int64}` + `Int64` | [`findfirstsortedequal`](@ref FindFirstFunctions.findfirstsortedequal) (or `findequal(BisectThenSIMD(), v, x)` for the sentinel-returning variant) | +| Does `x` occur in this *sorted* vector? | other eltypes | [`findequal`](@ref FindFirstFunctions.findequal) with any strategy | +| Where would `x` insert into this sorted vector? | any | `searchsortedfirst(strategy, v, x[, hint])` | + +## SIMD primitives + +Both equality functions are backed by the same SIMD-equality LLVM IR +scaffolding used internally throughout the package (`load <8 x i64>`, +`icmp eq`, `cttz` on the bitmask of the 8-wide compare). The IR template +[`FindFirstFunctions._simd_scan_ir`](@ref) generates this for the equality +predicate; the same template generates the `>` / `>=` variants for +[`SIMDLinearScan`](@ref FindFirstFunctions.SIMDLinearScan). + +The SIMD path is `Int64`-only — the LLVM IR is keyed off `i64` element +width and 8-byte stride. Every other element type and storage layout +falls through to a scalar `findfirst(==(x), v)` path. Specifically: + + - `findfirstequal(x::Int64, v::DenseVector{Int64})` — full SIMD scan. + - `findfirstequal(x, v)` (generic) — `findfirst(isequal(x), v)`. + - `findfirstsortedequal(x::Int64, v::DenseVector{Int64})` — branchless + binary bisection to a small basecase, then SIMD equality scan within + the basecase window. The bisection uses a strict `<` predicate so + that earlier duplicates are not skipped (a previous version of the + routine used `<=` and could return a later duplicate; fixed in 2.0). + +## Sentinel vs. `Nothing` + +The two equality APIs differ in how they report "not found": + +```julia +findfirstequal(x, v) # -> Union{Int, Nothing}, `nothing` on miss +findfirstsortedequal(x, v) # -> Union{Int, Nothing}, `nothing` on miss +findequal(strategy, v, x) # -> Int, firstindex(v) - 1 on miss +``` + +`findequal`'s sentinel is type-stable and composes with the rest of the +strategy dispatch — that's the recommended choice for new sorted-equality +code. The two `findfirst*equal` names continue to return +`Union{Int, Nothing}` for backwards compatibility with callers and pattern +matching against `nothing`. diff --git a/docs/src/guessers.md b/docs/src/guessers.md new file mode 100644 index 0000000..98f18fd --- /dev/null +++ b/docs/src/guessers.md @@ -0,0 +1,119 @@ +# Guessers + +A [`Guesser`](@ref FindFirstFunctions.Guesser) is a small wrapper around a +sorted vector that produces an integer index hint for a given query, using +one of two strategies decided at construction time: + + - **Linear-extrapolation lookup**, when `v` is approximately uniformly + spaced. The guess is `firstindex(v) + round((x - v[1]) / (v[end] - v[1]) * (length(v) - 1))`. + Cost is O(1) per call, independent of `length(v)`. + - **Cached previous result**, otherwise. The guesser hands back its + `idx_prev` field. Useful for callers that issue temporally-correlated + queries — the previous answer is a good seed for `BracketGallop`. + +The Guesser does **not** perform the actual search. To use it as a search +strategy, wrap it in [`GuesserHint`](@ref FindFirstFunctions.GuesserHint). + +## Construction + +```@docs +FindFirstFunctions.Guesser +``` + +The threshold passed via `looks_linear_threshold` is the same threshold used +by [`looks_linear`](@ref FindFirstFunctions.looks_linear) (which the Guesser +calls under the hood). The default `1e-2` accepts everything from exact +linear ranges through evenly-spaced grids with mild jitter, but rejects +log-spaced or piecewise-spaced data. + +```@docs +FindFirstFunctions.looks_linear +``` + +## Calling a Guesser + +`guesser(x)` returns an integer index hint, but does **not** update any +state. The returned hint may be outside `axes(v)` if `x` is past either end +of `v`; callers should treat the hint as advisory and clamp. + +```julia +v = collect(0.0:0.1:10.0) +g = Guesser(v) +g(3.14) # → 32, an O(1) extrapolation guess +``` + +For irregular data the same call returns `g.idx_prev[]` — the last index +written by a [`GuesserHint`](@ref FindFirstFunctions.GuesserHint) search (or +`1` if no search has run yet). + +## Plugging into the strategy dispatch + +[`GuesserHint`](@ref FindFirstFunctions.GuesserHint) is the strategy adapter +that turns a `Guesser` into a `SearchStrategy`. It: + + 1. Calls `guesser(x)` to obtain an integer guess. + 2. Dispatches to [`BracketGallop`](@ref FindFirstFunctions.BracketGallop) + from that guess. + 3. Writes the resulting index back into `guesser.idx_prev`. + +```@docs +FindFirstFunctions.GuesserHint +``` + +Per-call cost is one `guesser(x)` evaluation (O(1)) plus one `BracketGallop` +(O(1) when the guess is close) plus one `idx_prev[]` write. + +The vector passed to `searchsortedfirst` / `searchsortedlast` must be the +same object the Guesser wraps — `GuesserHint` asserts `v === s.guesser.v` to +catch the obvious misuse. + +```julia +v = collect(0.0:0.1:10.0) +g = Guesser(v) +strat = GuesserHint(g) + +i = searchsortedlast(strat, v, 3.14) +@assert g.idx_prev[] == i # the guesser caches the last result +``` + +`GuesserHint` ignores any externally-supplied hint — the Guesser carries its +own hint state, and accepting a foreign hint would defeat the cache. + +## Pattern: correlated lookups for interpolation + +The intended use is a wrapper struct that owns both the sorted vector and a +matching `Guesser`. Every query against the wrapper feeds through +`GuesserHint`: + +```julia +struct Interp{V} + v::V + g::Guesser{V} +end +Interp(v) = Interp(v, Guesser(v)) + +function find_segment(itp::Interp, x) + return searchsortedlast(GuesserHint(itp.g), itp.v, x) +end +``` + +After warmup, repeated calls to `find_segment` with temporally correlated +`x` cost O(1): the previous result is one slot away from the current +answer, so the `BracketGallop` inside `GuesserHint` returns after a couple +of comparisons. + +## When not to use a Guesser + + - **One-shot queries.** Construct cost is O(n) for the `looks_linear` + probe — wasted if you only intend to search once. Use + [`BinaryBracket`](@ref FindFirstFunctions.BinaryBracket) or pass `Auto()` + with no hint. + - **Sorted batches.** The batched + [`searchsortedlast!`](@ref FindFirstFunctions.searchsortedlast!) / + [`searchsortedfirst!`](@ref FindFirstFunctions.searchsortedfirst!) APIs + already do `prev_result`-style hinting internally; they don't need a + Guesser and they pick a faster strategy when the sweep is dense. + - **Strictly random queries against irregular `v`.** `looks_linear` returns + false, so the Guesser degrades to returning `idx_prev[]`, which is + useless for random queries. `BracketGallop` from `idx_prev` is fine but + `Auto`-on-a-batch does the same thing without the Guesser overhead. diff --git a/docs/src/index.md b/docs/src/index.md index f516303..045b97c 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,29 +1,61 @@ # FindFirstFunctions.jl -FindFirstFunctions.jl is a library for accelerated `findfirst` type functions. +FindFirstFunctions.jl is a library of accelerated `findfirst`-style and +sorted-search routines. The package provides: + + - A **strategy-dispatched** sorted-search API: a single pair of + `Base.searchsortedfirst` / `Base.searchsortedlast` overloads that take a + [`SearchStrategy`](@ref FindFirstFunctions.SearchStrategy) as the first + positional argument. + - **Batched** in-place lookups + [`searchsortedfirst!`](@ref FindFirstFunctions.searchsortedfirst!) / + [`searchsortedlast!`](@ref FindFirstFunctions.searchsortedlast!) that pick + a strategy automatically. + - **Guessers** that supply per-vector hints based on linear extrapolation + or a cached previous result. + - **Equality search** via [`findfirstequal`](@ref FindFirstFunctions.findfirstequal) + and [`findfirstsortedequal`](@ref FindFirstFunctions.findfirstsortedequal). ## Installation -To install FindFirstFunctions.jl, use the Julia package manager: - ```julia using Pkg Pkg.add("FindFirstFunctions") ``` -## Available Functions - -```@docs -FindFirstFunctions.findfirstequal -FindFirstFunctions.bracketstrictlymontonic -FindFirstFunctions.looks_linear -FindFirstFunctions.Guesser -FindFirstFunctions.searchsortedfirstcorrelated -FindFirstFunctions.searchsortedlastcorrelated -FindFirstFunctions.searchsortedfirstexp -FindFirstFunctions.searchsortedfirstvec -FindFirstFunctions.searchsortedlastvec -FindFirstFunctions.findfirstsortedequal +## Guide + + - [Interface and extension rules](@ref) — the public API surface, the + contract a `SearchStrategy` subtype must satisfy, and how to add a new + one. + - [Search strategies](@ref) — catalog of the built-in strategies, when each + one is fast, and when it falls back to plain binary search. + - [Guessers](@ref) — the [`Guesser`](@ref FindFirstFunctions.Guesser) type + and how to plug it into the strategy dispatch via + [`GuesserHint`](@ref FindFirstFunctions.GuesserHint). + - [Auto: heuristics and benchmarks](@ref) — what + [`Auto`](@ref FindFirstFunctions.Auto) picks in every regime, the + crossover constants, and the benchmark script that validates them. + - [Equality search](@ref) — the dedicated equality routines + [`findfirstequal`](@ref FindFirstFunctions.findfirstequal) and + [`findfirstsortedequal`](@ref FindFirstFunctions.findfirstsortedequal), + which live outside the strategy framework because their return type is + `Union{Int, Nothing}`. + +## Quick example + +```julia +using FindFirstFunctions + +v = collect(0.0:0.1:10.0) +queries = sort!(rand(100) .* 10) + +# Single query with a hint. +i = searchsortedlast(BracketGallop(), v, 3.14, 30) + +# Batched, with strategy chosen by Auto. +idx = Vector{Int}(undef, length(queries)) +searchsortedlast!(idx, v, queries) ``` ## Contributing diff --git a/docs/src/interface.md b/docs/src/interface.md new file mode 100644 index 0000000..b07a98a --- /dev/null +++ b/docs/src/interface.md @@ -0,0 +1,167 @@ +# Interface and extension rules + +This page documents the public sorted-search API surface and the contract a +custom [`SearchStrategy`](@ref FindFirstFunctions.SearchStrategy) subtype +must satisfy. + +## Public API surface + +In 2.x the sorted-search public API is a single pair of generic functions +overloaded on a strategy as the first positional argument: + +```julia +searchsortedfirst(strategy, v, x[, hint]; order = Base.Order.Forward) +searchsortedlast(strategy, v, x[, hint]; order = Base.Order.Forward) +``` + +and the in-place batched variants: + +```julia +searchsortedfirst!(idx_out, v, queries; strategy = Auto(), order = Base.Order.Forward) +searchsortedlast!(idx_out, v, queries; strategy = Auto(), order = Base.Order.Forward) +``` + +The strategy types live in the `FindFirstFunctions` module; the +`searchsortedfirst`/`searchsortedlast` names are extended from `Base` so they +compose with existing `Base.Order` orderings. + +```@docs +FindFirstFunctions.searchsortedfirst! +FindFirstFunctions.searchsortedlast! +``` + +## Rules of the interface + + 1. **Hint is optional and is an `Integer`.** When supplied it must be a + valid index into `v` (`firstindex(v) ≤ hint ≤ lastindex(v)`). An + out-of-range hint is silently treated as "no hint" by every built-in + strategy. A strategy is allowed to ignore the hint entirely + ([`InterpolationSearch`](@ref FindFirstFunctions.InterpolationSearch), + [`BinaryBracket`](@ref FindFirstFunctions.BinaryBracket)). + 2. **Strategies are singletons or wrappers.** `LinearScan`, `BracketGallop`, + `ExpFromLeft`, `InterpolationSearch`, `BinaryBracket`, and `Auto` are + zero-field singletons. + [`GuesserHint`](@ref FindFirstFunctions.GuesserHint) is a thin wrapper + around a [`Guesser`](@ref FindFirstFunctions.Guesser). New strategies + should follow the same pattern: parameters that change *behaviour* + belong on the type; parameters that change *cost only* should be tuned + internally. + 3. **No mutation of `v` or `x`.** A strategy never writes to the searched + vector or to the query. The only state that may change across calls is + hint state carried by the strategy itself (e.g. `GuesserHint` updates + `guesser.idx_prev`). + 4. **Order is honored.** Every strategy accepts an `order::Base.Order.Ordering` + keyword and returns indices consistent with `Base.searchsortedfirst` / + `Base.searchsortedlast` under that ordering. Strategies that are only + efficient under `Forward` ordering (e.g. `InterpolationSearch`, + `ExpFromLeft`) fall back to `BinaryBracket` for non-`Forward` orderings. + 5. **`AbstractRange` is already O(1).** `Base.searchsortedfirst(r::AbstractRange, x)` + has a closed-form implementation. Strategies do not need range fast + paths — the `BinaryBracket` fallback already calls into Base's + range-aware method. + +## Anatomy of a strategy + +A built-in strategy provides up to four methods on each of +`Base.searchsortedfirst` / `Base.searchsortedlast`: + +```julia +# Required: the hinted form (the strategy's reason for existing). +Base.searchsortedlast(::MyStrategy, v, x, hint::Integer; order) = ... +Base.searchsortedfirst(::MyStrategy, v, x, hint::Integer; order) = ... + +# Required: the unhinted form. Most strategies just fall back to BinaryBracket. +Base.searchsortedlast(::MyStrategy, v, x; order) = searchsortedlast(BinaryBracket(), v, x; order) +Base.searchsortedfirst(::MyStrategy, v, x; order) = searchsortedfirst(BinaryBracket(), v, x; order) +``` + +If your strategy ignores the hint, define just the unhinted form and have the +hinted form delegate to it (see `BinaryBracket` and `InterpolationSearch` in +the source). + +## How to add a new strategy + +Two steps. + +### 1. Define the strategy type + +Pick a name that describes the *algorithm*, not the use case. Make it a +subtype of [`SearchStrategy`](@ref FindFirstFunctions.SearchStrategy): + +```julia +""" + MyStrategy <: FindFirstFunctions.SearchStrategy + +One sentence on what it does. One sentence on when it wins. One sentence on +when it falls back. +""" +struct MyStrategy <: FindFirstFunctions.SearchStrategy end +``` + +If your strategy carries state (like `GuesserHint`), make it a parametric +struct: + +```julia +struct MyStrategy{S} <: FindFirstFunctions.SearchStrategy + state::S +end +``` + +### 2. Implement the dispatch methods + +At minimum, the hinted forms: + +```julia +function Base.searchsortedlast( + ::MyStrategy, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward + ) + # Algorithm body. Must return the same index that + # `Base.searchsortedlast(v, x, order)` would. + ... +end + +function Base.searchsortedfirst( + ::MyStrategy, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward + ) + ... +end +``` + +Plus unhinted forms (typically fallbacks): + +```julia +Base.searchsortedlast(s::MyStrategy, v::AbstractVector, x; order = Base.Order.Forward) = + searchsortedlast(FindFirstFunctions.BinaryBracket(), v, x; order = order) +Base.searchsortedfirst(s::MyStrategy, v::AbstractVector, x; order = Base.Order.Forward) = + searchsortedfirst(FindFirstFunctions.BinaryBracket(), v, x; order = order) +``` + +### Correctness check + +Every strategy must return the same answer as plain `Base.searchsortedlast` / +`Base.searchsortedfirst` for every `(v, x[, hint])` triple. Test with random +inputs against `Base`: + +```julia +using Test, Random +Random.seed!(0) +for trial in 1:10_000 + v = sort!(randn(rand(1:1000))) + x = randn() + hint = rand(1:length(v)) + @test searchsortedlast(MyStrategy(), v, x, hint) == searchsortedlast(v, x) + @test searchsortedfirst(MyStrategy(), v, x, hint) == searchsortedfirst(v, x) +end +``` + +### Hooking into `Auto` + +`Auto`'s decision tree lives in `_auto_pick` (per-query) and +`_searchsortedlast_batched!(_, _, _, ::Auto, _)` / +`_searchsortedfirst_batched!(_, _, _, ::Auto, _)` (batched). It is **not +extensible from outside** — new strategies do not register themselves with +`Auto` automatically. If you believe `Auto` should pick your strategy in +some regime, open an issue with benchmark numbers across the regime grid in +[Auto: heuristics and benchmarks](@ref). diff --git a/docs/src/strategies.md b/docs/src/strategies.md new file mode 100644 index 0000000..6836fb4 --- /dev/null +++ b/docs/src/strategies.md @@ -0,0 +1,244 @@ +# Search strategies + +The strategies form the parameter space of the sorted-search API. Each one +subtypes [`SearchStrategy`](@ref FindFirstFunctions.SearchStrategy) and is +selected as the first positional argument of `searchsortedfirst` / +`searchsortedlast`. + +```@docs +FindFirstFunctions.SearchStrategy +``` + +## When to pick which + +For most callers the answer is: pass [`Auto`](@ref FindFirstFunctions.Auto) +(the default in the batched API) and let it choose. The table below is for +callers who already know their access pattern and want to pin a strategy. + +| Strategy | Best when | Cost when hint hits | Cost worst case | Uses hint? | +|---|---|---|---|---| +| [`LinearScan`](@ref FindFirstFunctions.LinearScan) | answer is within a handful of slots of the hint | O(1) | O(n) | yes | +| [`SIMDLinearScan`](@ref FindFirstFunctions.SIMDLinearScan) | `DenseVector{Int64}` or `DenseVector{Float64}`, forward walk past the hint | O(1) | O(n/8) | yes | +| [`BracketGallop`](@ref FindFirstFunctions.BracketGallop) | answer may be either side of the hint, distance unknown | O(1) | ~2 log₂ n | yes | +| [`ExpFromLeft`](@ref FindFirstFunctions.ExpFromLeft) | sorted batch — hint is `prev_result`, answer is monotonically ≥ hint | O(1) | O(log n) | yes (as lower bound) | +| [`InterpolationSearch`](@ref FindFirstFunctions.InterpolationSearch) | `v` is uniformly spaced and numeric | O(1) | O(log n) | no | +| [`BitInterpolationSearch`](@ref FindFirstFunctions.BitInterpolationSearch) | `DenseVector{Float64}` and log-spaced (geometric) — opt-in, `Auto` does not pick | O(1) | O(log n) | no | +| [`BinaryBracket`](@ref FindFirstFunctions.BinaryBracket) | no hint available, or fallback | O(log n) | O(log n) | no | +| [`GuesserHint`](@ref FindFirstFunctions.GuesserHint) | repeated correlated lookups against the same `v` | O(1) | ~2 log₂ n | self-provided | +| [`Auto`](@ref FindFirstFunctions.Auto) | unknown access pattern | varies | varies | yes if supplied | + +All hint-consuming strategies fall back to `BinaryBracket` when no hint is +supplied or when the hint is out of range. `InterpolationSearch` additionally +falls back to `BinaryBracket` for non-numeric element types. + +## Reference + +```@docs +FindFirstFunctions.LinearScan +FindFirstFunctions.SIMDLinearScan +FindFirstFunctions.BracketGallop +FindFirstFunctions.ExpFromLeft +FindFirstFunctions.InterpolationSearch +FindFirstFunctions.BitInterpolationSearch +FindFirstFunctions.BinaryBracket +FindFirstFunctions.BisectThenSIMD +FindFirstFunctions.Auto +FindFirstFunctions.SearchProperties +``` + +## Equality search through the strategy framework + +Strategies answer positional questions ("where would `x` insert?"). Equality +asks a different question ("is `x` at exactly which index?"). The +[`findequal`](@ref FindFirstFunctions.findequal) wrapper builds the latter +on top of the former: every strategy gets an equality variant for free. + +```@docs +FindFirstFunctions.findequal +``` + +The sentinel for "not found" is `firstindex(v) - 1` (`= 0` for 1-based +vectors). Type-stable `Int` return, no `Union` with `Nothing`. Callers can +test for absence with `i < firstindex(v)`. + +`findequal` routes most strategies through `searchsortedfirst + post-check` +generically, so `findequal(BracketGallop(), v, x, hint)`, +`findequal(SIMDLinearScan(), v, x, hint)`, +`findequal(GuesserHint(g), v, x)`, etc. all just work. + +The [`BisectThenSIMD`](@ref FindFirstFunctions.BisectThenSIMD) strategy +short-circuits the post-check path on `DenseVector{Int64}` by dispatching +into [`findfirstsortedequal`](@ref FindFirstFunctions.findfirstsortedequal) +directly — same custom LLVM IR scan, exposed through the strategy framework. + +`GuesserHint` is documented on the [Guessers](@ref) page. + +## Notes on individual strategies + +### LinearScan + +Walks `±1` from the hint until the answer is bracketed. Cheapest possible +search when the hint is right next to the answer — two comparisons. The only +strategy whose worst case is O(n), so it should only be picked when the +caller has strong evidence that the hint is close. + +For `length(v) ≤ 16`, `LinearScan` is faster than `BracketGallop` even from a +bad hint because the bracket bookkeeping costs more than a worst-case walk +across a vector that short. `Auto`'s per-query path picks `LinearScan` below +that threshold. + +### SIMDLinearScan + +Same algorithm as `LinearScan`, with the **forward** walk past the hint +lowered to 8-wide SIMD chunks via custom LLVM IR. The backward walk (when +the hint is past the answer) uses the scalar `LinearScan` path — the SIMD +primitive is only defined in the forward direction. + +Specialized for `DenseVector{Int64}` and `DenseVector{Float64}`. Any other +element type falls back to the scalar `LinearScan` walk (this includes +`Int32`, `UInt64`, `Float32`, `Date`, `String`, and user-defined numeric +types). The dispatch is *static* — there's no runtime type test on a hot +path — so the fallback costs nothing per-call when picked at compile time. + +Caveats: + + - **Element types**: `Int64` and `Float64` only. Anything else uses + scalar `LinearScan`. This is a hard restriction of the LLVM IR: the + vector load uses `<8 x i64>` / `<8 x double>` with 8-byte stride, and + the broadcast and compare are typed accordingly. + - **NaN**: a `NaN` element in a `Float64` vector compares as `false` + under both `fcmp ogt` and `fcmp oge`, so a NaN in `v` is silently + skipped by the SIMD scan. Sorted `Float64` vectors containing `NaN` + aren't well-defined under any total order anyway — same caveat + applies to plain `Base.searchsortedlast` on such vectors. + - **Forward order only**: non-`Forward` orderings fall back to scalar + `LinearScan`. The IR is hard-coded to the `Forward` comparison + polarity. + - **No hint**: falls back to [`BinaryBracket`](@ref FindFirstFunctions.BinaryBracket). + Without a hint there's no direction information for the forward scan. + - **Auto does not pick this strategy.** `SIMDLinearScan` is opt-in. It + isn't part of the `Auto` decision tree because the regime where it + strictly beats `LinearScan` (long forward walks on `Int64`/`Float64`) + overlaps with the regime where `Auto` already prefers + [`BracketGallop`](@ref FindFirstFunctions.BracketGallop) or + [`ExpFromLeft`](@ref FindFirstFunctions.ExpFromLeft). Pin it + explicitly when you have a workload that wants a long linear forward + scan and you know the element type. + +### BitInterpolationSearch + +`InterpolationSearch` with the extrapolation guess computed on the IEEE +bit pattern of `v` rather than the float values themselves. For positive +Float64 values, the IEEE bit pattern is monotonically increasing with the +float value and is approximately *linear* in array index for log-spaced +(geometric) data. That makes the bit-domain linear extrapolation a far +better guess than the float-domain linear extrapolation on geometric data +— sometimes O(1) versus O(log n) refinement cost. + +**Opt-in only.** `Auto` does not pick `BitInterpolationSearch`. The bench +sweep at `bench/bitinterp_sweep.jl` covers 1404 cells (9 v patterns × 4 q +patterns × 6 n sizes up to 2²⁰ × 7 m sizes, exercising pure-geometric, +log-spaced over 18 decades, power-of-2 spacing, two-decade clumps, and +jittered-log alongside uniform/sqrt as negative controls). BitInterp +wins outright in 59 cells (4.2%) and sits within 10% of the per-cell +best in 75 cells (5.3%). The wins concentrate in: + + - `logspaced` / `logspaced_wide` / `geometric_dense` / `geometric_sparse` + / `jittered_log` — i.e. genuinely geometric data. + - Small `m` (= 4, occasionally 16): the per-query bit-domain guess cost + amortizes poorly across larger batches. + - Large `n` (≥ 2¹⁴, peaking at 2²⁰): the saved bracket refinement scales + with `log₂ n`, while the per-query setup cost is constant. + +Sample wins (BitInterp vs second-best, ns/q): + +| Cell | BitInterp | Runner-up | Margin | +|---|---|---|---| +| `logspaced_wide log_grid n=2²⁰ m=4` | 52.5 | InterpolationSearch 75.0 | 1.43× | +| `logspaced_wide log_grid n=2¹² m=4` | 35.0 | ExpFromLeft 47.5 | 1.36× | +| `logspaced_wide log_grid n=2¹⁸ m=4` | 47.5 | ExpFromLeft 62.5 | 1.32× | +| `logspaced_wide dense_grid n=2²⁰ m=4` | 50.0 | ExpFromLeft 65.0 | 1.30× | + +`Auto` doesn't pick it because: + - The wins are narrow (4% of cells in a bench specifically designed to + probe BitInterp's regime). + - Adding the eligibility check to `Auto`'s hot path (Float64 + positive + + log-linear sampled probe) would burn a few ns on every call, paying + back only in cells with `m ≤ 16` where Auto's overhead already + dominates the per-query cost. + - Users with a known log-spaced workload can pin + `searchsortedlast!(out, v, queries; strategy = BitInterpolationSearch())` + once and get the win without any heuristic cost. + +The strategy is retained as an opt-in for callers whose workload sits +outside what `Auto` discovers cheaply: domain-specific tables (radiation +transport, log-frequency, gravitational potentials) or hardware where +Float64 division is unusually cheap. + +Falls back to plain `InterpolationSearch` on non-Float64 dense eltypes +(where the bit pattern equals the value, making the strategies +equivalent), and to `BinaryBracket` for non-positive or non-finite Float64 +data. + +### BracketGallop + +Galloping search around the hint: expand `[lo, hi]` outward by doubling steps +until `x` is bracketed, then binary-search inside `[lo, hi]`. Direction is +inferred from `v[hint]` vs. `x`, so the hint can be either above or below the +answer. Worst case is ~2 log₂ n — about twice plain binary search — and it +matches O(1) when the hint is close. + +This is the workhorse hinted strategy and the natural choice for +`get_idx`-style callers where the hint is a cached previous result. + +### ExpFromLeft + +Exponential search forward from a hint interpreted as a *lower bound*. The +algorithm probes `v[lo], v[lo+1], …, v[lo+4]` linearly, then `v[lo+8], +v[lo+16], …` exponentially, then binary-searches inside the final bracket. + +Used by `Auto`'s batched dispatch when the queries are sorted: each call +passes `hint = previous_result`, which by sortedness satisfies the "answer ≥ +hint" precondition. When the precondition is violated (the caller passes a +hint past the answer), `ExpFromLeft` falls back to a full +`searchsortedfirst` / `searchsortedlast` — slow but correct. + +### InterpolationSearch + +Computes a guess via linear extrapolation between `v[lo]` and `v[hi]`, then +refines with a bounded binary search around that guess. On uniformly-spaced +numeric data the first guess is the right answer — O(1) per query +independent of `n`. On irregular data the guess is bad and the binary search +inside the (full) bracket falls back to O(log n). + +Two restrictions: + + - **Numeric eltype**: requires `x - v[i]` to be well-defined and produce a + number whose ratio with `v[hi] - v[lo]` makes sense. Non-numeric eltypes + fall back to `BinaryBracket`. + - **Forward ordering only**: the linear-extrapolation formula assumes + `v[lo] ≤ v[hi]`. Non-`Forward` orderings fall back to `BinaryBracket`. + +The hint is ignored — the guess is computed fresh from the endpoints. + +### BinaryBracket + +Plain `Base.searchsortedlast` / `Base.searchsortedfirst`. Provided as a +strategy so that callers can opt out of hint-based behaviour explicitly, and +so that other strategies have a well-defined name to fall back to. Ignores +any hint that is supplied. + +### Auto + +See [Auto: heuristics and benchmarks](@ref) for the full decision tree and +the benchmark sweep that produced its crossover constants. + +## Equality search + +The package exposes two `Union{Int, Nothing}`-returning equality routines — +[`findfirstequal`](@ref FindFirstFunctions.findfirstequal) (unsorted SIMD +scan) and [`findfirstsortedequal`](@ref FindFirstFunctions.findfirstsortedequal) +(sorted bisect-then-SIMD scan). They live outside the strategy framework +because their return semantics differ (`nothing` on miss, vs. in-range +index for the positional API). See the [Equality search](@ref) page for the +full documentation. diff --git a/src/FindFirstFunctions.jl b/src/FindFirstFunctions.jl index bb6f4da..2c962c6 100644 --- a/src/FindFirstFunctions.jl +++ b/src/FindFirstFunctions.jl @@ -1,457 +1,39 @@ module FindFirstFunctions -# https://github.com/JuliaLang/julia/pull/53687 +# Public API surface for `using FindFirstFunctions`. The strategy types are +# zero-field singletons (except `GuesserHint` and `Auto`, which carry small +# isbits payloads), so exporting them only adds names to the caller's +# namespace — no runtime cost. `searchsortedfirst!` / `searchsortedlast!` +# are FFF-defined names (the non-bang `searchsortedfirst` / +# `searchsortedlast` are extensions of `Base` and are reachable without +# qualification once `Base` is in scope). +export + SearchStrategy, + LinearScan, SIMDLinearScan, BracketGallop, ExpFromLeft, + InterpolationSearch, BitInterpolationSearch, + BinaryBracket, BisectThenSIMD, + GuesserHint, Auto, + SearchProperties, + Guesser, looks_linear, + searchsortedfirst!, searchsortedlast!, searchsortedrange, + findequal, findfirstequal, findfirstsortedequal + +# Julia 1.12 changed how `Ptr{T}` arguments to `Base.llvmcall` are passed: +# they're now real pointers rather than i64s. https://github.com/JuliaLang/julia/pull/53687 const USE_PTR = VERSION >= v"1.12.0-DEV.255" -const FFE_IR = """ -declare i8 @llvm.cttz.i8(i8, i1); -define i64 @entry(i64 %0, $(USE_PTR ? "ptr" : "i64") %1, i64 %2) #0 { -top: - $(USE_PTR ? "" : "%ivars = inttoptr i64 %1 to i64*") - %btmp = insertelement <8 x i64> undef, i64 %0, i64 0 - %var = shufflevector <8 x i64> %btmp, <8 x i64> undef, <8 x i32> zeroinitializer - %lenm7 = add nsw i64 %2, -7 - %dosimditer = icmp ugt i64 %2, 7 - br i1 %dosimditer, label %L9.lr.ph, label %L32 -L9.lr.ph: - %len8 = and i64 %2, 9223372036854775800 - br label %L9 - -L9: - %i = phi i64 [ 0, %L9.lr.ph ], [ %vinc, %L30 ] - %ivarsi = getelementptr inbounds i64, $(USE_PTR ? "ptr %1" : "i64* %ivars"), i64 %i - $(USE_PTR ? "" : "%vpvi = bitcast i64* %ivarsi to <8 x i64>*") - %v = load <8 x i64>, $(USE_PTR ? "ptr %ivarsi" : "<8 x i64> * %vpvi"), align 8 - %m = icmp eq <8 x i64> %v, %var - %mu = bitcast <8 x i1> %m to i8 - %matchnotfound = icmp eq i8 %mu, 0 - br i1 %matchnotfound, label %L30, label %L17 - -L17: - %tz8 = call i8 @llvm.cttz.i8(i8 %mu, i1 true) - %tz64 = zext i8 %tz8 to i64 - %vis = add nuw i64 %i, %tz64 - br label %common.ret - -common.ret: - %retval = phi i64 [ %vis, %L17 ], [ -1, %L32 ], [ %si, %L51 ], [ -1, %L67 ] - ret i64 %retval - -L30: - %vinc = add nuw nsw i64 %i, 8 - %continue = icmp slt i64 %vinc, %lenm7 - br i1 %continue, label %L9, label %L32 - -L32: - %cumi = phi i64 [ 0, %top ], [ %len8, %L30 ] - %done = icmp eq i64 %cumi, %2 - br i1 %done, label %common.ret, label %L51 - -L51: - %si = phi i64 [ %inc, %L67 ], [ %cumi, %L32 ] - %spi = getelementptr inbounds i64, $(USE_PTR ? "ptr %1" : "i64* %ivars"), i64 %si - %svi = load i64, $(USE_PTR ? "ptr" : "i64*") %spi, align 8 - %match = icmp eq i64 %svi, %0 - br i1 %match, label %common.ret, label %L67 - -L67: - %inc = add i64 %si, 1 - %dobreak = icmp eq i64 %inc, %2 - br i1 %dobreak, label %common.ret, label %L51 - -} -attributes #0 = { alwaysinline } -""" - -function _findfirstequal(vpivot::Int64, ptr::Ptr{Int64}, len::Int64) - return Base.llvmcall( - (FFE_IR, "entry"), - Int64, - Tuple{Int64, Ptr{Int64}, Int64}, - vpivot, - ptr, - len - ) -end - -""" - findfirstequal(x::Int64,A::DenseVector{Int64}) - -Finds the first index in `A` where the value equals `x`. -""" -findfirstequal(vpivot, ivars) = findfirst(isequal(vpivot), ivars) -function findfirstequal(vpivot::Int64, ivars::DenseVector{Int64}) - GC.@preserve ivars begin - ret = _findfirstequal(vpivot, pointer(ivars), length(ivars)) - end - return ret < 0 ? nothing : ret + 1 -end - -""" -findfirstsortedequal(vars::DenseVector{Int64}, var::Int64)::Union{Int64,Nothing} - -Note that this differs from `searchsortedfirst` by returning `nothing` when absent. -""" -function findfirstsortedequal( - var::Int64, - vars::DenseVector{Int64}, - ::Val{basecase} = Base.libllvm_version >= v"17" ? Val(8) : Val(128) - ) where {basecase} - len = length(vars) - offset = 0 - @inbounds while len > basecase - half = len >>> 1 # half on left, len - half on right - if Base.libllvm_version >= v"17" - # TODO: check if this works - # I'm worried the `!unpredictable` metadata will be stripped - offset = Base.llvmcall( - ( - """ - define i64 @entry(i8 %0, i64 %1, i64 %2) #0 { - top: - %b = trunc i8 %0 to i1 - %s = select i1 %b, i64 %1, i64 %2, !unpredictable !0 - ret i64 %s - } - attributes #0 = { alwaysinline } - !0 = !{} - """, - "entry", - ), - Int64, - Tuple{Bool, Int64, Int64}, - vars[offset + half + 1] <= var, - half + offset, - offset - ) - else - offset = ifelse(vars[offset + half + 1] <= var, half + offset, offset) - end - len = len - half - end - # maybe occurs in vars[offset+1:offset+len] - GC.@preserve vars begin - ret = _findfirstequal(var, pointer(vars) + 8offset, len) - end - # return ret - return ret < 0 ? nothing : ret + offset + 1 -end - -""" - bracketstrictlymontonic(v, x, guess; lt=, by=, rev=false) - -Starting from an initial `guess` index, find indices `(lo, hi)` such that `v[lo] ≤ x ≤ v[hi]` according to the specified order, assuming that `x` is actually within the range of -values found in `v`. If `x` is outside that range, either `lo` will be `firstindex(v)` or -`hi` will be `lastindex(v)`. - -Note that the results will not typically satisfy `lo ≤ guess ≤ hi`. If `x` is precisely -equal to a value that is not unique in the input `v`, there is no guarantee that `(lo, hi)` -will encompass *all* indices corresponding to that value. - -This algorithm is essentially an expanding binary search, which can be used as a precursor -to `searchsorted` and related functions, which can take `lo` and `hi` as arguments. The -purpose of using this function first would be to accelerate convergence in those functions -by using correlated `guess`es for repeated calls. The best `guess` for the next call of -this function would be the index returned by the previous call to `searchsorted`. - -See `Base.sort!` for an explanation of the keyword arguments `by`, `lt` and `rev`. -""" -function bracketstrictlymontonic( - v::AbstractVector, - x, - guess::T, - o::Base.Order.Ordering - )::NTuple{2, keytype(v)} where {T <: Integer} - bottom = firstindex(v) - top = lastindex(v) - if guess < bottom || guess > top - return bottom, top - # # NOTE: for cache efficiency in repeated calls, we avoid accessing the first and last elements of `v` - # # on each call to this function. This should only result in significant slow downs for calls with - # # out-of-bounds values of `x` *and* bad `guess`es. - # elseif lt(o, x, v[bottom]) - # return bottom, bottom - # elseif lt(o, v[top], x) - # return top, top - else - u = T(1) - lo, hi = guess, min(guess + u, top) - @inbounds if Base.Order.lt(o, x, v[lo]) - while lo > bottom && Base.Order.lt(o, x, v[lo]) - lo, hi = max(bottom, lo - u), lo - u += u - end - else - while hi < top && !Base.Order.lt(o, x, v[hi]) - lo, hi = hi, min(top, hi + u) - u += u - end - end - end - return lo, hi -end - -""" - looks_linear(v; threshold = 1e-2) - -Determine if the abscissae `v` are regularly distributed, taking the standard deviation of -the difference between the array of abscissae with respect to the straight line linking -its first and last elements, normalized by the range of `v`. If this standard deviation is -below the given `threshold`, the vector looks linear (return true). Internal function - -interface may change. -""" -function looks_linear(v; threshold = 1.0e-2) - length(v) <= 2 && return true - x_0, x_f = first(v), last(v) - N = length(v) - x_span = x_f - x_0 - mean_x_dist = x_span / (N - 1) - norm_var = sum((x_i - x_0 - (i - 1) * mean_x_dist)^2 for (i, x_i) in enumerate(v)) / - (N * x_span^2) - return norm_var < threshold^2 -end - -""" - Guesser(v::AbstractVector; looks_linear_threshold = 1e-2) - -Wrapper of the searched vector `v` which makes an informed guess -for `searchsorted*correlated` by either - - - Exploiting that `v` is sufficiently evenly spaced - - Using the previous outcome of `searchsorted*correlated` -""" -struct Guesser{T <: AbstractVector} - v::T - idx_prev::Base.RefValue{Int} - linear_lookup::Bool -end - -function Guesser(v::AbstractVector; looks_linear_threshold = 1.0e-2) - return Guesser(v, Ref(1), looks_linear(v; threshold = looks_linear_threshold)) -end - -function (g::Guesser)(x) - (; v, idx_prev, linear_lookup) = g - return if linear_lookup - δx = x - first(v) - iszero(δx) && return firstindex(v) - f = δx / (last(v) - first(v)) - if isinf(f) - f > 0 ? lastindex(v) : firstindex(v) - else - i_0, i_f = firstindex(v), lastindex(v) - i_approx = f * (i_f - i_0) + i_0 - target_type = typeof(firstindex(v)) - if i_approx >= typemax(target_type) - lastindex(v) + 1 - elseif i_approx <= typemin(target_type) - firstindex(v) - 1 - else - round(target_type, i_approx) - end - end - else - idx_prev[] - end -end - -""" - searchsortedfirstcorrelated(v::AbstractVector, x, guess; order=Base.Order.Forward) - -An accelerated `findfirst` on sorted vectors using a bracketed search. Requires a `guess::Union{<:Integer, Guesser}` -to start the search from. - -The `order` keyword argument specifies the ordering of the vector `v`, defaulting to `Base.Order.Forward`. -""" -function searchsortedfirstcorrelated( - v::AbstractVector, - x, - guess::T; - order::Base.Order.Ordering = Base.Order.Forward - ) where {T <: Integer} - lo, hi = bracketstrictlymontonic(v, x, guess, order) - return searchsortedfirst(v, x, lo, hi, order) -end - -""" - searchsortedlastcorrelated(v::AbstractVector{T}, x, guess; order=Base.Order.Forward) - -An accelerated `findlast` on sorted vectors using a bracketed search. Requires a `guess::Union{<:Integer, Guesser}` -to start the search from. - -The `order` keyword argument specifies the ordering of the vector `v`, defaulting to `Base.Order.Forward`. -""" -function searchsortedlastcorrelated( - v::AbstractVector, - x, - guess::T; - order::Base.Order.Ordering = Base.Order.Forward - ) where {T <: Integer} - lo, hi = bracketstrictlymontonic(v, x, guess, order) - return searchsortedlast(v, x, lo, hi, order) -end - -searchsortedfirstcorrelated(r::AbstractRange, x, ::Integer) = searchsortedfirst(r, x) -searchsortedlastcorrelated(r::AbstractRange, x, ::Integer) = searchsortedlast(r, x) - -function searchsortedfirstcorrelated( - v::AbstractVector, - x, - guess::Guesser{T}; - order::Base.Order.Ordering = Base.Order.Forward - ) where {T <: AbstractVector} - @assert v === guess.v - out = searchsortedfirstcorrelated(v, x, guess(x); order = order) - guess.idx_prev[] = out - return out -end - -function searchsortedlastcorrelated( - v::T, - x, - guess::Guesser{T}; - order::Base.Order.Ordering = Base.Order.Forward - ) where {T <: AbstractVector} - @assert v === guess.v - out = searchsortedlastcorrelated(v, x, guess(x); order = order) - guess.idx_prev[] = out - return out -end - -""" - searchsortedfirstexp(v, x, lo=firstindex(v), hi=lastindex(v)) - -Find the first index `i` in sorted vector `v` such that `v[i] >= x`, starting from `lo`. -This uses an exponential search followed by binary search, which is efficient when -the target is expected to be near `lo` (e.g., for correlated sequential lookups). - -Inspired by Interpolations.jl's `searchsortedfirst_exp_left`. -""" -Base.@propagate_inbounds function searchsortedfirstexp( - v::AbstractVector, - x, - lo::Integer = firstindex(v), - hi::Integer = lastindex(v) - ) - # Linear search for first few elements - for i in 0:4 - ind = lo + i - ind > hi && return ind - x <= v[ind] && return ind - end - # Exponential search with doubling steps - n = 3 - tn2 = 2^n - tn2m1 = 2^(n - 1) - ind = lo + tn2 - while ind <= hi - x <= v[ind] && - return searchsortedfirst(v, x, lo + tn2 - tn2m1, ind, Base.Order.Forward) - tn2 *= 2 - tn2m1 *= 2 - ind = lo + tn2 - end - return searchsortedfirst(v, x, lo + tn2 - tn2m1, hi, Base.Order.Forward) -end - -""" - searchsortedlastvec(v::AbstractVector, x::AbstractVector) - -Find the indices for multiple sorted values `x` in sorted vector `v` efficiently. -If `x` is sorted, this leverages monotonicity to avoid redundant searching. -Returns indices such that `v[out[i]] <= x[i]` (like `searchsortedlast`). - -If `x` is not sorted, falls back to element-wise `searchsortedlast`. - -Inspired by Interpolations.jl's `searchsortedfirst_vec`. -""" -function searchsortedlastvec(v::AbstractVector, x::AbstractVector) - issorted(x) || return searchsortedlast.(Ref(v), x) - out = Vector{Int}(undef, length(x)) - lo = firstindex(v) - hi = lastindex(v) - @inbounds for i in eachindex(x) - xx = x[i] - y = searchsortedfirstexp(v, xx, lo, hi) - # searchsortedfirstexp returns index of first element >= x - # searchsortedlast returns index of last element <= x - # If y > hi, x is larger than all elements, return hi - # If v[y] == x, return y (first >= is also <=) - # If v[y] > x, return y - 1 - if y > hi - out[i] = hi - elseif v[y] == xx - out[i] = y - else - out[i] = y - 1 - end - lo = max(y, lo) - end - return out -end - -""" - searchsortedfirstvec(v::AbstractVector, x::AbstractVector) - -Find the indices for multiple sorted values `x` in sorted vector `v` efficiently. -If `x` is sorted, this leverages monotonicity to avoid redundant searching. -Returns indices such that `v[out[i]] >= x[i]` (like `searchsortedfirst`). - -If `x` is not sorted, falls back to element-wise `searchsortedfirst`. - -Inspired by Interpolations.jl's `searchsortedfirst_vec`. -""" -function searchsortedfirstvec(v::AbstractVector, x::AbstractVector) - issorted(x) || return searchsortedfirst.(Ref(v), x) - out = Vector{Int}(undef, length(x)) - lo = firstindex(v) - hi = lastindex(v) - @inbounds for i in eachindex(x) - xx = x[i] - y = searchsortedfirstexp(v, xx, lo, hi) - out[i] = y - lo = min(y, hi) - end - return out -end - -using PrecompileTools: @compile_workload, @setup_workload - -@setup_workload begin - # Minimal setup for precompilation workload - vec_int64 = Int64[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] - linear_vec = collect(1.0:0.5:10.0) - - @compile_workload begin - # Precompile the most commonly used functions with typical types - - # findfirstequal: fast SIMD-based search in Int64 vectors - findfirstequal(Int64(5), vec_int64) - findfirstequal(Int64(100), vec_int64) # not found case - - # findfirstsortedequal: binary search in sorted Int64 vectors - findfirstsortedequal(Int64(8), vec_int64) - findfirstsortedequal(Int64(100), vec_int64) # not found case - - # bracketstrictlymontonic: bracketing for sorted vectors - bracketstrictlymontonic(vec_int64, Int64(8), Int64(1), Base.Order.Forward) - - # looks_linear: check if vector is evenly spaced - looks_linear(linear_vec) - - # Guesser: wrapper for efficient repeated searches - guesser = Guesser(linear_vec) - guesser(5.0) - - # searchsortedfirstcorrelated and searchsortedlastcorrelated - searchsortedfirstcorrelated(vec_int64, Int64(8), Int64(1)) - searchsortedlastcorrelated(vec_int64, Int64(8), Int64(1)) - - # Also precompile with Guesser - guesser_int = Guesser(vec_int64) - searchsortedfirstcorrelated(vec_int64, Int64(8), guesser_int) - searchsortedlastcorrelated(vec_int64, Int64(8), guesser_int) - end -end +# Source layout. Include order matters — each file may depend on names +# defined in earlier files. See the comment block at the top of each file +# for what lives where. +include("simd_ir.jl") # IR template + per-eltype IR constants + SIMD primitives +include("equality.jl") # findfirstequal + findfirstsortedequal +include("strategies.jl") # SearchStrategy + concrete strategy types + SearchProperties + Auto +include("search_properties.jl") # Linearity / NaN probes + populated SearchProperties constructor +include("dispatch.jl") # Per-strategy searchsortedfirst/last methods + their internal helpers +include("auto.jl") # Auto crossover constants + per-query Auto + Auto's batched helpers +include("batched.jl") # Batched API + searchsortedrange + _batched! (incl Auto specialization) +include("guesser.jl") # looks_linear + Guesser + GuesserHint dispatch +include("findequal.jl") # findequal + BisectThenSIMD shortcut +include("precompile.jl") # PrecompileTools workload end # module FindFirstFunctions diff --git a/src/auto.jl b/src/auto.jl new file mode 100644 index 0000000..aebb1c5 --- /dev/null +++ b/src/auto.jl @@ -0,0 +1,168 @@ +# Auto strategy — heuristic dispatch tree + crossover constants + the +# helpers (`_estimate_avg_gap`, `_auto_simd_*`, `_auto_interp_eligible`) it +# consults. Per-query Auto dispatch lives here; batched Auto dispatch lives +# in `batched.jl` (next to the generic `_batched!` it specializes). + +# Per-query Auto threshold: under this length, the bracket-search bookkeeping +# costs more than a worst-case linear walk. +const _AUTO_LINEAR_THRESHOLD = 16 + +# Batched-Auto crossover: at gap ≤ 4 LinearScan beats ExpFromLeft (its 5 +# initial linear probes are wasted when the gap is already 0 or 1). Above 4, +# the SIMD path picks up (where eligible) up to gap = `_auto_simd_gap_max`. +const _AUTO_BATCH_LINEAR_GAP = 4 + +# For sparse queries (gap large) on long vectors, InterpolationSearch can +# beat ExpFromLeft by ~2× on uniformly-spaced data. The sampled-linearity +# check is O(1) — 9 probes — so it's cheap enough to run inside Auto when +# there's a real chance of unlocking InterpolationSearch. +const _AUTO_INTERP_MIN_GAP = 8 +const _AUTO_INTERP_MIN_N = 1024 +const _AUTO_INTERP_MIN_M = 2 + +# Very-sparse override: when the gap is large enough that ExpFromLeft's +# log₂(gap) doubling levels approach InterpolationSearch's log₂(n) worst-case +# binary refinement, InterpolationSearch's better cache behaviour (one +# extrapolation jump + local refine vs. many doubling probes across the +# array) wins even on non-strictly-linear data. +const _AUTO_INTERP_LOOSE_GAP = 256 +const _AUTO_LINEAR_LOOSE_TOLERANCE = 5.0e-2 + +# SIMDLinearScan eligibility window. The threshold is eltype-parameterized +# via the `_auto_simd_gap_max` function below. +@inline _auto_simd_gap_max(::DenseVector{Int64}) = 64 +@inline _auto_simd_gap_max(::DenseVector{Float64}) = 64 +@inline _auto_simd_gap_max(::AbstractVector) = 0 # not SIMD-eligible + +# When InterpolationSearch isn't eligible and the gap is large, BracketGallop +# beats ExpFromLeft because the 5 linear probes ExpFromLeft does upfront are +# wasted (no chance the answer is within 5 of `hint = prev_result` when the +# gap is hundreds or thousands). BracketGallop just starts doubling +# immediately. The bench sweep shows this crossover at gap ≈ 16. +const _AUTO_GALLOP_GAP_MIN = 16 + +# Per-query Auto: pick based on hint validity and length(v). +@inline function _auto_pick(v::AbstractVector, hint::Integer) + return if hint < firstindex(v) || hint > lastindex(v) + BinaryBracket() + elseif length(v) <= _AUTO_LINEAR_THRESHOLD + LinearScan() + else + BracketGallop() + end +end + +# Returns `(gap, skewed)`: the estimated average step in `v`'s index space +# between consecutive query results, plus a flag that's true when the +# queries' distribution is non-uniform within their span. +# +# - `gap` is the per-query cost driver. We always use the span-based +# estimate `n * span(queries) / span(v) / m` so that tightly-clustered +# queries (span_q ≈ 0) report gap ≈ 0 regardless of `n/m`. The earlier +# `n / m` fallback for skewed queries caused `SIMDLinearScan` to be +# picked for clustered queries where LinearScan's tiny scalar walk is +# 5× faster. +# - `skewed` is an InterpolationSearch-suitability flag. When the median +# query sits well off the midpoint of `queries[1]..queries[end]`, the +# queries are clustered within their span and the per-call linear +# extrapolation guess is worse than the previous-result hint that +# `ExpFromLeft` would use. +@inline function _estimate_avg_gap( + v::AbstractVector{<:Number}, queries::AbstractVector{<:Number}, m::Integer, + ) + n = length(v) + n <= 1 && return (0, false) + @inbounds span_v = v[end] - v[1] + if iszero(span_v) || !isfinite(span_v) + return (n ÷ max(1, m), false) + end + @inbounds span_q = queries[end] - queries[1] + # Skew detection on small `m` is too noisy — for `m ≈ 4` random uniform + # samples, the median routinely sits 30%+ off the linear midpoint by + # chance. Gate on `m ≥ 10` where the statistical variance is well below + # the 20% threshold. + skewed = false + if m >= 10 + @inbounds mid_q = queries[firstindex(queries) + m ÷ 2] + @inbounds expected_mid = ( + queries[firstindex(queries)] + + queries[lastindex(queries)] + ) / 2 + if !iszero(span_q) && + abs(mid_q - expected_mid) > 0.2 * abs(span_q) + skewed = true + end + end + ratio = span_q / span_v + # Clamp ratio: queries may extend outside v's range (extrapolation). + ratio = clamp(ratio, zero(ratio), one(ratio)) + return (floor(Int, n * ratio / max(1, m)), skewed) +end + +# Non-numeric eltypes: no span subtraction possible, fall back to length +# ratio and assume queries are roughly uniform (no skew detection possible). +@inline _estimate_avg_gap( + v::AbstractVector, ::AbstractVector, m::Integer, +) = (length(v) ÷ max(1, m), false) + +# SIMD eligibility check used by Auto's batched dispatch. The static type +# test on `v` discriminates the `DenseVector{Int64}` / `DenseVector{Float64}` +# cases that SIMDLinearScan supports. For Float64, NaN presence is taken from +# cached `SearchProperties.has_nan` when available; otherwise we assume no +# NaN — Base's positional search doesn't check sortedness either, and the +# burden of supplying populated props is on the caller for pathological +# inputs. +@inline _auto_simd_eligible(v::DenseVector{Int64}, ::SearchProperties) = true +@inline function _auto_simd_eligible(v::DenseVector{Float64}, p::SearchProperties) + return p.has_props ? !p.has_nan : true +end +@inline _auto_simd_eligible(::AbstractVector, ::SearchProperties) = false + +# InterpolationSearch eligibility: two-tier linearity check. For +# `_AUTO_INTERP_MIN_GAP ≤ gap < _AUTO_INTERP_LOOSE_GAP` we require strict +# linearity (`_AUTO_LINEAR_REL_TOLERANCE`, default 0.1%) — InterpolationSearch +# is only worth the per-call cost on truly uniform data when ExpFromLeft is +# also competitive. For `gap ≥ _AUTO_INTERP_LOOSE_GAP` we accept a looser +# tolerance (`_AUTO_LINEAR_LOOSE_TOLERANCE`, default 5%) because the cache +# benefit of one extrapolation jump + local refine compensates for a worse +# guess, but we still reject genuinely nonlinear data (log-spaced, +# two-scale) where InterpolationSearch loses 2–3× to ExpFromLeft. +@inline function _auto_interp_eligible(v, props::SearchProperties, gap::Integer) + if gap >= _AUTO_INTERP_LOOSE_GAP + # Loose probe — even on cached props, the strict `is_linear` bit may + # already reflect a tighter threshold than we need here, so run the + # sampled probe at the loose tolerance regardless of cache state. + return _sampled_looks_linear(v, _AUTO_LINEAR_LOOSE_TOLERANCE) + end + return props.has_props ? props.is_linear : _sampled_looks_linear(v) +end + +# Per-query Auto dispatch. +function Base.searchsortedlast( + ::Auto, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + s = _auto_pick(v, hint) + return s isa BinaryBracket ? + searchsortedlast(s, v, x; order = order) : + searchsortedlast(s, v, x, hint; order = order) +end + +function Base.searchsortedfirst( + ::Auto, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + s = _auto_pick(v, hint) + return s isa BinaryBracket ? + searchsortedfirst(s, v, x; order = order) : + searchsortedfirst(s, v, x, hint; order = order) +end + +Base.searchsortedlast( + ::Auto, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(BinaryBracket(), v, x; order = order) +Base.searchsortedfirst( + ::Auto, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(BinaryBracket(), v, x; order = order) diff --git a/src/batched.jl b/src/batched.jl new file mode 100644 index 0000000..9dbc82f --- /dev/null +++ b/src/batched.jl @@ -0,0 +1,321 @@ +# In-place batched sorted-search API: `searchsortedfirst!` / `searchsortedlast!` +# and `searchsortedrange`, plus the internal `_batched!` dispatchers and the +# Auto-specific specialization that turns Auto's decision tree into a single +# branchless dispatch on a concrete strategy. + +""" + searchsortedlast!(idx_out, v, queries; strategy = Auto(), order = Base.Order.Forward, + queries_sorted = nothing) + +In-place batched [`searchsortedlast`](@ref Base.searchsortedlast). Writes one +index per element of `queries` into `idx_out` (which must be the same length). + +If `queries` is sorted under `order`, the previous result is used as a hint for +the next query, so the total cost is O(length(v) + length(queries)) under +`strategy = BracketGallop()` (the default `Auto` choice for non-tiny `v`). + +If `queries` is not sorted, falls back to per-element `searchsortedlast` with +no hint regardless of `strategy`. + +The `queries_sorted` kwarg controls the runtime `issorted(queries)` check: + + - `nothing` (default): run `issorted(queries; order = order)` on every call. + O(m) bookkeeping, roughly 1 ns/q on long batches. + - `true`: trust the caller — skip the check and take the sorted-loop path + unconditionally. Use this when you already know your queries are sorted + (you computed them as a range, sorted them yourself, etc.). Wrong-answer + risk: a non-sorted `queries` passed with `queries_sorted = true` will + produce incorrect results, since the inner loop uses the previous result + as a hint and that hint becomes invalid when queries jump backward. + - `false`: skip the check and take the unsorted-loop path unconditionally + (per-query unhinted `Base.searchsortedlast`). Use when you know queries + are not sorted and want to avoid the O(m) probe. + +Returns `idx_out`. +""" +function searchsortedlast!( + idx_out::AbstractVector{<:Integer}, + v::AbstractVector, + queries::AbstractVector; + strategy::SearchStrategy = Auto(), + order::Base.Order.Ordering = Base.Order.Forward, + queries_sorted::Union{Nothing, Bool} = nothing, + ) + if length(idx_out) != length(queries) + throw( + DimensionMismatch( + "idx_out and queries must have the same length" + ) + ) + end + return _searchsortedlast_batched!( + idx_out, v, queries, strategy, order, queries_sorted + ) +end + +""" + searchsortedfirst!(idx_out, v, queries; strategy = Auto(), order = Base.Order.Forward, + queries_sorted = nothing) + +In-place batched [`searchsortedfirst`](@ref Base.searchsortedfirst). See +[`searchsortedlast!`](@ref) for behavior and for the `queries_sorted` kwarg. +""" +function searchsortedfirst!( + idx_out::AbstractVector{<:Integer}, + v::AbstractVector, + queries::AbstractVector; + strategy::SearchStrategy = Auto(), + order::Base.Order.Ordering = Base.Order.Forward, + queries_sorted::Union{Nothing, Bool} = nothing, + ) + if length(idx_out) != length(queries) + throw( + DimensionMismatch( + "idx_out and queries must have the same length" + ) + ) + end + return _searchsortedfirst_batched!( + idx_out, v, queries, strategy, order, queries_sorted + ) +end + +# Sorted inner loop, parameterized on strategy. Used by both the generic and +# Auto batched entry points so each batch performs at most one issorted check. +function _searchsortedlast_sorted_loop!( + idx_out, v::AbstractVector, queries::AbstractVector, + strategy::SearchStrategy, order::Base.Order.Ordering, + ) + hint = firstindex(v) - 1 + @inbounds for k in eachindex(queries) + q = queries[k] + hint = if hint < firstindex(v) + searchsortedlast(strategy, v, q; order = order) + else + searchsortedlast(strategy, v, q, hint; order = order) + end + idx_out[k] = hint + end + return idx_out +end + +function _searchsortedfirst_sorted_loop!( + idx_out, v::AbstractVector, queries::AbstractVector, + strategy::SearchStrategy, order::Base.Order.Ordering, + ) + hint = firstindex(v) - 1 + @inbounds for k in eachindex(queries) + q = queries[k] + hint = if hint < firstindex(v) + searchsortedfirst(strategy, v, q; order = order) + else + searchsortedfirst(strategy, v, q, hint; order = order) + end + idx_out[k] = hint + end + return idx_out +end + +function _searchsortedlast_unsorted_loop!( + idx_out, v::AbstractVector, queries::AbstractVector, + order::Base.Order.Ordering, + ) + @inbounds for k in eachindex(queries) + idx_out[k] = searchsortedlast(v, queries[k], order) + end + return idx_out +end + +function _searchsortedfirst_unsorted_loop!( + idx_out, v::AbstractVector, queries::AbstractVector, + order::Base.Order.Ordering, + ) + @inbounds for k in eachindex(queries) + idx_out[k] = searchsortedfirst(v, queries[k], order) + end + return idx_out +end + +# Decide whether to take the sorted-queries fast path. `queries_sorted` is +# the caller-supplied override: `nothing` means "check at runtime", `true` +# means "trust the caller, skip the O(m) issorted probe", `false` means +# "force the unsorted path". +@inline function _take_sorted_path( + queries, order::Base.Order.Ordering, queries_sorted::Union{Nothing, Bool}, + ) + return queries_sorted === nothing ? + issorted(queries; order = order) : queries_sorted +end + +function _searchsortedlast_batched!( + idx_out, v::AbstractVector, queries::AbstractVector, + strategy::SearchStrategy, order::Base.Order.Ordering, + queries_sorted::Union{Nothing, Bool}, + ) + return if _take_sorted_path(queries, order, queries_sorted) + _searchsortedlast_sorted_loop!(idx_out, v, queries, strategy, order) + else + _searchsortedlast_unsorted_loop!(idx_out, v, queries, order) + end +end + +# Specialized batched-Auto: pick an inner strategy from the n/m ratio, then +# call the sorted loop directly (no duplicate `issorted` check, and each +# branch is type-stable so the loop specializes on the concrete strategy). +function _searchsortedlast_batched!( + idx_out, v::AbstractVector, queries::AbstractVector, + s::Auto, order::Base.Order.Ordering, + queries_sorted::Union{Nothing, Bool}, + ) + m = length(queries) + m == 0 && return idx_out + # m == 1: skip the issorted + span heuristic — no batched hint is + # available for a single-element batch, so just dispatch straight to + # the unhinted backing call. Saves ~20 ns of bookkeeping per call. + if m == 1 + @inbounds idx_out[firstindex(idx_out)] = + searchsortedlast(v, queries[firstindex(queries)], order) + return idx_out + end + if !_take_sorted_path(queries, order, queries_sorted) + return _searchsortedlast_unsorted_loop!(idx_out, v, queries, order) + end + gap, skewed = _estimate_avg_gap(v, queries, m) + # Manually dispatch on the picked strategy so each branch is concrete. + if gap <= _AUTO_BATCH_LINEAR_GAP + return _searchsortedlast_sorted_loop!( + idx_out, v, queries, LinearScan(), order + ) + end + # Medium-gap regime: SIMDLinearScan wins on `DenseVector{Int64}` and + # `DenseVector{Float64}` (without NaN). + if gap <= _auto_simd_gap_max(v) && _auto_simd_eligible(v, s.props) + return _searchsortedlast_sorted_loop!( + idx_out, v, queries, SIMDLinearScan(), order + ) + end + # Sparse-on-large-linear: InterpolationSearch wins ~2× over ExpFromLeft + # on uniformly-spaced data — but only when queries are *also* spread + # roughly uniformly within their span. + if !skewed && + gap >= _AUTO_INTERP_MIN_GAP && + length(v) >= _AUTO_INTERP_MIN_N && + m >= _AUTO_INTERP_MIN_M && + _auto_interp_eligible(v, s.props, gap) + return _searchsortedlast_sorted_loop!( + idx_out, v, queries, InterpolationSearch(), order + ) + end + # Sparse fallback: BracketGallop beats ExpFromLeft when the gap is large + # enough that ExpFromLeft's initial 5 linear probes are guaranteed to + # miss. BracketGallop starts doubling from one position past `hint`, so + # it skips the wasted linear preamble. + if gap >= _AUTO_GALLOP_GAP_MIN + return _searchsortedlast_sorted_loop!( + idx_out, v, queries, BracketGallop(), order + ) + end + return _searchsortedlast_sorted_loop!( + idx_out, v, queries, ExpFromLeft(), order + ) +end + +function _searchsortedfirst_batched!( + idx_out, v::AbstractVector, queries::AbstractVector, + strategy::SearchStrategy, order::Base.Order.Ordering, + queries_sorted::Union{Nothing, Bool}, + ) + return if _take_sorted_path(queries, order, queries_sorted) + _searchsortedfirst_sorted_loop!(idx_out, v, queries, strategy, order) + else + _searchsortedfirst_unsorted_loop!(idx_out, v, queries, order) + end +end + +function _searchsortedfirst_batched!( + idx_out, v::AbstractVector, queries::AbstractVector, + s::Auto, order::Base.Order.Ordering, + queries_sorted::Union{Nothing, Bool}, + ) + m = length(queries) + m == 0 && return idx_out + if m == 1 + @inbounds idx_out[firstindex(idx_out)] = + searchsortedfirst(v, queries[firstindex(queries)], order) + return idx_out + end + if !_take_sorted_path(queries, order, queries_sorted) + return _searchsortedfirst_unsorted_loop!(idx_out, v, queries, order) + end + gap, skewed = _estimate_avg_gap(v, queries, m) + if gap <= _AUTO_BATCH_LINEAR_GAP + return _searchsortedfirst_sorted_loop!( + idx_out, v, queries, LinearScan(), order + ) + end + if gap <= _auto_simd_gap_max(v) && _auto_simd_eligible(v, s.props) + return _searchsortedfirst_sorted_loop!( + idx_out, v, queries, SIMDLinearScan(), order + ) + end + if !skewed && + gap >= _AUTO_INTERP_MIN_GAP && + length(v) >= _AUTO_INTERP_MIN_N && + m >= _AUTO_INTERP_MIN_M && + _auto_interp_eligible(v, s.props, gap) + return _searchsortedfirst_sorted_loop!( + idx_out, v, queries, InterpolationSearch(), order + ) + end + if gap >= _AUTO_GALLOP_GAP_MIN + return _searchsortedfirst_sorted_loop!( + idx_out, v, queries, BracketGallop(), order + ) + end + return _searchsortedfirst_sorted_loop!( + idx_out, v, queries, ExpFromLeft(), order + ) +end + +# --------------------------------------------------------------------------- +# Range search through the strategy dispatch +# --------------------------------------------------------------------------- + +""" + searchsortedrange(strategy, v, lo, hi[, hint]; order = Base.Order.Forward) + -> UnitRange{Int} + +Return the index range of all entries `v[i]` satisfying `lo ≤ v[i] ≤ hi` +under `order`. Equivalent to +`searchsortedfirst(strategy, v, lo[, hint]; order) : + searchsortedlast(strategy, v, hi[, hint]; order)` but expressed as a +single call that may share bracket-discovery work between the two +endpoints when the underlying strategy allows it. + +The empty range case (no `v[i]` lies in `[lo, hi]`) returns +`searchsortedfirst(strategy, v, lo) : (searchsortedfirst(strategy, v, lo) - 1)`, +matching `Base.searchsorted(v, lo)` for an absent value. + +When a `hint` is supplied it is used for both endpoint searches. Strategies +that ignore the hint (`BinaryBracket`, `InterpolationSearch`) treat the +hinted form as a pass-through. +""" +@inline function searchsortedrange( + strategy::SearchStrategy, v::AbstractVector, lo, hi; + order::Base.Order.Ordering = Base.Order.Forward, + ) + first_idx = searchsortedfirst(strategy, v, lo; order = order) + last_idx = searchsortedlast(strategy, v, hi; order = order) + return first_idx:last_idx +end + +@inline function searchsortedrange( + strategy::SearchStrategy, v::AbstractVector, lo, hi, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + first_idx = searchsortedfirst(strategy, v, lo, hint; order = order) + last_idx = searchsortedlast( + strategy, v, hi, max(first_idx, hint); order = order + ) + return first_idx:last_idx +end diff --git a/src/dispatch.jl b/src/dispatch.jl new file mode 100644 index 0000000..f2a5575 --- /dev/null +++ b/src/dispatch.jl @@ -0,0 +1,583 @@ +# Per-strategy `Base.searchsortedlast` / `Base.searchsortedfirst` dispatch. +# Each strategy gets one dispatch block plus any internal helpers it needs +# (`bracketstrictlymonotonic*` for `BracketGallop`, `searchsortedfirstexp` +# for `ExpFromLeft`, `_interp_guess` for `InterpolationSearch`, etc.). +# +# `Auto`'s per-query and batched dispatch lives in `auto.jl` / `batched.jl`; +# `GuesserHint`'s lives in `guesser.jl`. + +# =========================================================================== +# Bracket helpers — backing `BracketGallop` +# =========================================================================== + +# Expanding-binary-search bracket around a guess. The `searchsortedlast` +# polarity: when `x == v[guess]`, the answer is `>= guess` (gallop right). +function bracketstrictlymonotonic( + v::AbstractVector, + x, + guess::T, + o::Base.Order.Ordering, + )::NTuple{2, keytype(v)} where {T <: Integer} + bottom = firstindex(v) + top = lastindex(v) + if guess < bottom || guess > top + return bottom, top + else + u = T(1) + lo, hi = guess, min(guess + u, top) + @inbounds if Base.Order.lt(o, x, v[lo]) + while lo > bottom && Base.Order.lt(o, x, v[lo]) + lo, hi = max(bottom, lo - u), lo + u += u + end + else + while hi < top && !Base.Order.lt(o, x, v[hi]) + lo, hi = hi, min(top, hi + u) + u += u + end + end + end + return lo, hi +end + +# Companion to `bracketstrictlymonotonic` for the `searchsortedfirst` +# polarity. Original uses `lt(o, x, v[lo])` (i.e., `x < v[lo]`), which is +# right for `searchsortedlast`: when `x == v[lo]`, the answer is `>= lo` so +# we gallop right. For `searchsortedfirst`, when `x == v[lo]` the answer is +# `<= lo` (look for earlier duplicates) — so we need the inverted polarity +# `lt(o, v[lo], x)`. Without this, BracketGallop.searchsortedfirst returns +# the wrong index when the hint lands on a run of duplicates. +function bracketstrictlymonotonic_first( + v::AbstractVector, + x, + guess::T, + o::Base.Order.Ordering, + )::NTuple{2, keytype(v)} where {T <: Integer} + bottom = firstindex(v) + top = lastindex(v) + if guess < bottom || guess > top + return bottom, top + else + u = T(1) + lo, hi = guess, min(guess + u, top) + @inbounds if !Base.Order.lt(o, v[lo], x) + # v[lo] >= x → answer is <= lo, gallop left. + while lo > bottom && !Base.Order.lt(o, v[lo], x) + lo, hi = max(bottom, lo - u), lo + u += u + end + else + # v[lo] < x → answer is > lo, gallop right. + while hi < top && Base.Order.lt(o, v[hi], x) + lo, hi = hi, min(top, hi + u) + u += u + end + end + end + return lo, hi +end + +# =========================================================================== +# Exponential-search helper — backing `ExpFromLeft` +# =========================================================================== + +# Exponential search forward from `lo`, then bounded binary search inside the +# final bracket. Used internally by `ExpFromLeft`. +Base.@propagate_inbounds function searchsortedfirstexp( + v::AbstractVector, + x, + lo::Integer = firstindex(v), + hi::Integer = lastindex(v), + ) + # Linear search for first few elements + for i in 0:4 + ind = lo + i + ind > hi && return ind + x <= v[ind] && return ind + end + # Exponential search with doubling steps + n = 3 + tn2 = 2^n + tn2m1 = 2^(n - 1) + ind = lo + tn2 + while ind <= hi + x <= v[ind] && + return searchsortedfirst(v, x, lo + tn2 - tn2m1, ind, Base.Order.Forward) + tn2 *= 2 + tn2m1 *= 2 + ind = lo + tn2 + end + return searchsortedfirst(v, x, lo + tn2 - tn2m1, hi, Base.Order.Forward) +end + +# =========================================================================== +# Strategy: BinaryBracket — ignore any hint, delegate to `Base` +# =========================================================================== + +Base.searchsortedlast( + ::BinaryBracket, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(v, x, order) +Base.searchsortedfirst( + ::BinaryBracket, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(v, x, order) +Base.searchsortedlast( + s::BinaryBracket, v::AbstractVector, x, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(s, v, x; order = order) +Base.searchsortedfirst( + s::BinaryBracket, v::AbstractVector, x, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(s, v, x; order = order) + +# =========================================================================== +# Strategy: LinearScan — walk ±1 from the hint +# =========================================================================== + +function Base.searchsortedlast( + ::LinearScan, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + lo, hi = firstindex(v), lastindex(v) + if hi < lo + return lo - 1 # empty vector + end + i = clamp(hint, lo, hi) + @inbounds if Base.Order.lt(order, x, v[i]) + # v[i] > x → retreat + while i > lo + i -= 1 + !Base.Order.lt(order, x, v[i]) && return i + end + return lo - 1 # x precedes all of v + else + # v[i] ≤ x → try to advance + while i < hi + Base.Order.lt(order, x, v[i + 1]) && return i + i += 1 + end + return hi + end +end + +function Base.searchsortedfirst( + ::LinearScan, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + lo, hi = firstindex(v), lastindex(v) + if hi < lo + return lo + end + i = clamp(hint, lo, hi) + @inbounds if Base.Order.lt(order, v[i], x) + # v[i] < x → advance + while i < hi + i += 1 + !Base.Order.lt(order, v[i], x) && return i + end + return hi + 1 # x exceeds all of v + else + # v[i] ≥ x → try to retreat + while i > lo + !Base.Order.lt(order, v[i - 1], x) && (i -= 1; continue) + return i + end + return lo + end +end + +# LinearScan without a hint falls back to BinaryBracket. +Base.searchsortedlast( + s::LinearScan, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(BinaryBracket(), v, x; order = order) +Base.searchsortedfirst( + s::LinearScan, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(BinaryBracket(), v, x; order = order) + +# =========================================================================== +# Strategy: SIMDLinearScan — specialized forward walk for DenseVector{Int64} +# and DenseVector{Float64}. Backward walks reuse the scalar LinearScan path +# (rare from a good hint, and the SIMD primitive only exists for the +# forward-scan direction). +# =========================================================================== + +@inline function _simdscan_last_specialized( + v::Union{DenseVector{Int64}, DenseVector{Float64}}, x, hint::Integer, + ) + lo = firstindex(v) + hi = lastindex(v) + hi < lo && return lo - 1 + i = clamp(hint, lo, hi) + @inbounds vi = v[i] + if vi > x + # Backward walk (scalar). + while i > lo + i -= 1 + @inbounds v[i] <= x && return i + end + return lo - 1 + end + i == hi && return hi + start = i + 1 + len = hi - start + 1 + offset = GC.@preserve v _simd_first_gt(x, pointer(v, start), Int64(len)) + return offset < 0 ? hi : (start + offset) - 1 +end + +@inline function _simdscan_first_specialized( + v::Union{DenseVector{Int64}, DenseVector{Float64}}, x, hint::Integer, + ) + lo = firstindex(v) + hi = lastindex(v) + hi < lo && return lo + i = clamp(hint, lo, hi) + @inbounds vi = v[i] + if vi < x + i == hi && return hi + 1 + start = i + 1 + len = hi - start + 1 + offset = GC.@preserve v _simd_first_ge(x, pointer(v, start), Int64(len)) + return offset < 0 ? hi + 1 : start + offset + end + while i > lo + @inbounds v[i - 1] >= x && (i -= 1; continue) + return i + end + return lo +end + +function Base.searchsortedlast( + ::SIMDLinearScan, v::DenseVector{Int64}, x::Int64, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + order === Base.Order.Forward || + return searchsortedlast(LinearScan(), v, x, hint; order = order) + return _simdscan_last_specialized(v, x, hint) +end +function Base.searchsortedlast( + ::SIMDLinearScan, v::DenseVector{Float64}, x::Float64, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + order === Base.Order.Forward || + return searchsortedlast(LinearScan(), v, x, hint; order = order) + return _simdscan_last_specialized(v, x, hint) +end +function Base.searchsortedfirst( + ::SIMDLinearScan, v::DenseVector{Int64}, x::Int64, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + order === Base.Order.Forward || + return searchsortedfirst(LinearScan(), v, x, hint; order = order) + return _simdscan_first_specialized(v, x, hint) +end +function Base.searchsortedfirst( + ::SIMDLinearScan, v::DenseVector{Float64}, x::Float64, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + order === Base.Order.Forward || + return searchsortedfirst(LinearScan(), v, x, hint; order = order) + return _simdscan_first_specialized(v, x, hint) +end + +# Other eltypes fall back to the scalar LinearScan walk. +Base.searchsortedlast( + ::SIMDLinearScan, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(LinearScan(), v, x, hint; order = order) +Base.searchsortedfirst( + ::SIMDLinearScan, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(LinearScan(), v, x, hint; order = order) + +# No hint → BinaryBracket. +Base.searchsortedlast( + ::SIMDLinearScan, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(BinaryBracket(), v, x; order = order) +Base.searchsortedfirst( + ::SIMDLinearScan, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(BinaryBracket(), v, x; order = order) + +# =========================================================================== +# Strategy: BracketGallop — bracketstrictlymonotonic + bounded binary search +# =========================================================================== + +function Base.searchsortedlast( + ::BracketGallop, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + lo, hi = bracketstrictlymonotonic(v, x, hint, order) + return searchsortedlast(v, x, lo, hi, order) +end + +function Base.searchsortedfirst( + ::BracketGallop, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + lo, hi = bracketstrictlymonotonic_first(v, x, hint, order) + return searchsortedfirst(v, x, lo, hi, order) +end + +# BracketGallop without a hint falls back to BinaryBracket. +Base.searchsortedlast( + ::BracketGallop, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(BinaryBracket(), v, x; order = order) +Base.searchsortedfirst( + ::BracketGallop, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(BinaryBracket(), v, x; order = order) + +# =========================================================================== +# Strategy: ExpFromLeft — galloping forward from a left-bound hint. +# +# Contract: callers pass `hint` such that the answer is ≥ `hint`. When that +# isn't true (hint is past the answer), we fall back to a full +# `searchsortedlast`/`searchsortedfirst` — the batched-sorted loop sets +# `hint = prev_result`, which always satisfies this for sorted queries, so +# the fallback is only exercised by arbitrary single-query callers. +# =========================================================================== + +function Base.searchsortedfirst( + ::ExpFromLeft, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + lo = firstindex(v) + hi = lastindex(v) + if isempty(v) + return lo + end + h = clamp(hint, lo, hi) + # `searchsortedfirst` semantics: smallest i with v[i] >= x. We can only + # gallop forward from `h` when v[h] < x (then the answer is strictly + # > h). When v[h] >= x, the first occurrence of `x` may be at index + # ≤ h (duplicates to the left), so fall back to a full search rather + # than risk skipping past earlier duplicates. + @inbounds if !Base.Order.lt(order, v[h], x) + return searchsortedfirst(v, x, order) + end + return order === Base.Order.Forward ? + searchsortedfirstexp(v, x, h, hi) : + searchsortedfirst(v, x, h, hi, order) +end + +function Base.searchsortedlast( + ::ExpFromLeft, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + lo = firstindex(v) + hi = lastindex(v) + if isempty(v) + return lo - 1 + end + h = clamp(hint, lo, hi) + @inbounds if Base.Order.lt(order, x, v[h]) + return searchsortedlast(v, x, order) + end + if order === Base.Order.Forward + y = searchsortedfirstexp(v, x, h, hi) + return if y > hi + hi + else + @inbounds v[y] == x ? y : y - 1 + end + else + return searchsortedlast(v, x, h, hi, order) + end +end + +# ExpFromLeft without a hint falls back to BinaryBracket. +Base.searchsortedlast( + ::ExpFromLeft, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(BinaryBracket(), v, x; order = order) +Base.searchsortedfirst( + ::ExpFromLeft, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(BinaryBracket(), v, x; order = order) + +# =========================================================================== +# Strategy: InterpolationSearch — extrapolate a guess, then bounded binary +# search around it. +# =========================================================================== + +@inline function _interp_guess(v::AbstractVector, x, lo::Integer, hi::Integer) + @inbounds vlo = v[lo] + @inbounds vhi = v[hi] + span = vhi - vlo + iszero(span) && return lo + # Linear extrapolation: how far is x along [vlo, vhi]? + f = (x - vlo) / span + if !isfinite(f) + return f > 0 ? hi : lo + end + g = lo + round(Int, f * (hi - lo)) + return clamp(g, lo, hi) +end + +function Base.searchsortedlast( + ::InterpolationSearch, v::AbstractVector{<:Number}, x::Number; + order::Base.Order.Ordering = Base.Order.Forward, + ) + if order !== Base.Order.Forward + # Linear interpolation doesn't carry over to reverse order; fall back + return searchsortedlast(v, x, order) + end + lo, hi = firstindex(v), lastindex(v) + hi < lo && return lo - 1 + g = _interp_guess(v, x, lo, hi) + return searchsortedlast(BracketGallop(), v, x, g; order = order) +end + +function Base.searchsortedfirst( + ::InterpolationSearch, v::AbstractVector{<:Number}, x::Number; + order::Base.Order.Ordering = Base.Order.Forward, + ) + if order !== Base.Order.Forward + return searchsortedfirst(v, x, order) + end + lo, hi = firstindex(v), lastindex(v) + hi < lo && return lo + g = _interp_guess(v, x, lo, hi) + return searchsortedfirst(BracketGallop(), v, x, g; order = order) +end + +# InterpolationSearch ignores any hint; pass-through. +Base.searchsortedlast( + s::InterpolationSearch, v::AbstractVector{<:Number}, x::Number, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(s, v, x; order = order) +Base.searchsortedfirst( + s::InterpolationSearch, v::AbstractVector{<:Number}, x::Number, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(s, v, x; order = order) + +# InterpolationSearch on non-numeric data falls back to BinaryBracket. +Base.searchsortedlast( + ::InterpolationSearch, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(BinaryBracket(), v, x; order = order) +Base.searchsortedfirst( + ::InterpolationSearch, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(BinaryBracket(), v, x; order = order) +Base.searchsortedlast( + s::InterpolationSearch, v::AbstractVector, x, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(BinaryBracket(), v, x; order = order) +Base.searchsortedfirst( + s::InterpolationSearch, v::AbstractVector, x, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(BinaryBracket(), v, x; order = order) + +# =========================================================================== +# Strategy: BitInterpolationSearch — InterpolationSearch on the IEEE bit +# pattern of positive Float64. Cheaper than reinterpret-as-array because we +# only need two endpoint reads and one query bitcast per call. +# =========================================================================== + +@inline function _bit_interp_guess_f64( + v::DenseVector{Float64}, x::Float64, lo::Integer, hi::Integer, + ) + @inbounds vlo_bits = reinterpret(UInt64, v[lo]) + @inbounds vhi_bits = reinterpret(UInt64, v[hi]) + xu = reinterpret(UInt64, x) + span = vhi_bits - vlo_bits + iszero(span) && return lo + if xu <= vlo_bits + return lo + elseif xu >= vhi_bits + return hi + end + num = xu - vlo_bits + f = Float64(num) / Float64(span) + g = lo + round(Int, f * (hi - lo)) + return clamp(g, lo, hi) +end + +function Base.searchsortedlast( + ::BitInterpolationSearch, v::DenseVector{Float64}, x::Float64; + order::Base.Order.Ordering = Base.Order.Forward, + ) + if order !== Base.Order.Forward + return searchsortedlast(v, x, order) + end + lo, hi = firstindex(v), lastindex(v) + hi < lo && return lo - 1 + @inbounds if v[lo] <= 0.0 || !isfinite(v[lo]) || x <= 0.0 || !isfinite(x) + return searchsortedlast(BinaryBracket(), v, x; order = order) + end + g = _bit_interp_guess_f64(v, x, lo, hi) + return searchsortedlast(BracketGallop(), v, x, g; order = order) +end + +function Base.searchsortedfirst( + ::BitInterpolationSearch, v::DenseVector{Float64}, x::Float64; + order::Base.Order.Ordering = Base.Order.Forward, + ) + if order !== Base.Order.Forward + return searchsortedfirst(v, x, order) + end + lo, hi = firstindex(v), lastindex(v) + hi < lo && return lo + @inbounds if v[lo] <= 0.0 || !isfinite(v[lo]) || x <= 0.0 || !isfinite(x) + return searchsortedfirst(BinaryBracket(), v, x; order = order) + end + g = _bit_interp_guess_f64(v, x, lo, hi) + return searchsortedfirst(BracketGallop(), v, x, g; order = order) +end + +# Hint pass-through (bit-interp ignores externally-supplied hints). +Base.searchsortedlast( + s::BitInterpolationSearch, v::DenseVector{Float64}, x::Float64, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(s, v, x; order = order) +Base.searchsortedfirst( + s::BitInterpolationSearch, v::DenseVector{Float64}, x::Float64, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(s, v, x; order = order) + +# Non-Float64 / non-dense eltypes: fall back to plain InterpolationSearch. +Base.searchsortedlast( + ::BitInterpolationSearch, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(InterpolationSearch(), v, x; order = order) +Base.searchsortedfirst( + ::BitInterpolationSearch, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(InterpolationSearch(), v, x; order = order) +Base.searchsortedlast( + ::BitInterpolationSearch, v::AbstractVector, x, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(InterpolationSearch(), v, x; order = order) +Base.searchsortedfirst( + ::BitInterpolationSearch, v::AbstractVector, x, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(InterpolationSearch(), v, x; order = order) + +# =========================================================================== +# Strategy: BisectThenSIMD — equality-search; positional dispatch falls back +# to BinaryBracket. (The `findequal(BisectThenSIMD(), v, x)` shortcut lives +# in `findequal.jl`.) +# =========================================================================== + +Base.searchsortedlast( + ::BisectThenSIMD, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(BinaryBracket(), v, x; order = order) +Base.searchsortedfirst( + ::BisectThenSIMD, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(BinaryBracket(), v, x; order = order) +Base.searchsortedlast( + s::BisectThenSIMD, v::AbstractVector, x, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(BinaryBracket(), v, x; order = order) +Base.searchsortedfirst( + s::BisectThenSIMD, v::AbstractVector, x, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(BinaryBracket(), v, x; order = order) diff --git a/src/equality.jl b/src/equality.jl new file mode 100644 index 0000000..14feb03 --- /dev/null +++ b/src/equality.jl @@ -0,0 +1,73 @@ +# Equality-search public surface. `findfirstequal` handles unsorted vectors +# (with a SIMD specialization for `DenseVector{Int64}`); `findfirstsortedequal` +# handles sorted vectors via bisect-then-SIMD. The strategy-framework +# wrapper `findequal(strategy, v, x)` lives in `findequal.jl`. + +""" + findfirstequal(x, A) -> Union{Int, Nothing} + +Find the first index in `A` where the value equals `x`. Returns `nothing` +if `x` does not occur in `A`. + +This function does **not** assume `A` is sorted. For sorted vectors, see +[`findfirstsortedequal`](@ref) (a bisect-then-SIMD specialization on +`DenseVector{Int64}`) or [`findequal`](@ref) (the strategy-framework +equality wrapper that returns an `Int` with a sentinel). + +The `(x::Int64, A::DenseVector{Int64})` method uses a custom LLVM IR SIMD +scan (load 8 lanes, `icmp eq`, `cttz` on the mask) — about 8× faster than +the scalar `findfirst(==(x), v)` on modern x86-64. Every other element-type +and array-storage combination falls back to `findfirst(isequal(x), A)`. +""" +findfirstequal(vpivot, ivars) = findfirst(isequal(vpivot), ivars) +function findfirstequal(vpivot::Int64, ivars::DenseVector{Int64}) + GC.@preserve ivars begin + ret = _findfirstequal(vpivot, pointer(ivars), length(ivars)) + end + return ret < 0 ? nothing : ret + 1 +end + +""" + findfirstsortedequal(var::Int64, vars::DenseVector{Int64}) -> Union{Int64, Nothing} + +Find the index of the first occurrence of `var` in the sorted vector +`vars`. Returns `nothing` if `var` does not occur. Specialized for +`DenseVector{Int64}` via a branchless binary bisection down to a small +basecase, followed by the same SIMD equality scan that backs +[`findfirstequal`](@ref) — faster than plain `findfirst(==(var), vars)` +or `searchsortedfirst` + post-check for typical Int64 vectors. + +The strategy-framework equivalent is +[`findequal(BisectThenSIMD(), vars, var)`](@ref findequal); that wrapper +returns an `Int` with a sentinel (`firstindex(v) - 1`) for "not found", +which is type-stable and composes with the rest of the strategy +dispatch. Prefer `findequal` for new code; `findfirstsortedequal` remains +as the dedicated `Union{Int64, Nothing}`-returning name. +""" +function findfirstsortedequal( + var::Int64, + vars::DenseVector{Int64}, + ::Val{basecase} = Base.libllvm_version >= v"17" ? Val(8) : Val(128), + ) where {basecase} + len = length(vars) + offset = 0 + @inbounds while len > basecase + # Bisect with the predicate `vars[mid] < var` (strict). When true, + # `var` is past the midpoint — drop the left half *and* the + # midpoint itself. When false (`vars[mid] >= var`), `var` may be + # at the midpoint, so keep `offset` and shrink the window to + # `vars[offset+1 .. offset+half+1]` (inclusive of the midpoint). + # The earlier `<=` predicate would have advanced past matching + # midpoints, masking earlier duplicates of `var`. + half = len >>> 1 + mid = offset + half + 1 + is_left_strictly_less = vars[mid] < var + offset = ifelse(is_left_strictly_less, offset + half + 1, offset) + len = ifelse(is_left_strictly_less, len - half - 1, half + 1) + end + # maybe occurs in vars[offset+1:offset+len] + GC.@preserve vars begin + ret = _findfirstequal(var, pointer(vars) + 8offset, len) + end + return ret < 0 ? nothing : ret + offset + 1 +end diff --git a/src/findequal.jl b/src/findequal.jl new file mode 100644 index 0000000..c7a9807 --- /dev/null +++ b/src/findequal.jl @@ -0,0 +1,91 @@ +# Strategy-framework equality search: `findequal(strategy, v, x[, hint])`. +# Returns an `Int` with the sentinel `firstindex(v) - 1` for "not found" +# (matching `Base.searchsortedlast`'s convention for "x precedes all of v"). +# The `BisectThenSIMD` shortcut for `DenseVector{Int64}` dispatches into +# `findfirstsortedequal` directly; every other strategy composes via +# `searchsortedfirst` + a post-check. + +""" + findequal(strategy, v, x[, hint]; order = Base.Order.Forward) -> Int + +Return the index of `x` in sorted `v` if present, or the sentinel +`firstindex(v) - 1` if `x` is absent. Type-stable `Int` return — the +sentinel matches the convention `Base.searchsortedlast` already uses for +"x precedes all of v", so callers can test for "not found" with +`i < firstindex(v)`. + +For vectors with 1-based indexing (the Julia default), the sentinel is +exactly `0`, which is also `searchsortedlast`'s "x precedes all" return. +For [OffsetArrays](https://github.com/JuliaArrays/OffsetArrays.jl) and any +other vector whose `firstindex` is not `1`, the sentinel adjusts +accordingly — e.g. for a vector with `firstindex == -3`, the sentinel is +`-4`. Always test against `firstindex(v) - 1` (or equivalently +`i < firstindex(v)`), not against the literal `0`. + +Most strategies are handled generically: run +`searchsortedfirst(strategy, v, x[, hint])` to find the candidate insertion +point, then check whether `v[i]` actually equals `x`. The shortcut method +on [`BisectThenSIMD`](@ref) for `DenseVector{Int64}` skips the +`searchsortedfirst` path entirely and uses the dedicated bisect-then-SIMD +equality scan that backs [`findfirstsortedequal`](@ref). + +For unsorted vectors, use [`findfirstequal`](@ref) — it does not require +a sorted input and falls outside the strategy framework. +""" +@inline function findequal( + strategy::SearchStrategy, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, + ) + return _findequal_generic(strategy, v, x, order) +end + +@inline function findequal( + strategy::SearchStrategy, v::AbstractVector, x, hint::Integer; + order::Base.Order.Ordering = Base.Order.Forward, + ) + i = searchsortedfirst(strategy, v, x, hint; order = order) + return _findequal_postcheck(v, x, i) +end + +@inline function _findequal_generic(strategy, v, x, order) + i = searchsortedfirst(strategy, v, x; order = order) + return _findequal_postcheck(v, x, i) +end + +@inline function _findequal_postcheck(v::AbstractVector, x, i::Integer) + if i > lastindex(v) + return firstindex(v) - 1 + end + @inbounds return isequal(v[i], x) ? Int(i) : (firstindex(v) - 1) +end + +# Shortcut: BisectThenSIMD on DenseVector{Int64} uses the dedicated bisect- +# then-SIMD equality scan (same algorithm as `findfirstsortedequal`). +function findequal( + ::BisectThenSIMD, v::DenseVector{Int64}, x::Int64; + order::Base.Order.Ordering = Base.Order.Forward, + ) + if order !== Base.Order.Forward + return _findequal_generic(BinaryBracket(), v, x, order) + end + r = findfirstsortedequal(x, v) + return r === nothing ? (firstindex(v) - 1) : r +end +# Hinted form ignores the hint — the bisect-then-SIMD algorithm does not +# benefit from a hint, and probing it would only waste cycles. +findequal( + s::BisectThenSIMD, v::DenseVector{Int64}, x::Int64, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = findequal(s, v, x; order = order) + +# Non-Int64 fallback for BisectThenSIMD: use BinaryBracket + post-check. +function findequal( + ::BisectThenSIMD, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, + ) + return _findequal_generic(BinaryBracket(), v, x, order) +end +findequal( + s::BisectThenSIMD, v::AbstractVector, x, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = findequal(s, v, x; order = order) diff --git a/src/guesser.jl b/src/guesser.jl new file mode 100644 index 0000000..e3e4b61 --- /dev/null +++ b/src/guesser.jl @@ -0,0 +1,109 @@ +# `Guesser` correlated-lookup helper + the public `looks_linear` probe it +# uses + the `GuesserHint` strategy dispatch that plugs a `Guesser` into the +# `searchsortedfirst`/`searchsortedlast` API. + +""" + looks_linear(v; threshold = 1e-2) + +Determine if the abscissae `v` are regularly distributed, taking the standard deviation of +the difference between the array of abscissae with respect to the straight line linking +its first and last elements, normalized by the range of `v`. If this standard deviation is +below the given `threshold`, the vector looks linear (return true). Internal function - +interface may change. +""" +function looks_linear(v; threshold = 1.0e-2) + length(v) <= 2 && return true + x_0, x_f = first(v), last(v) + N = length(v) + x_span = x_f - x_0 + mean_x_dist = x_span / (N - 1) + norm_var = sum((x_i - x_0 - (i - 1) * mean_x_dist)^2 for (i, x_i) in enumerate(v)) / + (N * x_span^2) + return norm_var < threshold^2 +end + +""" + Guesser(v::AbstractVector; looks_linear_threshold = 1e-2) + +Wrapper of the searched vector `v` which makes an informed guess for the next +correlated lookup by either + + - exploiting that `v` is sufficiently evenly spaced (linear-extrapolation guess), or + - using the previous outcome (the cached `idx_prev`). + +Pass a `Guesser` to [`GuesserHint`](@ref) to use it as a search strategy with +the dispatched [`searchsortedlast`](@ref Base.searchsortedlast) / +[`searchsortedfirst`](@ref Base.searchsortedfirst) API. +""" +struct Guesser{T <: AbstractVector} + v::T + idx_prev::Base.RefValue{Int} + linear_lookup::Bool +end + +function Guesser(v::AbstractVector; looks_linear_threshold = 1.0e-2) + return Guesser(v, Ref(1), looks_linear(v; threshold = looks_linear_threshold)) +end + +function (g::Guesser)(x) + (; v, idx_prev, linear_lookup) = g + return if linear_lookup + δx = x - first(v) + iszero(δx) && return firstindex(v) + f = δx / (last(v) - first(v)) + if isinf(f) + f > 0 ? lastindex(v) : firstindex(v) + else + i_0, i_f = firstindex(v), lastindex(v) + i_approx = f * (i_f - i_0) + i_0 + target_type = typeof(firstindex(v)) + if i_approx >= typemax(target_type) + lastindex(v) + 1 + elseif i_approx <= typemin(target_type) + firstindex(v) - 1 + else + round(target_type, i_approx) + end + end + else + idx_prev[] + end +end + +# Note on ranges: `Base.searchsortedlast(r::AbstractRange, x, order)` is +# already O(1) (closed-form), so the strategies' fallback path through +# `BinaryBracket` (which delegates to that Base method) is already optimal +# for ranges. No special-case overlays needed. + +# GuesserHint methods — strategy dispatch wrapper for the `Guesser`-based +# correlated search. Per-call cost: one `guesser(x)` evaluation + one +# `BracketGallop` call + one `idx_prev[]` write. +function Base.searchsortedlast( + s::GuesserHint, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, + ) + @assert v === s.guesser.v + out = searchsortedlast(BracketGallop(), v, x, s.guesser(x); order = order) + s.guesser.idx_prev[] = out + return out +end + +function Base.searchsortedfirst( + s::GuesserHint, v::AbstractVector, x; + order::Base.Order.Ordering = Base.Order.Forward, + ) + @assert v === s.guesser.v + out = searchsortedfirst(BracketGallop(), v, x, s.guesser(x); order = order) + s.guesser.idx_prev[] = out + return out +end + +# GuesserHint ignores any externally-supplied hint (the Guesser carries its own). +Base.searchsortedlast( + s::GuesserHint, v::AbstractVector, x, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedlast(s, v, x; order = order) +Base.searchsortedfirst( + s::GuesserHint, v::AbstractVector, x, ::Integer; + order::Base.Order.Ordering = Base.Order.Forward, +) = searchsortedfirst(s, v, x; order = order) diff --git a/src/precompile.jl b/src/precompile.jl new file mode 100644 index 0000000..93c5ec1 --- /dev/null +++ b/src/precompile.jl @@ -0,0 +1,58 @@ +# PrecompileTools workload — exercises the most commonly used public +# entry points across typical eltypes so `using FindFirstFunctions` is hot +# without a startup compile spike on the first call. + +using PrecompileTools: @compile_workload, @setup_workload + +@setup_workload begin + vec_int64 = Int64[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + linear_vec = collect(1.0:0.5:10.0) + + @compile_workload begin + # findfirstequal: fast SIMD-based search in Int64 vectors. + findfirstequal(Int64(5), vec_int64) + findfirstequal(Int64(100), vec_int64) + + # findfirstsortedequal: binary search in sorted Int64 vectors. + findfirstsortedequal(Int64(8), vec_int64) + findfirstsortedequal(Int64(100), vec_int64) + + # looks_linear: check if vector is evenly spaced. + looks_linear(linear_vec) + + # Guesser: hint provider for correlated repeated searches. + guesser = Guesser(linear_vec) + guesser(5.0) + + # Strategy dispatch — single-query forms across the standard strategies. + for strategy in ( + LinearScan(), SIMDLinearScan(), BracketGallop(), ExpFromLeft(), + InterpolationSearch(), BinaryBracket(), Auto(), + Auto(SearchProperties(linear_vec)), + ) + searchsortedfirst(strategy, vec_int64, Int64(8), Int64(1)) + searchsortedlast(strategy, vec_int64, Int64(8), Int64(1)) + end + # findequal: generic + BisectThenSIMD shortcut for Int64 dense vectors. + for strategy in ( + BinaryBracket(), BracketGallop(), SIMDLinearScan(), + BisectThenSIMD(), Auto(), + ) + findequal(strategy, vec_int64, Int64(8)) + findequal(strategy, vec_int64, Int64(8), Int64(1)) + end + # SIMDLinearScan's Float64 specialization. + let vec_f64 = collect(1.0:1.0:16.0) + searchsortedfirst(SIMDLinearScan(), vec_f64, 8.0, 1) + searchsortedlast(SIMDLinearScan(), vec_f64, 8.0, 1) + end + searchsortedfirst(GuesserHint(Guesser(vec_int64)), vec_int64, Int64(8)) + searchsortedlast(GuesserHint(Guesser(vec_int64)), vec_int64, Int64(8)) + + # Strategy dispatch — batched in-place forms. + idx_out = Vector{Int}(undef, 4) + queries = Int64[2, 5, 8, 12] + searchsortedfirst!(idx_out, vec_int64, queries) + searchsortedlast!(idx_out, vec_int64, queries) + end +end diff --git a/src/search_properties.jl b/src/search_properties.jl new file mode 100644 index 0000000..f0507bf --- /dev/null +++ b/src/search_properties.jl @@ -0,0 +1,120 @@ +# `SearchProperties` populated constructor + the runtime probes it runs at +# construction time. The `SearchProperties` struct itself, the default +# constructor (`SearchProperties()` sentinel), and the `Auto` type that +# carries one as a field all live in `strategies.jl`. + +# `has_nan` is only meaningful for floating-point eltypes; for other numeric +# types and non-numeric eltypes there is no NaN concept. +@inline _has_nan(v::AbstractVector{<:AbstractFloat}) = any(isnan, v) +@inline _has_nan(::AbstractVector) = false + +# Auto's strict linearity tolerance. `SearchProperties(v)` uses the same +# default. Defined here rather than in `auto.jl` because `_sampled_looks_*` +# need it as a default arg; `auto.jl` re-uses the same constant. +const _AUTO_LINEAR_REL_TOLERANCE = 1.0e-3 + +# Sampled linearity check: probes v[1], v[k*n/10] for k = 1..9, v[n] and +# tests whether all interior points sit within `_AUTO_LINEAR_REL_TOLERANCE` +# (default 0.1%) of the straight line between v[1] and v[n]. The tight +# tolerance reliably distinguishes truly-uniform data from random/sorted +# data even at large n (where the order-statistic variance ≈ 1/sqrt(n) +# would fool a looser check). +# +# Cost is ~25 ns regardless of n — 11 reads + 9 comparisons. Used by Auto +# to decide whether to gamble on InterpolationSearch. +# +# InterpolationSearch's downside on non-linear data is large (4-14× slower +# than ExpFromLeft on log/plateau/two-scale spacings), so we err on the +# side of rejecting borderline cases. Truly uniform data — exact ranges, +# evenly-spaced grids, and small-amplitude jittered data — passes; sorted +# random data is rejected at all `n` tested up to ~10⁶. +@inline function _sampled_looks_linear( + v::AbstractVector{<:Number}, + tol::Float64 = _AUTO_LINEAR_REL_TOLERANCE, + ) + n = length(v) + n < 11 && return false + @inbounds begin + v1, vn = v[1], v[n] + span = vn - v1 + (iszero(span) || !isfinite(span)) && return false + abs_span = abs(span) + nm1 = n - 1 + for k in 1:9 + kk = 1 + (k * nm1) ÷ 10 + expected = v1 + (kk - 1) / nm1 * span + rel_err = abs(v[kk] - expected) / abs_span + rel_err > tol && return false + end + end + return true +end + +# Non-numeric eltype: can't sample, never picks InterpolationSearch. +@inline _sampled_looks_linear(::AbstractVector, ::Float64 = _AUTO_LINEAR_REL_TOLERANCE) = false + +# AbstractRange is definitionally uniform — accept without sampling. +@inline _sampled_looks_linear(::AbstractRange, ::Float64 = _AUTO_LINEAR_REL_TOLERANCE) = true + +# Sampled "log-linear" probe: same 9-point probe as `_sampled_looks_linear` +# but tests whether `log(v)` is linear in array index. Used to detect +# geometric / log-spaced data where `BitInterpolationSearch` is a win. +# Requires all sampled points to be strictly positive and finite; otherwise +# returns false. The probe runs in ~30 ns (the `log` calls are not cheap +# but only 11 of them, all at fixed index positions). +@inline function _sampled_looks_log_linear( + v::AbstractVector{<:Real}, + tol::Float64 = _AUTO_LINEAR_REL_TOLERANCE, + ) + n = length(v) + n < 11 && return false + @inbounds begin + v1, vn = v[1], v[n] + (v1 <= 0 || vn <= 0 || !isfinite(v1) || !isfinite(vn)) && return false + log_v1 = log(Float64(v1)) + log_vn = log(Float64(vn)) + span = log_vn - log_v1 + (iszero(span) || !isfinite(span)) && return false + abs_span = abs(span) + nm1 = n - 1 + for k in 1:9 + kk = 1 + (k * nm1) ÷ 10 + vk = v[kk] + (vk <= 0 || !isfinite(vk)) && return false + expected = log_v1 + (kk - 1) / nm1 * span + rel_err = abs(log(Float64(vk)) - expected) / abs_span + rel_err > tol && return false + end + end + return true +end + +@inline _sampled_looks_log_linear(::AbstractVector, ::Float64 = _AUTO_LINEAR_REL_TOLERANCE) = false + +""" + SearchProperties(v::AbstractVector; linear_tolerance = 1.0e-3) + +Run the linearity probe and (for floating-point eltypes) the NaN scan on `v`, +returning the populated [`SearchProperties`](@ref). Cost is O(n) on +floating-point vectors because of the NaN scan; for integer and non-numeric +eltypes the cost is O(1) — only the sampled-linearity probe runs. + +`linear_tolerance` controls the maximum relative deviation accepted by the +sampled-linearity probe. The default `1e-3` (0.1%) matches `Auto`'s +un-cached probe behaviour. Loosen it (e.g. to `1e-2`) to accept noisier +"approximately linear" data — this widens the regime where `Auto` will pick +`InterpolationSearch` over `ExpFromLeft`. Tighten it (e.g. to `1e-4`) to be +more conservative. +""" +function SearchProperties( + v::AbstractVector; + linear_tolerance::Real = 1.0e-3, + ) + tol = Float64(linear_tolerance) + return SearchProperties( + true, + _sampled_looks_linear(v, tol), + _has_nan(v), + _sampled_looks_log_linear(v, tol), + ) +end diff --git a/src/simd_ir.jl b/src/simd_ir.jl new file mode 100644 index 0000000..1c543da --- /dev/null +++ b/src/simd_ir.jl @@ -0,0 +1,135 @@ +# SIMD LLVM IR scaffolding shared by every "find first lane matching +# predicate" routine in the package: +# +# - `FFE_IR` / `_findfirstequal` — exact equality scan, Int64 +# (predicate `icmp eq`) +# - `_SIMD_GT_I64_IR` / `_simd_first_gt(::Int64, …)` — `icmp sgt` +# - `_SIMD_GE_I64_IR` / `_simd_first_ge(::Int64, …)` — `icmp sge` +# - `_SIMD_GT_F64_IR` / `_simd_first_gt(::Float64, …)` — `fcmp ogt` +# - `_SIMD_GE_F64_IR` / `_simd_first_ge(::Float64, …)` — `fcmp oge` +# +# All five IR strings are generated from the same template, keyed on scalar +# LLVM type (`i64`, `double`) and compare predicate. Adding a new predicate +# is a single `_simd_scan_ir(t, pred)` call plus a corresponding +# `Base.llvmcall` wrapper. + +# Generate the SIMD "find first lane matching predicate" IR for an arbitrary +# scalar type and LLVM compare predicate. Load 8 lanes at a time, compare +# against a broadcast of the search value, bitcast the i1×8 mask to i8, +# `cttz` to find the first set bit. The tail past the last full chunk is +# handled scalar-wise. +function _simd_scan_ir(t, pred) + cmp = pred[1] in ('o', 'u') ? "fcmp" : "icmp" + return """ + declare i8 @llvm.cttz.i8(i8, i1); + define i64 @entry($t %0, $(USE_PTR ? "ptr" : "i64") %1, i64 %2) #0 { + top: + $(USE_PTR ? "" : "%ivars = inttoptr i64 %1 to $t*") + %btmp = insertelement <8 x $t> undef, $t %0, i64 0 + %var = shufflevector <8 x $t> %btmp, <8 x $t> undef, <8 x i32> zeroinitializer + %lenm7 = add nsw i64 %2, -7 + %dosimditer = icmp ugt i64 %2, 7 + br i1 %dosimditer, label %L9.lr.ph, label %L32 + + L9.lr.ph: + %len8 = and i64 %2, 9223372036854775800 + br label %L9 + + L9: + %i = phi i64 [ 0, %L9.lr.ph ], [ %vinc, %L30 ] + %ivarsi = getelementptr inbounds $t, $(USE_PTR ? "ptr %1" : "$t* %ivars"), i64 %i + $(USE_PTR ? "" : "%vpvi = bitcast $t* %ivarsi to <8 x $t>*") + %v = load <8 x $t>, $(USE_PTR ? "ptr %ivarsi" : "<8 x $t> * %vpvi"), align 8 + %m = $cmp $pred <8 x $t> %v, %var + %mu = bitcast <8 x i1> %m to i8 + %matchnotfound = icmp eq i8 %mu, 0 + br i1 %matchnotfound, label %L30, label %L17 + + L17: + %tz8 = call i8 @llvm.cttz.i8(i8 %mu, i1 true) + %tz64 = zext i8 %tz8 to i64 + %vis = add nuw i64 %i, %tz64 + br label %common.ret + + common.ret: + %retval = phi i64 [ %vis, %L17 ], [ -1, %L32 ], [ %si, %L51 ], [ -1, %L67 ] + ret i64 %retval + + L30: + %vinc = add nuw nsw i64 %i, 8 + %continue = icmp slt i64 %vinc, %lenm7 + br i1 %continue, label %L9, label %L32 + + L32: + %cumi = phi i64 [ 0, %top ], [ %len8, %L30 ] + %done = icmp eq i64 %cumi, %2 + br i1 %done, label %common.ret, label %L51 + + L51: + %si = phi i64 [ %inc, %L67 ], [ %cumi, %L32 ] + %spi = getelementptr inbounds $t, $(USE_PTR ? "ptr %1" : "$t* %ivars"), i64 %si + %svi = load $t, $(USE_PTR ? "ptr" : "$t*") %spi, align 8 + %match = $cmp $pred $t %svi, %0 + br i1 %match, label %common.ret, label %L67 + + L67: + %inc = add i64 %si, 1 + %dobreak = icmp eq i64 %inc, %2 + br i1 %dobreak, label %common.ret, label %L51 + + } + attributes #0 = { alwaysinline } + """ +end + +const FFE_IR = _simd_scan_ir("i64", "eq") + +function _findfirstequal(vpivot::Int64, ptr::Ptr{Int64}, len::Int64) + return Base.llvmcall( + (FFE_IR, "entry"), + Int64, + Tuple{Int64, Ptr{Int64}, Int64}, + vpivot, + ptr, + len + ) +end + +const _SIMD_GT_I64_IR = _simd_scan_ir("i64", "sgt") +const _SIMD_GE_I64_IR = _simd_scan_ir("i64", "sge") +const _SIMD_GT_F64_IR = _simd_scan_ir("double", "ogt") +const _SIMD_GE_F64_IR = _simd_scan_ir("double", "oge") + +# Backing primitives for SIMDLinearScan. Each returns the 0-based offset of +# the first lane satisfying the predicate, or -1 if none. Caveat: NaN inputs +# always compare false under the ordered `o*` float predicates, so NaN in `v` +# or `x` produces "no match" rather than an exception — consistent with the +# undefined-input contract for sorted Float64 vectors containing NaN. +function _simd_first_gt(x::Int64, ptr::Ptr{Int64}, len::Int64) + return Base.llvmcall( + (_SIMD_GT_I64_IR, "entry"), + Int64, Tuple{Int64, Ptr{Int64}, Int64}, + x, ptr, len + ) +end +function _simd_first_ge(x::Int64, ptr::Ptr{Int64}, len::Int64) + return Base.llvmcall( + (_SIMD_GE_I64_IR, "entry"), + Int64, Tuple{Int64, Ptr{Int64}, Int64}, + x, ptr, len + ) +end +function _simd_first_gt(x::Float64, ptr::Ptr{Float64}, len::Int64) + return Base.llvmcall( + (_SIMD_GT_F64_IR, "entry"), + Int64, Tuple{Float64, Ptr{Float64}, Int64}, + x, ptr, len + ) +end +function _simd_first_ge(x::Float64, ptr::Ptr{Float64}, len::Int64) + return Base.llvmcall( + (_SIMD_GE_F64_IR, "entry"), + Int64, Tuple{Float64, Ptr{Float64}, Int64}, + x, ptr, len + ) +end diff --git a/src/strategies.jl b/src/strategies.jl new file mode 100644 index 0000000..e70263c --- /dev/null +++ b/src/strategies.jl @@ -0,0 +1,268 @@ +# Sorted-search strategy type hierarchy. All concrete strategies live here; +# their `Base.searchsortedfirst` / `Base.searchsortedlast` method definitions +# live in `dispatch.jl` and `auto.jl`. The `SearchProperties` cache type +# is defined here too because `Auto` carries one as a field; its populated +# constructor lives in `search_properties.jl`. + +""" + SearchStrategy + +Abstract supertype for sorted-search strategies. Concrete subtypes select how +[`searchsortedlast`](@ref Base.searchsortedlast) and +[`searchsortedfirst`](@ref Base.searchsortedfirst) should be performed when +called with a strategy as the first positional argument: + + - [`LinearScan`](@ref) walks ±1 from the hint. Cheapest when the target is + within a few positions of the hint; degrades linearly otherwise. + - [`SIMDLinearScan`](@ref) is `LinearScan` with the forward walk lowered to + 8-wide SIMD chunks for `DenseVector{Int64}` and `DenseVector{Float64}`. + Falls back to plain [`LinearScan`](@ref) for any other element type. + - [`BracketGallop`](@ref) expands an exponential bracket bidirectionally + from the hint, then binary-searches inside it. Effectively O(1) when the + target is near the hint; never worse than ~2 log₂ n comparisons. + - [`ExpFromLeft`](@ref) expands rightward from a left-bound hint by + doubling, then binary-searches inside the final bracket. Best for batched + sorted queries where each next query's hint is the previous result. + - [`InterpolationSearch`](@ref) guesses the answer by linearly extrapolating + between `v[lo]` and `v[hi]`, then refines with a bounded binary search. + O(1) per query on uniformly-spaced data; falls back to O(log n) on + irregular data. + - [`BinaryBracket`](@ref) is the standard `Base.searchsortedlast` / + `Base.searchsortedfirst` with no hint. Use it when no useful hint exists. + - [`BisectThenSIMD`](@ref) is an equality-search strategy: binary-bisects + `v` to a small basecase, then SIMD-scans for exact equality. Specialised + for `DenseVector{Int64}`; only meaningful when used with + [`findequal`](@ref). + - [`Auto`](@ref) heuristically picks one of the above based on the size of + `v`, the spacing of `v`, and whether a hint was supplied. Accepts an + optional [`SearchProperties`](@ref) cache to skip per-call probes. + +Strategies can also be passed to the batched +[`searchsortedlast!`](@ref) / [`searchsortedfirst!`](@ref) APIs. +""" +abstract type SearchStrategy end + +""" + LinearScan <: SearchStrategy + +Walk ±1 from the hint. Best when the target is within a few positions of the +hint. Falls back to [`BinaryBracket`](@ref) when no hint is supplied. +""" +struct LinearScan <: SearchStrategy end + +""" + SIMDLinearScan <: SearchStrategy + +Variant of [`LinearScan`](@ref) whose forward walk is lowered to 8-wide +SIMD chunks via custom LLVM IR. Specialized for `DenseVector{Int64}` and +`DenseVector{Float64}`; for any other element type, falls back to plain +[`LinearScan`](@ref). The backward walk (when the hint is past the +answer) uses the scalar `LinearScan` path regardless of element type. + +Wins on long forward walks (≥ 8 elements past the hint). For walks of +1–3 elements `LinearScan` is comparable — the SIMD chunk has constant +setup overhead. Worst case is O(n / 8) which is still linear, so +`SIMDLinearScan` is only `Auto`-relevant for small `n` or small-gap +batches where plain `LinearScan` would have been picked anyway. + +Caveats: + - Element type must be exactly `Int64` or `Float64`. `Int32`, + `UInt64`, `Float32`, and user-defined numeric types all fall back to + scalar. + - Sorted-Float64 vectors containing `NaN` produce undefined results, + same as for any positional search on a vector that isn't totally + ordered. + - Falls back to [`BinaryBracket`](@ref) when no hint is supplied. + - Falls back to [`LinearScan`](@ref) for non-`Forward` orderings. +""" +struct SIMDLinearScan <: SearchStrategy end + +""" + BracketGallop <: SearchStrategy + +Expand an exponential bracket bidirectionally from the hint, then +binary-search inside the bracket. Effectively O(1) when the target is near +the hint; never worse than ~2 log₂ n comparisons. + +Falls back to [`BinaryBracket`](@ref) when no hint is supplied. +""" +struct BracketGallop <: SearchStrategy end + +""" + ExpFromLeft <: SearchStrategy + +Exponential search forward from the hint (interpreted as a left bound), then +binary search in the final bracket. The hint is a *lower* bound rather than a +center guess, which is what batched sorted-search loops typically want: +`hint = previous_result`. + +Specifically: starting at `lo = hint`, check `v[lo], v[lo+1], ..., v[lo+4]` +linearly, then `v[lo+8], v[lo+16], …` exponentially, until `x` is bracketed, +then binary-search inside the bracket. + +Falls back to [`BinaryBracket`](@ref) when no hint is supplied. +""" +struct ExpFromLeft <: SearchStrategy end + +""" + InterpolationSearch <: SearchStrategy + +Guesses an index by linearly extrapolating `x` between `v[lo]` and `v[hi]`, +then refines with a bounded binary search. O(1) per query on uniformly-spaced +data (e.g. `collect(0:0.1:10)`); falls back to O(log n) otherwise. Requires +`x` to be subtractable with elements of `v` (i.e., a numeric ordering). + +Ignores any hint that is supplied — the guess is computed fresh from the +endpoints. Falls back to [`BinaryBracket`](@ref) for non-numeric element +types where subtraction isn't defined. +""" +struct InterpolationSearch <: SearchStrategy end + +""" + BitInterpolationSearch <: SearchStrategy + +Variant of [`InterpolationSearch`](@ref) that reinterprets `DenseVector{Float64}` +as `DenseVector{UInt64}` before computing the extrapolation guess. For +positive IEEE Float64 values, the bit pattern is monotonically increasing +with the float value and is approximately linear in array index when the +underlying data is *log-spaced* (geometric). On such data the bit-domain +guess is far better than the float-domain guess that `InterpolationSearch` +would compute — sometimes O(1) versus O(log n) refinement. + +After computing the bit-domain guess, the bracket and binary refine step +uses the original float values for comparison, so the answer is identical +to `Base.searchsortedlast` / `Base.searchsortedfirst`. + +Constraints: + - `DenseVector{Float64}` only (the IEEE bit-pattern trick is Float64-specific). + - Requires `v[1] > 0` and the query `x > 0` (negative, zero, subnormal, + and non-finite Float64 bit patterns are not monotonic with float value + under raw reinterpret). + - Forward ordering only. + +**This strategy is opt-in only** — `Auto` does not pick it. The bench sweep +shows the per-query division and UInt64↔Float64 conversion overhead +(~60–90 ns/q) costs more than the bracket refinement that the guess +saves at every gap tested; pinning `BitInterpolationSearch` is slower than +letting `Auto` pick `SIMDLinearScan` / `BracketGallop` / `ExpFromLeft`. +The strategy is retained for users with workloads not covered by the +sweep — for instance, very-large `n` (≥ 2²⁰), pathologically log-spaced +data, or hardware where Float64 division is unusually cheap relative to +the scalar walk. + +Falls back to [`InterpolationSearch`](@ref) for non-Float64 dense eltypes +(where the bit pattern equals the value and the two strategies are +equivalent), and to [`BinaryBracket`](@ref) for non-positive or +non-finite Float64 data. +""" +struct BitInterpolationSearch <: SearchStrategy end + +""" + BinaryBracket <: SearchStrategy + +Plain `Base.searchsortedlast` / `Base.searchsortedfirst`. Ignores any hint +that is supplied. +""" +struct BinaryBracket <: SearchStrategy end + +""" + BisectThenSIMD <: SearchStrategy + +Equality-search strategy. Binary-bisects `v` down to a small basecase, then +SIMD-scans the basecase for exact equality with `x`. Specialised for +`DenseVector{Int64}` + `Int64` queries via the same custom LLVM IR that +backs [`findfirstsortedequal`](@ref FindFirstFunctions.findfirstsortedequal); +for other element types, falls back to [`BinaryBracket`](@ref) plus an +equality check. + +This strategy is meant for use with [`findequal`](@ref FindFirstFunctions.findequal), +not with `searchsortedfirst` / `searchsortedlast` — its purpose is to answer +"is `x` present at exactly which index, or not at all?", which is a +different question from positional search. In the +`searchsortedfirst`/`searchsortedlast` dispatch it falls back to +[`BinaryBracket`](@ref). + +Ignores any hint that is supplied. Falls back to [`BinaryBracket`](@ref) for +non-`Forward` orderings. +""" +struct BisectThenSIMD <: SearchStrategy end + +""" + GuesserHint(guesser::Guesser) <: SearchStrategy + +Uses a [`Guesser`](@ref) to produce an integer guess for `x`, then dispatches +to [`BracketGallop`](@ref) from that guess. The `Guesser` already decides +between linear-extrapolation lookup (when `v` looks linear) and using the +previous result as a guess; this strategy plugs that logic into the strategy +dispatch hierarchy, and updates `guesser.idx_prev` on each call. + +Use this strategy with the per-query and batched APIs whenever you have a +`Guesser` attached to a vector. The cost is one `guesser(x)` evaluation +plus one `BracketGallop` call plus one `idx_prev[]` write per call. +""" +struct GuesserHint{G} <: SearchStrategy + guesser::G +end + +""" + SearchProperties + +Cached, non-allocating facts about a sorted vector. Pass to [`Auto`](@ref) +via `Auto(props)` to skip the per-call probes that the default `Auto()` runs +on every batched call. Stored fields are kept to plain `Bool`s so the struct +stays `isbits` and travels in registers. + +Default-constructed (`SearchProperties()`) is the "no information" sentinel: +`has_props` is `false`, the other fields are unspecified and ignored by +`Auto`. Construct via `SearchProperties(v::AbstractVector)` to populate the +fields by running the probes once. + +Currently consumed: `is_linear` and `has_nan` (the latter only on Float64, +to gate `SIMDLinearScan` eligibility in `Auto`). `is_log_linear` is +populated for callers that want to manually pin +[`BitInterpolationSearch`](@ref) based on data shape; `Auto` does not +consume it. The fields are otherwise populated for forward compatibility. +""" +struct SearchProperties + has_props::Bool + is_linear::Bool + has_nan::Bool + is_log_linear::Bool +end + +SearchProperties() = SearchProperties(false, false, false, false) + +""" + Auto <: SearchStrategy + Auto() + Auto(props::SearchProperties) + +Heuristically picks among [`LinearScan`](@ref), [`SIMDLinearScan`](@ref), +[`ExpFromLeft`](@ref), [`InterpolationSearch`](@ref), +[`BracketGallop`](@ref), and [`BinaryBracket`](@ref). The choice depends on +the calling context: + +**Per-query** (`searchsortedlast(Auto(), v, x[, hint])`): + - No hint, or hint outside `axes(v)` → [`BinaryBracket`](@ref). + - Hint in range, `length(v) ≤ 16` → [`LinearScan`](@ref). + - Hint in range, `length(v) > 16` → [`BracketGallop`](@ref). + +**Batched sorted** (`searchsortedlast!(out, v, queries; strategy = Auto())`) +chooses by the expected average gap in `v`'s index space between +consecutive query results. See the package's `auto.md` documentation for +the full decision tree and the crossover constants the bench sweep +determined. + +**Batched unsorted**: falls back to per-element `Base.searchsortedlast` / +`Base.searchsortedfirst` with no hint regardless of strategy. + +**Cached properties.** Passing a populated [`SearchProperties`](@ref) via +`Auto(props)` short-circuits the per-call probes. The cached path is +behaviour-equivalent to `Auto()` when `props` is up to date for `v`; the +caller is responsible for re-computing `props` if `v` mutates. +""" +struct Auto <: SearchStrategy + props::SearchProperties +end + +Auto() = Auto(SearchProperties()) diff --git a/test/runtests.jl b/test/runtests.jl index d89d198..45e04a7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -37,166 +37,713 @@ end end @safetestset "Guesser" begin - using FindFirstFunctions: - Guesser, searchsortedfirstcorrelated, - searchsortedlastcorrelated + using FindFirstFunctions: Guesser, GuesserHint v = collect(LinRange(0, 10, 4)) guesser_linear = Guesser(v) guesser_prev = Guesser(v, Ref(1), false) @test guesser_linear.linear_lookup - @test searchsortedfirstcorrelated(v, 4.0, guesser_linear) == 3 - @test searchsortedfirstcorrelated(v, 1.4234326478e24, guesser_linear) == 5 - @test searchsortedlastcorrelated(v, 4.0, guesser_prev) == 2 + + # Guesser feeds the dispatched API via GuesserHint. + @test searchsortedfirst(GuesserHint(guesser_linear), v, 4.0) == 3 + @test searchsortedfirst(GuesserHint(guesser_linear), v, 1.4234326478e24) == 5 + @test searchsortedlast(GuesserHint(guesser_prev), v, 4.0) == 2 @test guesser_prev.idx_prev[] == 2 - # Edge case + # Edge case: single-element v. v1 = [42.0] guesser = Guesser(v1) @test guesser_linear.linear_lookup @test guesser(100) == 1 @test guesser(42.0) == 1 @test guesser(0) == 1 - @test searchsortedfirstcorrelated(v1, 0, guesser) == 1 - @test searchsortedfirstcorrelated(v1, 100, guesser) == 1 + 1 # see searchsortedfirst - @test searchsortedfirstcorrelated(v1, 42.0, guesser) == 1 - @test searchsortedlastcorrelated(v1, 0, guesser) == 1 - 1 # see searchsortedlast - @test searchsortedlastcorrelated(v1, 100, guesser) == 1 - @test searchsortedlastcorrelated(v1, 42.0, guesser) == 1 + @test searchsortedfirst(GuesserHint(guesser), v1, 0) == 1 + @test searchsortedfirst(GuesserHint(guesser), v1, 100) == 2 # see Base.searchsortedfirst + @test searchsortedfirst(GuesserHint(guesser), v1, 42.0) == 1 + @test searchsortedlast(GuesserHint(guesser), v1, 0) == 0 # see Base.searchsortedlast + @test searchsortedlast(GuesserHint(guesser), v1, 100) == 1 + @test searchsortedlast(GuesserHint(guesser), v1, 42.0) == 1 end - @safetestset "Custom ordering in searchsorted*correlated" begin + @safetestset "Custom ordering for strategy dispatch" begin using FindFirstFunctions: - Guesser, searchsortedfirstcorrelated, - searchsortedlastcorrelated + Guesser, GuesserHint, BracketGallop, LinearScan, + ExpFromLeft, BinaryBracket, Auto - # Test with reverse-sorted vector - v_rev = collect(10.0:-1.0:1.0) # [10.0, 9.0, ..., 1.0] + v_rev = collect(10.0:-1.0:1.0) + for x in (5.0, 10.0, 1.0, 0.0, 11.0), + hint in (1, 5, 10), + strategy in ( + BracketGallop(), LinearScan(), ExpFromLeft(), Auto(), + ) - # Test searchsortedfirstcorrelated with Reverse order - @test searchsortedfirstcorrelated(v_rev, 5.0, 1; order = Base.Order.Reverse) == - searchsortedfirst(v_rev, 5.0, Base.Order.Reverse) - @test searchsortedfirstcorrelated(v_rev, 10.0, 1; order = Base.Order.Reverse) == - searchsortedfirst(v_rev, 10.0, Base.Order.Reverse) - @test searchsortedfirstcorrelated(v_rev, 1.0, 1; order = Base.Order.Reverse) == - searchsortedfirst(v_rev, 1.0, Base.Order.Reverse) - @test searchsortedfirstcorrelated(v_rev, 0.0, 1; order = Base.Order.Reverse) == - searchsortedfirst(v_rev, 0.0, Base.Order.Reverse) - @test searchsortedfirstcorrelated(v_rev, 11.0, 1; order = Base.Order.Reverse) == - searchsortedfirst(v_rev, 11.0, Base.Order.Reverse) - - # Test searchsortedlastcorrelated with Reverse order - @test searchsortedlastcorrelated(v_rev, 5.0, 1; order = Base.Order.Reverse) == - searchsortedlast(v_rev, 5.0, Base.Order.Reverse) - @test searchsortedlastcorrelated(v_rev, 10.0, 1; order = Base.Order.Reverse) == - searchsortedlast(v_rev, 10.0, Base.Order.Reverse) - @test searchsortedlastcorrelated(v_rev, 1.0, 1; order = Base.Order.Reverse) == - searchsortedlast(v_rev, 1.0, Base.Order.Reverse) - @test searchsortedlastcorrelated(v_rev, 0.0, 1; order = Base.Order.Reverse) == - searchsortedlast(v_rev, 0.0, Base.Order.Reverse) - @test searchsortedlastcorrelated(v_rev, 11.0, 1; order = Base.Order.Reverse) == - searchsortedlast(v_rev, 11.0, Base.Order.Reverse) - - # Test with Guesser and reverse order + @test searchsortedfirst(strategy, v_rev, x, hint; order = Base.Order.Reverse) == + searchsortedfirst(v_rev, x, Base.Order.Reverse) + @test searchsortedlast(strategy, v_rev, x, hint; order = Base.Order.Reverse) == + searchsortedlast(v_rev, x, Base.Order.Reverse) + end + # BinaryBracket ignores any hint. + for x in (5.0, 10.0, 1.0, 0.0, 11.0) + @test searchsortedfirst(BinaryBracket(), v_rev, x; order = Base.Order.Reverse) == + searchsortedfirst(v_rev, x, Base.Order.Reverse) + @test searchsortedlast(BinaryBracket(), v_rev, x; order = Base.Order.Reverse) == + searchsortedlast(v_rev, x, Base.Order.Reverse) + end + + # GuesserHint with reverse order. guesser_rev = Guesser(v_rev) - @test searchsortedfirstcorrelated(v_rev, 5.0, guesser_rev; order = Base.Order.Reverse) == + @test searchsortedfirst(GuesserHint(guesser_rev), v_rev, 5.0; order = Base.Order.Reverse) == searchsortedfirst(v_rev, 5.0, Base.Order.Reverse) - @test searchsortedlastcorrelated(v_rev, 5.0, guesser_rev; order = Base.Order.Reverse) == + @test searchsortedlast(GuesserHint(guesser_rev), v_rev, 5.0; order = Base.Order.Reverse) == searchsortedlast(v_rev, 5.0, Base.Order.Reverse) - # Test that default order (Forward) still works correctly - v_fwd = collect(1.0:1.0:10.0) # [1.0, 2.0, ..., 10.0] - @test searchsortedfirstcorrelated(v_fwd, 5.0, 1) == - searchsortedfirst(v_fwd, 5.0) - @test searchsortedlastcorrelated(v_fwd, 5.0, 1) == - searchsortedlast(v_fwd, 5.0) + # Default (Forward) order still resolves correctly. + v_fwd = collect(1.0:1.0:10.0) + for strategy in (BracketGallop(), LinearScan(), ExpFromLeft(), Auto()) + @test searchsortedfirst(strategy, v_fwd, 5.0, 1) == searchsortedfirst(v_fwd, 5.0) + @test searchsortedlast(strategy, v_fwd, 5.0, 1) == searchsortedlast(v_fwd, 5.0) + end end - @safetestset "Exponential Search (searchsortedfirstexp)" begin - using FindFirstFunctions: searchsortedfirstexp + @safetestset "SearchStrategy dispatch (single query)" begin + using FindFirstFunctions: + SearchStrategy, LinearScan, BracketGallop, BinaryBracket, Auto + + for n in (0, 1, 2, 8, 33, 257) + v = collect(1:n) + isempty(v) && continue - # Basic functionality - should match searchsortedfirst - v = collect(1:100) - for x in [1, 5, 10, 50, 99, 100] - @test searchsortedfirstexp(v, x) == searchsortedfirst(v, x) + # Probe targets that include hits, misses, boundaries + xs = unique!(sort!([0, 1, n ÷ 2, n, n + 1, -3, 2 * n + 1])) + for x in xs + expected_last = searchsortedlast(v, x) + expected_first = searchsortedfirst(v, x) + + # BinaryBracket — ignores any hint + @test searchsortedlast(BinaryBracket(), v, x) == expected_last + @test searchsortedfirst(BinaryBracket(), v, x) == expected_first + @test searchsortedlast(BinaryBracket(), v, x, 1) == expected_last + @test searchsortedfirst(BinaryBracket(), v, x, 1) == expected_first + + # Strategy with hint anywhere in 1..n agrees with Base + for h in unique!([1, max(1, n ÷ 4), n ÷ 2, max(1, 3n ÷ 4), n]) + @test searchsortedlast(LinearScan(), v, x, h) == expected_last + @test searchsortedfirst(LinearScan(), v, x, h) == expected_first + @test searchsortedlast(BracketGallop(), v, x, h) == expected_last + @test searchsortedfirst(BracketGallop(), v, x, h) == expected_first + @test searchsortedlast(Auto(), v, x, h) == expected_last + @test searchsortedfirst(Auto(), v, x, h) == expected_first + end + + # No-hint forms fall back to BinaryBracket + @test searchsortedlast(LinearScan(), v, x) == expected_last + @test searchsortedfirst(LinearScan(), v, x) == expected_first + @test searchsortedlast(BracketGallop(), v, x) == expected_last + @test searchsortedfirst(BracketGallop(), v, x) == expected_first + @test searchsortedlast(Auto(), v, x) == expected_last + @test searchsortedfirst(Auto(), v, x) == expected_first + + # Out-of-range hint → Auto falls back to BinaryBracket + @test searchsortedlast(Auto(), v, x, 0) == expected_last + @test searchsortedfirst(Auto(), v, x, 0) == expected_first + @test searchsortedlast(Auto(), v, x, n + 1) == expected_last + @test searchsortedfirst(Auto(), v, x, n + 1) == expected_first + end end - # Edge cases - value not in array - @test searchsortedfirstexp(v, 0) == searchsortedfirst(v, 0) - @test searchsortedfirstexp(v, 101) == searchsortedfirst(v, 101) - @test searchsortedfirstexp(v, 50.5) == searchsortedfirst(v, 50.5) + # Reverse order + v_rev = collect(10.0:-1.0:1.0) + for x in (0.5, 1.0, 5.0, 10.0, 11.0), h in (1, 5, 10) + @test searchsortedlast(BracketGallop(), v_rev, x, h; order = Base.Order.Reverse) == + searchsortedlast(v_rev, x, Base.Order.Reverse) + @test searchsortedfirst(BracketGallop(), v_rev, x, h; order = Base.Order.Reverse) == + searchsortedfirst(v_rev, x, Base.Order.Reverse) + @test searchsortedlast(LinearScan(), v_rev, x, h; order = Base.Order.Reverse) == + searchsortedlast(v_rev, x, Base.Order.Reverse) + @test searchsortedfirst(LinearScan(), v_rev, x, h; order = Base.Order.Reverse) == + searchsortedfirst(v_rev, x, Base.Order.Reverse) + @test searchsortedlast(Auto(), v_rev, x, h; order = Base.Order.Reverse) == + searchsortedlast(v_rev, x, Base.Order.Reverse) + @test searchsortedfirst(Auto(), v_rev, x, h; order = Base.Order.Reverse) == + searchsortedfirst(v_rev, x, Base.Order.Reverse) + end - # With custom bounds - @test searchsortedfirstexp(v, 50, 40, 60) == searchsortedfirst(v, 50, 40, 60, Base.Order.Forward) - @test searchsortedfirstexp(v, 45, 40, 60) == searchsortedfirst(v, 45, 40, 60, Base.Order.Forward) + # Strategy abstract type hierarchy + @test LinearScan <: SearchStrategy + @test BracketGallop <: SearchStrategy + @test BinaryBracket <: SearchStrategy + @test Auto <: SearchStrategy + end - # Float vectors - vf = collect(0.0:0.1:10.0) - for x in [0.0, 0.5, 1.0, 5.0, 9.9, 10.0] - @test searchsortedfirstexp(vf, x) == searchsortedfirst(vf, x) + @safetestset "ExpFromLeft and InterpolationSearch" begin + using FindFirstFunctions: + ExpFromLeft, InterpolationSearch, BinaryBracket + + # ExpFromLeft on uniform Int range + v = collect(1:1000) + for x in (0, 1, 50, 250, 500, 999, 1000, 1001), h in (1, 50, 500, 1000) + @test searchsortedlast(ExpFromLeft(), v, x, h) == + searchsortedlast(v, x) + @test searchsortedfirst(ExpFromLeft(), v, x, h) == + searchsortedfirst(v, x) end + # ExpFromLeft without hint falls back to BinaryBracket + @test searchsortedlast(ExpFromLeft(), v, 500) == searchsortedlast(v, 500) + @test searchsortedfirst(ExpFromLeft(), v, 500) == searchsortedfirst(v, 500) - # Empty and small vectors - @test searchsortedfirstexp(Int[], 5) == 1 - @test searchsortedfirstexp([1], 0) == 1 - @test searchsortedfirstexp([1], 1) == 1 - @test searchsortedfirstexp([1], 2) == 2 + # InterpolationSearch on uniform Float64 range + vf = collect(0.0:0.1:10.0) + for x in (-1.0, 0.0, 0.05, 1.0, 5.5, 9.95, 10.0, 11.0) + @test searchsortedlast(InterpolationSearch(), vf, x) == + searchsortedlast(vf, x) + @test searchsortedfirst(InterpolationSearch(), vf, x) == + searchsortedfirst(vf, x) + end - # Vector with repeated elements - vr = [1, 2, 2, 2, 3, 4, 5] - @test searchsortedfirstexp(vr, 2) == searchsortedfirst(vr, 2) + # InterpolationSearch on log-spaced (non-uniform) — must still be correct + vlog = exp.(range(log(0.1), log(100.0); length = 256)) + for x in (0.05, 0.1, 1.0, 50.0, 100.0, 150.0) + @test searchsortedlast(InterpolationSearch(), vlog, x) == + searchsortedlast(vlog, x) + @test searchsortedfirst(InterpolationSearch(), vlog, x) == + searchsortedfirst(vlog, x) + end - # Large vector - big_v = collect(1:10000) - for x in [1, 100, 1000, 5000, 9999, 10000] - @test searchsortedfirstexp(big_v, x) == searchsortedfirst(big_v, x) + # InterpolationSearch ignores hint (computes its own guess) + for h in (1, 100, 256) + @test searchsortedlast(InterpolationSearch(), vlog, 50.0, h) == + searchsortedlast(vlog, 50.0) end + + # InterpolationSearch falls back to BinaryBracket on non-Number eltypes + vs = ["a", "b", "c", "d"] + @test searchsortedlast(InterpolationSearch(), vs, "c") == + searchsortedlast(vs, "c") + @test searchsortedfirst(InterpolationSearch(), vs, "c", 2) == + searchsortedfirst(vs, "c") + + # InterpolationSearch on a constant vector (span=0) shouldn't divide + # by zero; should fall through to a bounded search and return a + # correct result. + vc = fill(3.0, 16) + @test searchsortedlast(InterpolationSearch(), vc, 3.0) == + searchsortedlast(vc, 3.0) + @test searchsortedlast(InterpolationSearch(), vc, 2.0) == + searchsortedlast(vc, 2.0) + @test searchsortedlast(InterpolationSearch(), vc, 4.0) == + searchsortedlast(vc, 4.0) + + # Edge: 1-element and 2-element vectors + @test searchsortedlast(ExpFromLeft(), [5], 4, 1) == 0 + @test searchsortedlast(ExpFromLeft(), [5], 5, 1) == 1 + @test searchsortedlast(ExpFromLeft(), [5], 6, 1) == 1 + @test searchsortedfirst(ExpFromLeft(), [5, 10], 7, 1) == 2 + @test searchsortedlast(InterpolationSearch(), [5], 4) == 0 + @test searchsortedlast(InterpolationSearch(), [5, 10], 7) == 1 end - @safetestset "Vectorized Search (searchsortedfirstvec and searchsortedlastvec)" begin - using FindFirstFunctions: searchsortedfirstvec, searchsortedlastvec + @safetestset "Batched Auto heuristic" begin + using FindFirstFunctions: + Auto, LinearScan, ExpFromLeft, BracketGallop, BinaryBracket, + searchsortedlast! + using StableRNGs - # Basic functionality + # Dense queries: Auto's avg-gap heuristic should land on LinearScan + # (verified by output correctness, not by introspecting the picked + # strategy — that's an implementation detail). + v = collect(0.0:0.01:10.0) # n=1001 + tt_dense = sort!(rand(StableRNG(1), 4096) .* 10.0) + out_auto = Vector{Int}(undef, length(tt_dense)) + out_linear = Vector{Int}(undef, length(tt_dense)) + searchsortedlast!(out_auto, v, tt_dense; strategy = Auto()) + searchsortedlast!(out_linear, v, tt_dense; strategy = LinearScan()) + @test out_auto == out_linear + + # Sparse queries on a long vector: same correctness check. + v_long = collect(0.0:0.001:100.0) # n=100001 + tt_sparse = sort!(rand(StableRNG(2), 10) .* 100.0) + out_a = Vector{Int}(undef, length(tt_sparse)) + out_e = Vector{Int}(undef, length(tt_sparse)) + searchsortedlast!(out_a, v_long, tt_sparse; strategy = Auto()) + searchsortedlast!(out_e, v_long, tt_sparse; strategy = ExpFromLeft()) + @test out_a == out_e + + # Dense burst — queries clustered inside one segment of v. + # The span-based gap detection should send Auto to LinearScan. + seg_lo = v[length(v) ÷ 2] + seg_hi = v[length(v) ÷ 2 + 1] + tt_burst = sort!(seg_lo .+ (seg_hi - seg_lo) .* rand(StableRNG(3), 2048)) + out_b = Vector{Int}(undef, length(tt_burst)) + out_l = Vector{Int}(undef, length(tt_burst)) + searchsortedlast!(out_b, v, tt_burst; strategy = Auto()) + searchsortedlast!(out_l, v, tt_burst; strategy = LinearScan()) + @test out_b == out_l + + # m=1 fast path — bypass span heuristic + out1 = Vector{Int}(undef, 1) + searchsortedlast!(out1, v, [5.123]; strategy = Auto()) + @test out1[1] == searchsortedlast(v, 5.123) + + # m=0 returns the output untouched (empty vector) + empty_out = Int[] + @test searchsortedlast!(empty_out, v, Float64[]; strategy = Auto()) === empty_out + @test isempty(empty_out) + + # Non-numeric eltype: span heuristic falls back to length-ratio. + vs = ["a", "b", "c", "d", "e", "f", "g", "h"] + qs = ["b", "d", "f"] + outs = Vector{Int}(undef, length(qs)) + searchsortedlast!(outs, vs, qs; strategy = Auto()) + @test outs == searchsortedlast.(Ref(vs), qs) + end + + @safetestset "SearchProperties cache" begin + using FindFirstFunctions: + Auto, SearchProperties, searchsortedlast!, searchsortedfirst! + using StableRNGs + + # The sentinel struct. + @test !SearchProperties().has_props + + # A populated SearchProperties from a linear, NaN-free vector. + v = collect(0.0:0.001:100.0) + props = SearchProperties(v) + @test props.has_props + @test props.is_linear + @test !props.has_nan + + # Output equivalence: Auto(props) returns the same answers as Auto() + # on a sparse-on-long-linear regime (where InterpolationSearch is + # the picked strategy via the cached `is_linear`). + tt = sort!(rand(StableRNG(10), 16) .* 100.0) + out_cached = Vector{Int}(undef, length(tt)) + out_baseline = Vector{Int}(undef, length(tt)) + out_truth = searchsortedlast.(Ref(v), tt) + searchsortedlast!(out_cached, v, tt; strategy = Auto(props)) + searchsortedlast!(out_baseline, v, tt; strategy = Auto()) + @test out_cached == out_truth + @test out_baseline == out_truth + + # searchsortedfirst path takes the same branch. + searchsortedfirst!(out_cached, v, tt; strategy = Auto(props)) + searchsortedfirst!(out_baseline, v, tt; strategy = Auto()) + @test out_cached == searchsortedfirst.(Ref(v), tt) + @test out_baseline == searchsortedfirst.(Ref(v), tt) + + # Float vector with a NaN: props.has_nan is true. The cache + # currently isn't consumed for has_nan in Auto's decision tree, + # but the field is populated correctly. + vnan = [1.0, 2.0, NaN, 4.0, 5.0] + propsnan = SearchProperties(vnan) + @test propsnan.has_nan + + # Non-float eltype: has_nan is always false. + vi = collect(Int64, 1:100) + @test !SearchProperties(vi).has_nan + + # Lying SearchProperties (claims is_linear=true on non-linear data) + # is still correctness-preserving — Auto's "InterpolationSearch on + # linear data" branch handles the false positive gracefully + # because InterpolationSearch's bad guess just makes BracketGallop + # wider, never incorrect. + v_log = exp.(range(0.0, 10.0; length = 4096)) + lying = SearchProperties(true, true, false, false) + tt_log = sort!(rand(StableRNG(11), 8) .* (v_log[end] - v_log[1]) .+ v_log[1]) + out_lying = Vector{Int}(undef, length(tt_log)) + searchsortedlast!(out_lying, v_log, tt_log; strategy = Auto(lying)) + @test out_lying == searchsortedlast.(Ref(v_log), tt_log) + + # Bits-ness: SearchProperties must be isbits so it doesn't allocate. + @test isbitstype(SearchProperties) + + # is_log_linear field: populated by SearchProperties(v) on + # geometric data, rejected on linear / two-scale data. + v_log = collect(exp.(range(0.0, log(1.0e6); length = 65536))) + p_log = SearchProperties(v_log) + @test p_log.is_log_linear + + v_lin = collect(0.0:0.001:65.0) + p_lin = SearchProperties(v_lin) + @test !p_lin.is_log_linear + + # Two-scale data should fail both linearity probes. + v_2s = sort!(vcat(range(0.0, 1.0; length = 32768), range(1.0, 100.0; length = 32768))) + p_2s = SearchProperties(v_2s) + @test !p_2s.is_log_linear + + # is_log_linear requires strictly positive v; mixed-sign rejects. + v_signed = collect(-100.0:0.001:65.0) + p_signed = SearchProperties(v_signed) + @test !p_signed.is_log_linear + end + + @safetestset "Batched in-place searchsorted!" begin + using FindFirstFunctions: + LinearScan, BracketGallop, BinaryBracket, Auto, + searchsortedlast!, searchsortedfirst! + + # Sorted queries v = collect(1:100) x_sorted = [5, 10, 20, 50, 90] + out_last = Vector{Int}(undef, length(x_sorted)) + out_first = Vector{Int}(undef, length(x_sorted)) - # searchsortedfirstvec should match element-wise searchsortedfirst - result_first = searchsortedfirstvec(v, x_sorted) - expected_first = searchsortedfirst.(Ref(v), x_sorted) - @test result_first == expected_first + @test searchsortedlast!(out_last, v, x_sorted) == + searchsortedlast.(Ref(v), x_sorted) + @test searchsortedfirst!(out_first, v, x_sorted) == + searchsortedfirst.(Ref(v), x_sorted) + @test out_last === searchsortedlast!(out_last, v, x_sorted) # in-place - # searchsortedlastvec should match element-wise searchsortedlast - result_last = searchsortedlastvec(v, x_sorted) - expected_last = searchsortedlast.(Ref(v), x_sorted) - @test result_last == expected_last + # Each strategy gives the same result on sorted input + for strategy in + (LinearScan(), BracketGallop(), BinaryBracket(), Auto()) + fill!(out_last, 0) + fill!(out_first, 0) + searchsortedlast!(out_last, v, x_sorted; strategy = strategy) + searchsortedfirst!(out_first, v, x_sorted; strategy = strategy) + @test out_last == searchsortedlast.(Ref(v), x_sorted) + @test out_first == searchsortedfirst.(Ref(v), x_sorted) + end - # Unsorted input falls back to element-wise + # Unsorted falls back to per-element regardless of strategy x_unsorted = [50, 10, 90, 5, 20] - @test searchsortedfirstvec(v, x_unsorted) == searchsortedfirst.(Ref(v), x_unsorted) - @test searchsortedlastvec(v, x_unsorted) == searchsortedlast.(Ref(v), x_unsorted) + out = Vector{Int}(undef, length(x_unsorted)) + for strategy in (LinearScan(), BracketGallop(), Auto()) + searchsortedlast!(out, v, x_unsorted; strategy = strategy) + @test out == searchsortedlast.(Ref(v), x_unsorted) + searchsortedfirst!(out, v, x_unsorted; strategy = strategy) + @test out == searchsortedfirst.(Ref(v), x_unsorted) + end + + # Reverse-order vector + reverse-sorted queries + v_rev = collect(10.0:-1.0:1.0) + x_rev_sorted = [9.5, 7.5, 5.0, 2.5, 0.5] # sorted descending + out_r = Vector{Int}(undef, length(x_rev_sorted)) + searchsortedlast!( + out_r, v_rev, x_rev_sorted; order = Base.Order.Reverse + ) + @test out_r == + [searchsortedlast(v_rev, x, Base.Order.Reverse) for x in x_rev_sorted] - # Float vectors + # Floats + values between grid points vf = collect(0.0:0.1:10.0) - xf_sorted = [0.5, 1.0, 2.5, 5.0, 9.5] - @test searchsortedfirstvec(vf, xf_sorted) == searchsortedfirst.(Ref(vf), xf_sorted) - @test searchsortedlastvec(vf, xf_sorted) == searchsortedlast.(Ref(vf), xf_sorted) - - # Edge cases - values outside range - x_edges = [-5, 0, 1, 100, 150] - @test searchsortedfirstvec(v, x_edges) == searchsortedfirst.(Ref(v), x_edges) - @test searchsortedlastvec(v, x_edges) == searchsortedlast.(Ref(v), x_edges) - - # Empty input vector - @test searchsortedfirstvec(v, Int[]) == Int[] - @test searchsortedlastvec(v, Int[]) == Int[] - - # Single element search - @test searchsortedfirstvec(v, [50]) == [50] - @test searchsortedlastvec(v, [50]) == [50] - - # Values between grid points - x_between = [1.5, 10.5, 50.5, 99.5] - @test searchsortedfirstvec(v, x_between) == searchsortedfirst.(Ref(v), x_between) - @test searchsortedlastvec(v, x_between) == searchsortedlast.(Ref(v), x_between) + xf = [0.5, 1.0, 2.5, 5.0, 9.5] + outf = Vector{Int}(undef, length(xf)) + searchsortedlast!(outf, vf, xf) + @test outf == searchsortedlast.(Ref(vf), xf) + + # Edge cases: out-of-range queries on each side + x_edges = [-5.0, 0.0, 5.0, 10.0, 15.0] + sort!(x_edges) + oute = Vector{Int}(undef, length(x_edges)) + searchsortedlast!(oute, vf, x_edges) + @test oute == searchsortedlast.(Ref(vf), x_edges) + searchsortedfirst!(oute, vf, x_edges) + @test oute == searchsortedfirst.(Ref(vf), x_edges) + + # Empty queries + @test searchsortedlast!(Int[], v, Int[]) == Int[] + @test searchsortedfirst!(Int[], v, Int[]) == Int[] + + # DimensionMismatch + @test_throws DimensionMismatch searchsortedlast!(zeros(Int, 2), v, [1, 2, 3]) + @test_throws DimensionMismatch searchsortedfirst!(zeros(Int, 2), v, [1, 2, 3]) + + # Sparse queries on a long vector — exercises the BracketGallop hint path + v_big = collect(1:100_000) + x_sparse = [100, 50_000, 99_900] + outb = Vector{Int}(undef, length(x_sparse)) + searchsortedlast!(outb, v_big, x_sparse) + @test outb == searchsortedlast.(Ref(v_big), x_sparse) + + # Range fast paths route through the strategy dispatch. + r = 1:100 + outr = Vector{Int}(undef, length(x_sorted)) + for strategy in (LinearScan(), BracketGallop(), Auto()) + searchsortedlast!(outr, r, x_sorted; strategy = strategy) + @test outr == searchsortedlast.(Ref(r), x_sorted) + end + end + + @safetestset "searchsortedrange" begin + using FindFirstFunctions, StableRNGs + v = collect(0.0:0.5:50.0) + # Compare against Base composition for several (lo, hi) pairs. + for (lo, hi) in [ + (5.0, 7.0), (0.0, 100.0), (-1.0, 5.5), (10.0, 10.0), + (45.0, 60.0), (51.0, 100.0), + ] + expected = searchsortedfirst(v, lo):searchsortedlast(v, hi) + for strategy in ( + Auto(), BinaryBracket(), BracketGallop(), LinearScan(), + InterpolationSearch(), ExpFromLeft(), SIMDLinearScan(), + ) + @test searchsortedrange(strategy, v, lo, hi) == expected + # Hinted form. + h = clamp(searchsortedfirst(v, lo), 1, length(v)) + @test searchsortedrange(strategy, v, lo, hi, h) == expected + end + end + # Random fuzz on Int64. + rng = StableRNG(2026) + vi = sort!(rand(rng, Int64(-100):Int64(100), 200)) + for _ in 1:200 + lo, hi = sort([rand(rng, Int64(-110):Int64(110)) for _ in 1:2]) + want = searchsortedfirst(vi, lo):searchsortedlast(vi, hi) + @test searchsortedrange(Auto(), vi, lo, hi) == want + @test searchsortedrange(BracketGallop(), vi, lo, hi, 100) == want + end + # Type stability: result is UnitRange{Int}. + @test typeof(searchsortedrange(Auto(), v, 5.0, 7.0)) === UnitRange{Int} + end + + @safetestset "queries_sorted kwarg" begin + using FindFirstFunctions, StableRNGs + v = collect(0.0:0.5:100.0) + rng = StableRNG(2026) + sorted_q = sort!(rand(rng, 64) .* 100.0) + unsorted_q = rand(rng, 64) .* 100.0 + out = Vector{Int}(undef, 64) + expected_sorted = searchsortedlast.(Ref(v), sorted_q) + expected_unsorted = searchsortedlast.(Ref(v), unsorted_q) + + # Default behaviour (nothing) — runtime issorted check. + searchsortedlast!(out, v, sorted_q; strategy = Auto()) + @test out == expected_sorted + searchsortedlast!(out, v, unsorted_q; strategy = Auto()) + @test out == expected_unsorted + + # Explicit queries_sorted = true: trust the caller's sortedness. + searchsortedlast!(out, v, sorted_q; strategy = Auto(), queries_sorted = true) + @test out == expected_sorted + # Across every shipped strategy. + for strategy in ( + LinearScan(), SIMDLinearScan(), BracketGallop(), + ExpFromLeft(), InterpolationSearch(), BinaryBracket(), + ) + searchsortedlast!( + out, v, sorted_q; + strategy = strategy, queries_sorted = true + ) + @test out == expected_sorted + searchsortedfirst!( + out, v, sorted_q; + strategy = strategy, queries_sorted = true + ) + @test out == searchsortedfirst.(Ref(v), sorted_q) + end + + # Explicit queries_sorted = false: take the unsorted-loop path + # unconditionally, even on sorted input — answers must still be + # correct (the unsorted loop is per-query unhinted Base call). + searchsortedlast!(out, v, sorted_q; strategy = Auto(), queries_sorted = false) + @test out == expected_sorted + searchsortedlast!(out, v, unsorted_q; strategy = Auto(), queries_sorted = false) + @test out == expected_unsorted + end + + @safetestset "SIMDLinearScan correctness" begin + using FindFirstFunctions, StableRNGs + F = FindFirstFunctions + + @testset "Int64 fuzz vs Base" begin + rng = StableRNG(2026) + for _ in 1:5_000 + n = rand(rng, 1:512) + v = sort!(rand(rng, Int64(-1000):Int64(1000), n)) + x = rand(rng, Int64(-1100):Int64(1100)) + hint = rand(rng, 1:n) + @test searchsortedlast(F.SIMDLinearScan(), v, x, hint) == + searchsortedlast(v, x) + @test searchsortedfirst(F.SIMDLinearScan(), v, x, hint) == + searchsortedfirst(v, x) + end + end + + @testset "Float64 fuzz vs Base" begin + rng = StableRNG(2027) + for _ in 1:5_000 + n = rand(rng, 1:512) + v = sort!(randn(rng, n)) + x = (rand(rng) - 0.5) * 6 + hint = rand(rng, 1:n) + @test searchsortedlast(F.SIMDLinearScan(), v, x, hint) == + searchsortedlast(v, x) + @test searchsortedfirst(F.SIMDLinearScan(), v, x, hint) == + searchsortedfirst(v, x) + end + end + + @testset "Edge cases (Int64)" begin + v = collect(Int64, 1:100) + # Out-of-range hint is clamped. + @test searchsortedlast(F.SIMDLinearScan(), v, Int64(50), -5) == 50 + @test searchsortedlast(F.SIMDLinearScan(), v, Int64(50), 1_000) == 50 + # x below/above the range. + @test searchsortedlast(F.SIMDLinearScan(), v, Int64(-10), 50) == 0 + @test searchsortedlast(F.SIMDLinearScan(), v, Int64(1_000), 50) == 100 + @test searchsortedfirst(F.SIMDLinearScan(), v, Int64(-10), 50) == 1 + @test searchsortedfirst(F.SIMDLinearScan(), v, Int64(1_000), 50) == 101 + # Empty and single-element vectors. + vempty = Int64[] + @test searchsortedlast(F.SIMDLinearScan(), vempty, Int64(5), 1) == 0 + @test searchsortedfirst(F.SIMDLinearScan(), vempty, Int64(5), 1) == 1 + v1 = Int64[42] + @test searchsortedlast(F.SIMDLinearScan(), v1, Int64(42), 1) == 1 + @test searchsortedfirst(F.SIMDLinearScan(), v1, Int64(42), 1) == 1 + # Duplicates. + vd = Int64[1, 2, 2, 2, 5] + @test searchsortedlast(F.SIMDLinearScan(), vd, Int64(2), 1) == 4 + @test searchsortedfirst(F.SIMDLinearScan(), vd, Int64(2), 5) == 2 + end + + @testset "Fallback: non-Int64/Float64 eltypes" begin + # Int32 vectors must hit the generic LinearScan fallback, + # not the Int64 SIMD primitive. + v32 = Int32[1, 5, 10, 20, 50, 100, 200] + for x in (Int32(0), Int32(7), Int32(20), Int32(300)) + for hint in 1:length(v32) + @test searchsortedlast(F.SIMDLinearScan(), v32, x, hint) == + searchsortedlast(v32, x) + @test searchsortedfirst(F.SIMDLinearScan(), v32, x, hint) == + searchsortedfirst(v32, x) + end + end + # Float32 same. + v32f = Float32[1.0, 5.0, 10.0, 20.0, 50.0] + for x in (Float32(0.0), Float32(7.0), Float32(20.0), Float32(100.0)) + @test searchsortedlast(F.SIMDLinearScan(), v32f, x, 2) == + searchsortedlast(v32f, x) + end + # Non-numeric. + vs = sort!(["alpha", "beta", "gamma", "delta", "epsilon"]) + @test searchsortedlast(F.SIMDLinearScan(), vs, "gamma", 2) == + searchsortedlast(vs, "gamma") + end + + @testset "Fallback: no hint, reverse order" begin + v = collect(Int64, 1:100) + # No hint → BinaryBracket. + @test searchsortedlast(F.SIMDLinearScan(), v, Int64(50)) == + searchsortedlast(v, Int64(50)) + # Reverse order → scalar LinearScan. + v_rev = collect(Int64, 100:-1:1) + @test searchsortedlast( + F.SIMDLinearScan(), v_rev, Int64(50), 1; order = Base.Order.Reverse + ) == searchsortedlast(v_rev, Int64(50), Base.Order.Reverse) + end + end + + @safetestset "findequal + BisectThenSIMD" begin + using FindFirstFunctions, StableRNGs + F = FindFirstFunctions + + # Reference: an Int sentinel-returning equality search built from + # Base.searchsortedfirst. + function ref_findequal(v, x) + i = searchsortedfirst(v, x) + return (i > lastindex(v) || !isequal(v[i], x)) ? + (firstindex(v) - 1) : i + end + + @testset "Strategy parity on Int64 (1-based)" begin + rng = StableRNG(3001) + for _ in 1:2_000 + n = rand(rng, 1:256) + v = sort!(rand(rng, Int64(-50):Int64(50), n)) + x = rand(rng, Int64(-60):Int64(60)) + hint = rand(rng, 1:n) + want = ref_findequal(v, x) + for strategy in ( + F.BinaryBracket(), F.BracketGallop(), + F.SIMDLinearScan(), F.LinearScan(), + F.ExpFromLeft(), F.InterpolationSearch(), + F.Auto(), F.BisectThenSIMD(), + ) + @test F.findequal(strategy, v, x) == want + @test F.findequal(strategy, v, x, hint) == want + end + end + end + + @testset "Strategy parity on Float64" begin + rng = StableRNG(3002) + for _ in 1:500 + n = rand(rng, 1:256) + v = sort!(randn(rng, n)) + # Mix queries that hit elements with ones that don't. + x = rand(rng) < 0.4 ? v[rand(rng, 1:n)] : + (rand(rng) - 0.5) * 6 + hint = rand(rng, 1:n) + want = ref_findequal(v, x) + for strategy in ( + F.BinaryBracket(), F.BracketGallop(), + F.SIMDLinearScan(), F.Auto(), F.BisectThenSIMD(), + ) + @test F.findequal(strategy, v, x) == want + @test F.findequal(strategy, v, x, hint) == want + end + end + end + + @testset "BisectThenSIMD shortcut uses SIMD on DenseVector{Int64}" begin + # Compare against findfirstsortedequal directly. + v = collect(Int64, 1:10_000) + for x in ( + Int64(1), Int64(5_000), Int64(10_000), + Int64(0), Int64(10_001), Int64(-100), Int64(20_000), + ) + a = F.findequal(F.BisectThenSIMD(), v, x) + b = F.findfirstsortedequal(x, v) + @test (a == 0 ? nothing : a) == b + end + end + + @testset "Sentinel for OffsetArray-style indexing" begin + # Manually shift the index base by using a UnitRange directly. + v = collect(Int64, 10:20) + @test F.findequal(F.Auto(), v, Int64(15)) == 6 + @test F.findequal(F.Auto(), v, Int64(100)) == 0 + @test F.findequal(F.Auto(), v, Int64(-5)) == 0 + end + + @testset "Reverse ordering" begin + v_rev = collect(Int64, 10:-1:1) + # Forward findequal on a reverse-sorted vector with the + # Reverse ordering should still find the element if present. + @test F.findequal( + F.BinaryBracket(), v_rev, Int64(5); + order = Base.Order.Reverse, + ) == 6 + @test F.findequal( + F.BinaryBracket(), v_rev, Int64(99); + order = Base.Order.Reverse, + ) == 0 + # BisectThenSIMD on reverse order falls back to generic path. + @test F.findequal( + F.BisectThenSIMD(), v_rev, Int64(5); + order = Base.Order.Reverse, + ) == 6 + end + + @testset "Empty and single-element" begin + vempty = Int64[] + @test F.findequal(F.Auto(), vempty, Int64(0)) == 0 + @test F.findequal(F.BisectThenSIMD(), vempty, Int64(0)) == 0 + v1 = Int64[42] + @test F.findequal(F.Auto(), v1, Int64(42)) == 1 + @test F.findequal(F.Auto(), v1, Int64(7)) == 0 + @test F.findequal(F.BisectThenSIMD(), v1, Int64(42)) == 1 + end + + @testset "BisectThenSIMD in positional dispatch falls back" begin + # When used with searchsortedfirst/last, BisectThenSIMD just + # delegates to BinaryBracket — its purpose is findequal. + v = collect(Int64, 1:100) + @test searchsortedfirst(F.BisectThenSIMD(), v, Int64(50)) == + searchsortedfirst(v, Int64(50)) + @test searchsortedlast(F.BisectThenSIMD(), v, Int64(50)) == + searchsortedlast(v, Int64(50)) + end end end