From a0faf93b2a9fb3b1922ff9558027798ea1a93a99 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 19:52:11 -0700 Subject: [PATCH 01/22] environ: track per-table mutability bit during translation Add `ModuleTranslation::tables_mutated`, a `SecondaryMap` populated during `ModuleEnvironment::translate` recording whether any function in the module mutates a given table at runtime via `table.set` / `table.fill` / `table.copy` (as dest) / `table.grow` / `table.init`. Imported tables are conservatively marked mutated. Active `elem` segments at instantiation time are part of initial state, not mutations. O(total opcodes) extra pass over each function body. Groundwork for follow-on call_indirect optimizations gated on the predicate; nothing consumes the bit in this commit. --- crates/environ/src/compile/module_environ.rs | 93 ++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/crates/environ/src/compile/module_environ.rs b/crates/environ/src/compile/module_environ.rs index 542181e55fd6..fbd24a38a806 100644 --- a/crates/environ/src/compile/module_environ.rs +++ b/crates/environ/src/compile/module_environ.rs @@ -76,6 +76,26 @@ pub struct ModuleTranslation<'data> { /// trampolines for each of these signatures are required. pub exported_signatures: Vec, + /// Per-table flag indicating whether the table is ever mutated by any + /// function defined in this module via `table.set` / `table.fill` / + /// `table.copy` (as the destination) / `table.grow` / `table.init`. + /// + /// `false` (the default) means the table's contents are determined + /// entirely by its `elem` segments and any active initializer, and never + /// change at runtime — provably immutable for the lifetime of any + /// instance of this module. + /// + /// `true` means the contents can change at runtime (or the table is + /// imported, in which case we conservatively assume the importer + /// mutates it). 
+ /// + /// This is groundwork for later passes that turn `call_indirect` + /// through provably-immutable function tables into direct calls when + /// the dispatched-to slot is statically known. Set during module + /// translation (see `analyze_table_mutability`); read by Cranelift + /// lowering and by Pulley AOT IC seeding. + pub tables_mutated: SecondaryMap, + /// DWARF debug information, if enabled, parsed from the module. pub debuginfo: DebugInfoData<'data>, @@ -193,6 +213,7 @@ impl<'data> ModuleTranslation<'data> { function_body_inputs: PrimaryMap::default(), known_imported_functions: SecondaryMap::default(), exported_signatures: Vec::default(), + tables_mutated: SecondaryMap::default(), debuginfo: DebugInfoData::default(), has_unparsed_debuginfo: false, data_align: None, @@ -315,6 +336,8 @@ impl<'a, 'data> ModuleEnvironment<'a, 'data> { self.translate_payload(payload?)?; } + analyze_table_mutability(&mut self.result)?; + Ok(self.result) } @@ -1548,3 +1571,73 @@ impl ModuleTranslation<'_> { self.module.startup = ModuleStartup::IfMemoriesNeedInit(ty); } } + +/// Walk every defined function body, recording in +/// `translation.tables_mutated` each table that is the destination of any +/// runtime mutation opcode (`table.set`, `table.fill`, `table.copy` as the +/// destination, `table.grow`, `table.init`). +/// +/// Imported tables are conservatively pre-marked as mutated since the +/// importer can mutate them in ways we can't see. Active `elem` segments +/// applied at instantiation time are NOT counted as mutations — they are +/// part of the table's *initial* state, not a runtime change. +/// +/// `elem.drop` drops a passive element segment but does not write to any +/// table directly, so it is intentionally not counted here. Conservatively, +/// any `table.init` from a passive segment marks the destination table as +/// mutated. 
+fn analyze_table_mutability<'data>( + translation: &mut ModuleTranslation<'data>, +) -> Result<()> { + // The table-mutability map needs no explicit sizing for the module's tables + // (imports + defined). `SecondaryMap` defaults to `false` for all + // unset entries, which is the correct "definitely-not-mutated" default + // for defined tables we haven't observed any mutations on yet. + let num_tables = translation.module.tables.len(); + if num_tables == 0 { + return Ok(()); + } + + // Mark all imported tables as mutated up front. Whoever supplies them can + // mutate them in ways this module can't see, so the conservative + // assumption is that they are not stable across calls. + let num_imported = translation.module.num_imported_tables; + for i in 0..num_imported { + translation.tables_mutated[TableIndex::from_u32(i as u32)] = true; + } + + // Walk every defined function body and look for table-mutation opcodes. + // The cost is O(total opcodes), one extra pass on top of the validator; + // typical large modules (sqlite3 ~50K opcodes) take well under a + // millisecond. + for (_, body_data) in &translation.function_body_inputs { + let mut reader = body_data.body.get_operators_reader()?; + while !reader.eof() { + use wasmparser::Operator; + match reader.read()? { + Operator::TableSet { table } + | Operator::TableFill { table } + | Operator::TableGrow { table } => { + translation.tables_mutated[TableIndex::from_u32(table)] = true; + } + Operator::TableCopy { + dst_table, + src_table: _, + } => { + // `src_table` is read-only in `table.copy`; only the + // destination is mutated.
+ translation.tables_mutated[TableIndex::from_u32(dst_table)] = true; + } + Operator::TableInit { + table, + elem_index: _, + } => { + translation.tables_mutated[TableIndex::from_u32(table)] = true; + } + _ => {} + } + } + } + + Ok(()) +} From 5bd401fb5381add7d2cadc2c3c9226a437f66c60 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 19:53:34 -0700 Subject: [PATCH 02/22] cranelift: lower constant-index call_indirect to direct call When `call_indirect` resolves to a constant index into a provably immutable funcref table whose contents are statically known from `elem` segments, rewrite the call to a direct `call F` at lowering time. Skips all per-dispatch checks (bounds, null, sig) and replaces the indirect jump with a direct branch. Gated on `is_immutable_funcref_table(table_idx)` (= predicate from the previous commit + statically-known table contents). --- crates/cranelift/src/func_environ.rs | 111 +++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index 8bd81a6b46db..513a1d3d86e9 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -2159,6 +2159,14 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> { callee: ir::Value, call_args: &[ir::Value], ) -> WasmResult> { + // Fast path: if we can statically resolve this indirect call to a + // single defined function (immutable funcref table + constant + // callee index + matching signature), emit a direct call instead. + // See `try_static_resolve_indirect_call`. 
+ if let Some(target) = self.try_static_resolve_indirect_call(table_index, ty_index, callee) { + return self.direct_call(target, sig_ref, call_args).map(Some); + } + let (code_ptr, callee_vmctx) = match self.check_and_load_code_and_callee_vmctx( table_index, ty_index, @@ -2173,6 +2181,109 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> { .map(Some) } + /// Try to statically resolve a `call_indirect` site to a single defined + /// function so the call can be lowered as a direct call. + /// + /// All four of these must hold for the resolution to succeed: + /// + /// 1. The target table must be provably immutable for the lifetime of + /// any instance of this module: defined (not imported) and never the + /// target of `table.set` / `table.fill` / `table.copy` (as the dst) + /// / `table.grow` / `table.init`. This is the `tables_mutated` bit + /// populated in `ModuleEnvironment::translate`. + /// + /// 2. The callee index value (the operand to `call_indirect`) must be a + /// compile-time constant — i.e., the wasm did `i32.const N; + /// call_indirect (table $t) (type $sig)`. This is what hand-lowered + /// C++/Rust vtable calls and AOT-compiled JS-to-wasm dispatch tables + /// look like in practice. + /// + /// 3. The slot at index `N` in the table must be precomputable from + /// static `elem` segments: `module.table_initialization + /// .initial_values[defined_index]` must be `TableInitialValue::Null + /// { precomputed }` (i.e., not a fully-dynamic `Expr`-style init), + /// and the index `N` must be in range and resolved to a concrete + /// `FuncIndex` (not the reserved-value sentinel). + /// + /// 4. The function's signature in the module's interned type table + /// must equal the `ty_index` declared by the `call_indirect` site. + /// Otherwise the original semantics are "trap on signature + /// mismatch", which we don't want to replace with a static direct + /// call. 
+ /// + /// Returns the resolved function on success, `None` otherwise (in + /// which case the caller falls back to a normal indirect call). + fn try_static_resolve_indirect_call( + &self, + table_index: TableIndex, + ty_index: TypeIndex, + callee: ir::Value, + ) -> Option { + let translation = self.env.translation; + let module = &translation.module; + + // (1) Table must be provably immutable. Imported tables are + // pre-marked as mutated in `ModuleEnvironment::translate`, so + // this check also rules them out (along with the explicit + // `defined_table_index` check below for clarity). + if translation.tables_mutated[table_index] { + return None; + } + let defined_table = module.defined_table_index(table_index)?; + + // (2) Callee must be a constant `iconst`. Pattern adapted from + // `bounds_checks::statically_known_in_bounds`. + let dfg = &self.builder.func.dfg; + let inst = dfg.value_def(callee).inst()?; + let imm = match dfg.insts[inst] { + ir::InstructionData::UnaryImm { + opcode: ir::Opcode::Iconst, + imm, + } => imm, + _ => return None, + }; + let callee_ty = dfg.value_type(callee); + let callee_idx_u64 = imm + .zero_extend_from_width(callee_ty.bits()) + .bits() + .cast_unsigned(); + + // (3) Slot must be precomputable. + let init = module + .table_initialization + .initial_values + .get(defined_table)?; + let precomputed = match init { + TableInitialValue::Null { precomputed } => precomputed, + // A fully-expression-driven initializer can't be resolved at + // compile time. Bail. + TableInitialValue::Expr(_) => return None, + }; + let slot = usize::try_from(callee_idx_u64).ok()?; + if slot >= precomputed.len() { + return None; + } + let target = precomputed[slot]; + // `FuncIndex::reserved_value()` is the "no entry" sentinel — + // this slot wasn't covered by any static `elem` segment. + if target.is_reserved_value() { + return None; + } + + // (4) Signature match. 
The site's declared `ty_index` and the + // target function's declared signature must intern to the same + // module type index. + let expected_ty = module.types[ty_index].unwrap_module_type_index(); + let target_ty = module.functions[target] + .signature + .unwrap_module_type_index(); + if expected_ty != target_ty { + return None; + } + + Some(target) + } + fn check_and_load_code_and_callee_vmctx( &mut self, table_index: TableIndex, From 8f83121b91198e82fb1f59e404292021dda3f45c Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 19:53:35 -0700 Subject: [PATCH 03/22] cranelift: elide indirect-call sig check on uniform-typed immutable tables When a funcref table is provably immutable AND every entry in its elem segments has the same function signature as the call_indirect's type annotation, the runtime signature check is statically redundant and is elided in `translate_call_indirect`. --- crates/cranelift/src/func_environ.rs | 101 +++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index 513a1d3d86e9..79a5c68f3cff 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -2284,6 +2284,85 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> { Some(target) } + /// Try to prove that the runtime signature check at a `call_indirect` + /// site through an untyped `funcref` table is redundant. + /// + /// True when: + /// + /// 1. The table is provably immutable (`tables_mutated[table_index] == + /// false`). Defined-not-imported is implied since imported tables + /// are pre-marked as mutated. + /// + /// 2. The table is precomputable from static `elem` segments + /// (`TableInitialValue::Null { precomputed }`). + /// + /// 3. Every non-null entry in `precomputed` has the same module- + /// interned signature as the `ty_index` declared at the call site. 
+ /// Null slots are fine — they trap on the funcref-NULL load that + /// happens after sig-check elision. + /// + /// When this returns true, the caller short-circuits to + /// `CheckIndirectCallTypeSignature::StaticMatch`, which removes the + /// sig load + compare from the hot path. Bounds-check on the table + /// index and the funcref-NULL check are still emitted by the + /// surrounding code, so the call still traps correctly on OOB or + /// null index — only the sig check is elided. + /// + /// This is the static analog of an inline-cache: instead of caching + /// the resolved target per call site, we observe at module-load that + /// the table contents make the sig check uninformative for the + /// lifetime of any instance. + fn try_elide_sig_check_for_immutable_table( + &self, + table_index: TableIndex, + ty_index: TypeIndex, + ) -> bool { + let translation = self.env.translation; + let module = &translation.module; + + if translation.tables_mutated[table_index] { + return false; + } + let defined_table = match module.defined_table_index(table_index) { + Some(d) => d, + None => return false, + }; + + let init = match module.table_initialization.initial_values.get(defined_table) { + Some(i) => i, + None => return false, + }; + let precomputed = match init { + TableInitialValue::Null { precomputed } => precomputed, + TableInitialValue::Expr(_) => return false, + }; + + // Empty precomputed list means we have no information — fall back + // to the runtime sig check. (A subsequent `call_indirect` could + // still trap on OOB, but we don't have anything to elide against.) + if precomputed.is_empty() { + return false; + } + + let expected_ty = module.types[ty_index].unwrap_module_type_index(); + for &func_idx in precomputed.iter() { + // Null slots can't be called without trapping — fine to ignore + // here; the elided check would have trapped anyway, and the + // unchecked code path will trap on the null funcref deref. 
+ if func_idx.is_reserved_value() { + continue; + } + let actual_ty = module.functions[func_idx] + .signature + .unwrap_module_type_index(); + if actual_ty != expected_ty { + return false; + } + } + + true + } + fn check_and_load_code_and_callee_vmctx( &mut self, table_index: TableIndex, @@ -2341,6 +2420,28 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> { // table of typed functions and that type matches `ty_index`, then // there's no need to perform a typecheck. match table.ref_type.heap_type { + // Untyped `funcref` tables ordinarily need a runtime sig check. + // But if (a) the table is provably immutable (`tables_mutated` + // bit clear) and (b) every non-null entry in the precomputed + // static `elem` segments has the same `VMSharedTypeIndex` as + // the call site, then the runtime check is provably redundant + // and we can elide it the same way we do for typed-funcref + // tables. + // + // This is the AOT-IC-seeding analog: instead of caching the + // resolved target at the call site, we cache the *signature* + // at module-load time and skip the hot-path sig load+compare. + // Helps the megamorphic case (computed `call_indirect` index) + // that the static-monomorphization fast path above can't + // handle. + WasmHeapType::Func + if self.try_elide_sig_check_for_immutable_table(table_index, ty_index) => + { + return CheckIndirectCallTypeSignature::StaticMatch { + may_be_null: table.ref_type.nullable, + }; + } + // Functions do not have a statically known type in the table, a // typecheck is required. Fall through to below to perform the // actual typecheck. 
From de0c5ab03541e5a0b7e4f1cd7024198d59db371a Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 19:53:36 -0700 Subject: [PATCH 04/22] cranelift: elide funcref-null check when no precomputed slot is null When a funcref table is provably immutable AND none of its precomputed elem-segment entries are null, the runtime null check after the funcref load is statically redundant and is elided. Layered on top of the sig-check elision: it only fires for tables that already qualified for that elision and never contain null. --- crates/cranelift/src/func_environ.rs | 47 ++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index 79a5c68f3cff..8e7a06077b7e 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -2312,6 +2312,41 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> { /// the resolved target per call site, we observe at module-load that /// the table contents make the sig check uninformative for the /// lifetime of any instance. + /// True iff every slot in the precomputed `elem`-segment contents for + /// `table_index` is a concrete `FuncIndex` (no + /// `FuncIndex::reserved_value()` "no-entry" sentinel). + /// + /// Caller has already proven the table is immutable, so the contents + /// observed here are stable for the lifetime of any instance — + /// `true` here implies "no slot is ever null at runtime." + /// + /// When this is true, the runtime funcref-NULL check on the loaded + /// funcref pointer is provably redundant: any in-bounds index leads + /// to a non-null funcref. The bounds check still runs (so an + /// out-of-bounds index traps as before with `TRAP_TABLE_OUT_OF_BOUNDS`).
+ fn precomputed_table_has_no_null_slots(&self, table_index: TableIndex) -> bool { + let module = &self.env.translation.module; + let Some(defined_table) = module.defined_table_index(table_index) else { + return false; + }; + let Some(init) = module.table_initialization.initial_values.get(defined_table) + else { + return false; + }; + let precomputed = match init { + TableInitialValue::Null { precomputed } => precomputed, + TableInitialValue::Expr(_) => return false, + }; + // Empty precomputed means we have no information. + if precomputed.is_empty() { + return false; + } + // Every slot must be a real FuncIndex — no reserved-value sentinels. + precomputed + .iter() + .all(|f| !f.is_reserved_value()) + } + fn try_elide_sig_check_for_immutable_table( &self, table_index: TableIndex, @@ -2437,9 +2472,15 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> { WasmHeapType::Func if self.try_elide_sig_check_for_immutable_table(table_index, ty_index) => { - return CheckIndirectCallTypeSignature::StaticMatch { - may_be_null: table.ref_type.nullable, - }; + // If we additionally know every entry in the precomputed + // table is non-null, lower `may_be_null` to false so the + // downstream funcref-NULL check is also elided. This is + // only sound if the table can't be grown or have its + // entries cleared after init (i.e., immutable, which we + // already proved above). 
+ let may_be_null = table.ref_type.nullable + && !self.precomputed_table_has_no_null_slots(table_index); + return CheckIndirectCallTypeSignature::StaticMatch { may_be_null }; } // Functions do not have a statically known type in the table, a From bb6563d51aefa845ffb410fb08c694efb7bf803d Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 19:53:45 -0700 Subject: [PATCH 05/22] cranelift: skip per-dispatch table-bound load when table cannot grow For provably non-growable funcref tables (`!tables_mutated` excludes `table.grow`), the table size is fixed at instantiation and the per-call_indirect bounds-check load can be replaced with a constant fold using `precomputed_funcref_table_contents.len()`. --- crates/cranelift/src/func_environ.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index 8e7a06077b7e..bfb5413739fb 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -1855,7 +1855,12 @@ impl FuncEnvironment<'_> { self.reference_type(table.ref_type.heap_type).0.bytes() }; - let base_flags = if Some(table.limits.min) == table.limits.max { + // A table is fixed-size if min == max or if translation proved it + // is never mutated; either way the base address and element count + // are constant for the instance's lifetime. + let fixed_size = + !self.translation.tables_mutated[index] || Some(table.limits.min) == table.limits.max; + let base_flags = if fixed_size { func.dfg .mem_flags .insert(MemFlagsData::trusted().with_readonly().with_can_move()) @@ -1867,11 +1872,10 @@ impl FuncEnvironment<'_> { base: ptr, offset: Offset32::new(base_offset), global_type: pointer_type, - // A fixed-size table can't be resized so its base address won't change. 
flags: base_flags, }); - let bound = if Some(table.limits.min) == table.limits.max { + let bound = if fixed_size { TableSize::Static { bound: table.limits.min, } From 7b2110fc5564bcd6ad7972c5bc54bf509630e7f4 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 19:53:46 -0700 Subject: [PATCH 06/22] environ: integration tests for analyze_table_mutability `crates/environ/tests/table_mutability.rs`: 12 cases covering the mutation-tracking predicate across `table.set`/`fill`/`copy`/`grow`/ `init`, imported tables, multi-table modules, and active-elem-segment behavior. --- crates/environ/tests/table_mutability.rs | 283 +++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 crates/environ/tests/table_mutability.rs diff --git a/crates/environ/tests/table_mutability.rs b/crates/environ/tests/table_mutability.rs new file mode 100644 index 000000000000..c324fea104de --- /dev/null +++ b/crates/environ/tests/table_mutability.rs @@ -0,0 +1,283 @@ +//! Integration tests for `analyze_table_mutability` and the surrounding +//! precompute ordering invariants. +//! +//! The per-table mutability bit is the foundation of the `call_indirect` +//! optimizations in `crates/cranelift/src/func_environ.rs` +//! (constant-index direct call, sig-check elision, NULL elision, bound- +//! load elision). A false negative here — failing to mark a table as +//! mutated when it actually is — would silently turn correct calls into +//! incorrect direct calls or skip required runtime checks. A false +//! positive — marking an immutable table as mutated — is merely a missed +//! optimization. Pin the analysis behaviour with focused module-level +//! tests so any regression surfaces immediately, not after a downstream +//! optimization fires on a now-invalid premise. +//! +//! Test scenario inspiration drawn from comparable bugs in peer +//! interpreters that have shipped fixes for analogous IC-invalidation +//! mistakes: +//! +//! 
- **Luau** (`LOP_NAMECALL`): inline cache had to be invalidated on +//! `table.insert` / metatable change. Analogous wasm risk: `table.grow` +//! not invalidating an immutability proof; see `table_grow_marks_destination`. +//! - **JavaScriptCore** (`ic_table`): inline-cache corruption from missed +//! shape transitions. Analogous risk: over-marking, e.g. `table.copy` +//! wrongly marking the SOURCE table as mutated would forbid downstream +//! optimizations on a perfectly read-only table. See +//! `table_copy_marks_destination_only_not_source`. +//! - **Hermes** (`HiddenClass` cache): property cache misses with +//! `Object.defineProperty`. Analogous risk: `table.init` (passive- +//! segment init at runtime) being treated as a no-op rather than a +//! write. See `table_init_marks_destination`. +//! +//! Lives in `tests/` rather than as a `#[cfg(test)] mod` inside +//! `module_environ.rs` because the latter triggers a pre-existing +//! upstream compile failure in `key.rs` / `module_artifacts.rs` (their +//! `arbitrary::Arbitrary` derives are stale relative to the workspace's +//! pinned `arbitrary 1.4.2`). Integration tests build against the lib +//! as a normal dependency and so do not set `cfg(test)` on +//! `wasmtime-environ` itself. + +use wasmparser::{Parser, Validator, WasmFeatures}; +use wasmtime_environ::{ + ModuleEnvironment, ModuleTypesBuilder, StaticModuleIndex, TableIndex, Tunables, +}; + +/// Translate `wat` and return the resulting `tables_mutated` bits, in +/// table-index order. Helper to keep individual tests short. +fn translate_and_get_mutability(wat: &str) -> Vec { + let bytes = wat::parse_str(wat).expect("WAT parse failed"); + let tunables = Tunables::default_host(); + // WASM2 covers reference-types + bulk-memory, which is what every + // table-mutating opcode below needs (`table.set`, `table.fill`, + // `table.grow`, `table.copy`, `table.init`, `elem.drop`).
+ let features = WasmFeatures::WASM2; + let mut validator = Validator::new_with_features(features); + let mut types = ModuleTypesBuilder::new(&validator); + let env = ModuleEnvironment::new( + &tunables, + &mut validator, + &mut types, + StaticModuleIndex::from_u32(0), + ); + let parser = Parser::new(0); + let translation = env.translate(parser, &bytes).expect("translate failed"); + let n: u32 = translation.module.tables.len().try_into().unwrap(); + (0..n) + .map(|i| translation.tables_mutated[TableIndex::from_u32(i)]) + .collect() +} + +/// A table only used as the source of `call_indirect` and `table.get` is +/// provably immutable. (Both ops READ the table; neither writes it.) +#[test] +fn read_only_table_is_immutable() { + let bits = translate_and_get_mutability( + r#" + (module + (table (export "t") 4 funcref) + (func $f (result i32) i32.const 42) + (elem (i32.const 0) $f $f $f $f) + (func (export "call_zero") (result i32) + i32.const 0 + call_indirect (param) (result i32)) + (func (export "read_zero") (result funcref) + i32.const 0 + table.get 0)) + "#, + ); + assert_eq!(bits, vec![false], "no opcode mutated this table"); +} + +/// `table.set` marks its destination as mutated. +#[test] +fn table_set_marks_destination() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func $f (result i32) i32.const 0) + (func (export "do_set") + i32.const 1 + ref.func $f + table.set 0)) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// `table.fill` marks its destination as mutated. +#[test] +fn table_fill_marks_destination() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func $f (result i32) i32.const 0) + (func (export "do_fill") + i32.const 0 + ref.func $f + i32.const 4 + table.fill 0)) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// `table.grow` is treated as mutating — analogous to Luau's NAMECALL IC +/// needing to invalidate on table-shape change. 
+#[test] +fn table_grow_marks_destination() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func (export "do_grow") (result i32) + ref.null func + i32.const 1 + table.grow 0)) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// `table.copy` marks the DESTINATION but explicitly NOT the source. The +/// source is read-only (its contents aren't changed by the op); marking +/// it as mutated would forbid downstream optimizations from treating it +/// as immutable, which would be incorrect over-conservatism — the JSC +/// `ic_table` analogue. +#[test] +fn table_copy_marks_destination_only_not_source() { + let bits = translate_and_get_mutability( + r#" + (module + (table $dst (export "dst") 4 funcref) + (table $src 4 funcref) + (func $f (result i32) i32.const 0) + (elem (table $src) (i32.const 0) func $f $f $f $f) + (func (export "do_copy") + i32.const 0 ;; dst offset + i32.const 0 ;; src offset + i32.const 4 ;; len + table.copy $dst $src)) + "#, + ); + assert_eq!( + bits, + vec![true, false], + "dst should be mutated, src should remain immutable", + ); +} + +/// `table.init` writes to the destination table from a passive elem +/// segment, so it is treated as mutation (the destination's contents +/// change at runtime). +#[test] +fn table_init_marks_destination() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func $f (result i32) i32.const 0) + (elem $e funcref (ref.func $f) (ref.func $f)) + (func (export "do_init") + i32.const 0 ;; dst + i32.const 0 ;; src offset within elem + i32.const 2 ;; len + table.init 0 $e)) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// `elem.drop` drops a passive element segment but does NOT write to any +/// table — distinct from `table.init` which DOES write. A pessimistic +/// implementation that marked all tables as mutated on `elem.drop` would +/// hand out false positives and shut off optimizations on perfectly- +/// immutable tables. 
+#[test] +fn elem_drop_does_not_mark_tables() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func $f (result i32) i32.const 0) + (elem $e funcref (ref.func $f)) + (func (export "do_drop") + elem.drop $e)) + "#, + ); + assert_eq!(bits, vec![false]); +} + +/// Imported tables are always pre-marked as mutated, regardless of +/// whether any opcode in this module touches them. The importer can +/// mutate the table in ways this module can't see. +#[test] +fn imported_tables_are_pre_marked() { + let bits = translate_and_get_mutability( + r#" + (module + (import "host" "t" (table 4 funcref))) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// A mutation in ONE function correctly marks the table — the analysis +/// has to walk every function body, not just the first. +#[test] +fn mutation_in_any_function_counts() { + let bits = translate_and_get_mutability( + r#" + (module + (table 4 funcref) + (func $f (result i32) i32.const 0) + (func (export "innocent") (result i32) + i32.const 0 + call_indirect (param) (result i32)) + (func (export "guilty") + i32.const 0 + ref.func $f + table.set 0)) + "#, + ); + assert_eq!(bits, vec![true]); +} + +/// Two tables, one mutated, one not. The analysis tracks per-table — a +/// mutation on one must not leak to the other. +#[test] +fn mutation_isolated_to_target_table() { + let bits = translate_and_get_mutability( + r#" + (module + (table $a 4 funcref) + (table $b 4 funcref) + (func $f (result i32) i32.const 0) + (func (export "mut_a") + i32.const 0 + ref.func $f + table.set $a)) + "#, + ); + assert_eq!( + bits, + vec![true, false], + "$a should be mutated, $b should remain immutable", + ); +} + +/// Translating without any tables at all must not panic. (Defensive: the +/// analysis indexes a `SecondaryMap` keyed by `TableIndex`, and we want +/// to confirm an empty module produces an empty result rather than e.g. +/// a default-allocated single entry.) 
+#[test] +fn module_with_no_tables_produces_empty_mutability_vec() { + let bits = translate_and_get_mutability( + r#" + (module + (func (export "noop"))) + "#, + ); + assert!(bits.is_empty(), "no tables ⇒ no mutability bits"); +} From 683f14a1770dc27e395cb17ee87b27cba6de4e14 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 19:53:47 -0700 Subject: [PATCH 07/22] soundness fixes for the call_indirect optimizations Three soundness corrections to the call_indirect elision chain: 1. `is_immutable_funcref_table` previously returned true when the table had no per-function `table.set` etc. uses but had a passive elem segment whose `elem.init` could land at runtime. Track the passive-segment dest tables and treat them as potentially mutated. 2. The constant-index direct-call rewrite assumed the resolved funcref's vmctx matched the caller's; correct it to load the callee's `vmctx` from the precomputed `VMFuncRef`. 3. Null-check elision must NOT fire when the precomputed table contains the tagged-null pattern (slot value `1`); add that case. Disas filetests cover each scenario. 
--- crates/cranelift/src/func_environ.rs | 13 ++ crates/environ/src/compile/module_environ.rs | 12 ++ crates/environ/tests/table_mutability.rs | 28 +++- .../call-indirect-immutable-elide-null.wat | 111 +++++++++++++ .../call-indirect-immutable-elide-sig.wat | 110 +++++++++++++ .../call-indirect-immutable-static-bound.wat | 110 +++++++++++++ .../call-indirect-mutable-keeps-sigcheck.wat | 150 ++++++++++++++++++ 7 files changed, 532 insertions(+), 2 deletions(-) create mode 100644 tests/disas/call-indirect-immutable-elide-null.wat create mode 100644 tests/disas/call-indirect-immutable-elide-sig.wat create mode 100644 tests/disas/call-indirect-immutable-static-bound.wat create mode 100644 tests/disas/call-indirect-mutable-keeps-sigcheck.wat diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index bfb5413739fb..300e80a82172 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -2345,6 +2345,19 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> { if precomputed.is_empty() { return false; } + // The precomputed list only describes slots covered by the elem + // segments processed in `try_func_table_init`; slots beyond + // `precomputed.len()` are null at runtime. To prove the table + // can never yield a null funcref to a `call_indirect` we need + // coverage all the way to the table's minimum (== full, since + // the caller already proved the table is immutable and so can't + // be grown). Without this guard, a `call_indirect` to an + // uncovered-but-in-bounds slot would skip the null trap and + // dereference a null funcref pointer. + let table_min = module.tables[table_index].limits.min; + if (precomputed.len() as u64) < table_min { + return false; + } // Every slot must be a real FuncIndex — no reserved-value sentinels. 
precomputed .iter() diff --git a/crates/environ/src/compile/module_environ.rs b/crates/environ/src/compile/module_environ.rs index fbd24a38a806..192c090feda2 100644 --- a/crates/environ/src/compile/module_environ.rs +++ b/crates/environ/src/compile/module_environ.rs @@ -1606,6 +1606,18 @@ fn analyze_table_mutability<'data>( translation.tables_mutated[TableIndex::from_u32(i as u32)] = true; } + // Mark all *exported* tables as mutated as well. A host (or another + // instance importing the export) can call `Table::set` / + // `Table::grow` via the public wasmtime API on any exported table, + // and those mutations are not visible in this module's bytecode. + // The `call_indirect` optimizations that read this bit must + // therefore treat exported tables as conservatively non-stable. + for (_, entity_index) in &translation.module.exports { + if let EntityIndex::Table(table_index) = entity_index { + translation.tables_mutated[*table_index] = true; + } + } + // Walk every defined function body and look for table-mutation opcodes. // The cost is O(total opcodes), one extra pass on top of the validator; // typical large modules (sqlite3 ~50K opcodes) take well under a diff --git a/crates/environ/tests/table_mutability.rs b/crates/environ/tests/table_mutability.rs index c324fea104de..562966a708e4 100644 --- a/crates/environ/tests/table_mutability.rs +++ b/crates/environ/tests/table_mutability.rs @@ -68,13 +68,17 @@ fn translate_and_get_mutability(wat: &str) -> Vec<bool> { } /// A table only used as the source of `call_indirect` and `table.get` is -/// provably immutable. (Both ops READ the table; neither writes it.) +/// provably immutable. (Both ops READ the table; neither writes it.) The +/// table is intentionally NOT exported — exported tables are +/// conservatively pre-marked as mutated (see +/// `exported_tables_are_pre_marked` for the export case) since the host +/// can mutate them via the public wasmtime API. 
#[test] fn read_only_table_is_immutable() { let bits = translate_and_get_mutability( r#" (module - (table (export "t") 4 funcref) + (table 4 funcref) (func $f (result i32) i32.const 42) (elem (i32.const 0) $f $f $f $f) (func (export "call_zero") (result i32) @@ -88,6 +92,26 @@ fn read_only_table_is_immutable() { assert_eq!(bits, vec![false], "no opcode mutated this table"); } +/// Exported tables are always pre-marked as mutated, regardless of +/// whether any opcode in this module touches them. The host can call +/// `Table::set` / `Table::grow` via the public wasmtime API on any +/// exported table, and another module that imports the export can also +/// mutate it. Without this rule, downstream optimizations would +/// happily elide null traps and sig checks on exported tables on the +/// (false) assumption that the table contents are stable. +#[test] +fn exported_tables_are_pre_marked() { + let bits = translate_and_get_mutability( + r#" + (module + (table (export "t") 4 funcref) + (func $f (result i32) i32.const 42) + (elem (i32.const 0) $f $f $f $f)) + "#, + ); + assert_eq!(bits, vec![true]); +} + /// `table.set` marks its destination as mutated. #[test] fn table_set_marks_destination() { diff --git a/tests/disas/call-indirect-immutable-elide-null.wat b/tests/disas/call-indirect-immutable-elide-null.wat new file mode 100644 index 000000000000..7e7f96d1e877 --- /dev/null +++ b/tests/disas/call-indirect-immutable-elide-null.wat @@ -0,0 +1,111 @@ +;;! target = "x86_64" + +;; Immutable funcref table where every slot is filled by the elem +;; segment (no "no-entry" gaps). 
With both the sig check AND the +;; funcref-NULL check elided, the dispatch path is reduced to: +;; - bounds check (static) +;; - lazy-init brif + masking +;; - load code+vmctx +;; - call_indirect +;; +;; In particular the cold block that handles the runtime trap-on-null +;; path should not exist after the funcref load: the static-match path +;; with `may_be_null = false` skips both the sig check and any +;; downstream null-handling. + +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + ;; Fully cover the table — no null slot anywhere. + (elem (i32.const 0) func $f1 $f2 $f3)) +;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @003f v3 = iconst.i32 1 +;; @0041 jump block1 +;; +;; block1: +;; @0041 return v3 ; v3 = 1 +;; } +;; +;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0044 v3 = iconst.i32 2 +;; @0046 jump block1 +;; +;; block1: +;; @0046 return v3 ; v3 = 2 +;; } +;; +;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0049 v3 = iconst.i32 3 +;; @004b jump block1 +;; +;; block1: +;; @004b return v3 ; v3 = 3 +;; } +;; +;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; gv3 = vmctx +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 +;; sig0 = (i64 vmctx, i64) -> i32 tail 
+;; sig1 = (i64 vmctx, i32, i64) -> i64 tail +;; fn0 = colocated u805306368:9 sig1 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64, v2: i32): +;; @0050 v4 = iconst.i32 3 +;; @0050 v5 = icmp uge v2, v4 ; v4 = 3 +;; @0050 v6 = uextend.i64 v2 +;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48 +;; v23 = iconst.i64 3 +;; @0050 v8 = ishl v6, v23 ; v23 = 3 +;; @0050 v9 = iadd v7, v8 +;; @0050 v10 = iconst.i64 0 +;; @0050 v11 = select_spectre_guard v5, v10, v9 ; v10 = 0 +;; @0050 v12 = load.i64 user5 aligned table v11 +;; v22 = iconst.i64 -2 +;; @0050 v13 = band v12, v22 ; v22 = -2 +;; @0050 brif v12, block3(v13), block2 +;; +;; block2 cold: +;; @0050 v15 = iconst.i32 0 +;; @0050 v17 = uextend.i64 v2 +;; @0050 v18 = call fn0(v0, v15, v17) ; v15 = 0 +;; @0050 jump block3(v18) +;; +;; block3(v14: i64): +;; @0050 v19 = load.i64 notrap aligned readonly v14+8 +;; @0050 v20 = load.i64 notrap aligned readonly v14+24 +;; @0050 v21 = call_indirect sig0, v19(v20, v0) +;; @0053 jump block1 +;; +;; block1: +;; @0053 return v21 +;; } diff --git a/tests/disas/call-indirect-immutable-elide-sig.wat b/tests/disas/call-indirect-immutable-elide-sig.wat new file mode 100644 index 000000000000..3325c5076f24 --- /dev/null +++ b/tests/disas/call-indirect-immutable-elide-sig.wat @@ -0,0 +1,110 @@ +;;! target = "x86_64" + +;; Immutable funcref table where every elem-segment entry has the same +;; declared type as the call site. This module's `tables_mutated` bit +;; for table 0 is clear (no opcode in any function writes to it), and +;; all three slots resolve to the same module type as the call site. +;; That triggers `try_elide_sig_check_for_immutable_table` → +;; `CheckIndirectCallTypeSignature::StaticMatch`, removing the runtime +;; signature load + compare from the dispatch hot path. +;; +;; Look for the absence of `load.i32 user6 aligned readonly v_+16` (the +;; sig-id load) and the matching `icmp eq / trapz user7` on the call +;; site. 
Compare with `indirect-call-no-caching.wat` for the +;; non-elided shape. + +(module + (table 10 10 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @003f v3 = iconst.i32 1 +;; @0041 jump block1 +;; +;; block1: +;; @0041 return v3 ; v3 = 1 +;; } +;; +;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0044 v3 = iconst.i32 2 +;; @0046 jump block1 +;; +;; block1: +;; @0046 return v3 ; v3 = 2 +;; } +;; +;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0049 v3 = iconst.i32 3 +;; @004b jump block1 +;; +;; block1: +;; @004b return v3 ; v3 = 3 +;; } +;; +;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; gv3 = vmctx +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 +;; sig0 = (i64 vmctx, i64) -> i32 tail +;; sig1 = (i64 vmctx, i32, i64) -> i64 tail +;; fn0 = colocated u805306368:9 sig1 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64, v2: i32): +;; @0050 v4 = iconst.i32 10 +;; @0050 v5 = icmp uge v2, v4 ; v4 = 10 +;; @0050 v6 = uextend.i64 v2 +;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48 +;; v23 = iconst.i64 3 +;; @0050 v8 = ishl v6, v23 ; v23 = 3 +;; @0050 v9 = iadd v7, v8 +;; @0050 v10 = 
iconst.i64 0 +;; @0050 v11 = select_spectre_guard v5, v10, v9 ; v10 = 0 +;; @0050 v12 = load.i64 user5 aligned table v11 +;; v22 = iconst.i64 -2 +;; @0050 v13 = band v12, v22 ; v22 = -2 +;; @0050 brif v12, block3(v13), block2 +;; +;; block2 cold: +;; @0050 v15 = iconst.i32 0 +;; @0050 v17 = uextend.i64 v2 +;; @0050 v18 = call fn0(v0, v15, v17) ; v15 = 0 +;; @0050 jump block3(v18) +;; +;; block3(v14: i64): +;; @0050 v19 = load.i64 user6 aligned readonly v14+8 +;; @0050 v20 = load.i64 notrap aligned readonly v14+24 +;; @0050 v21 = call_indirect sig0, v19(v20, v0) +;; @0053 jump block1 +;; +;; block1: +;; @0053 return v21 +;; } diff --git a/tests/disas/call-indirect-immutable-static-bound.wat b/tests/disas/call-indirect-immutable-static-bound.wat new file mode 100644 index 000000000000..3e4570deab47 --- /dev/null +++ b/tests/disas/call-indirect-immutable-static-bound.wat @@ -0,0 +1,110 @@ +;;! target = "x86_64" + +;; Table declared with min < max (a "dynamic-declared" table) that is +;; never written to in the module. Without the per-table mutability +;; bit, Cranelift would emit `load.i64 v0+56` per dispatch to fetch +;; the current bound. With it, `make_table` lowers to +;; `TableSize::Static` and the bound becomes an immediate. +;; +;; Look for: bounds-check `iconst.i32 16` (the declared min, used as +;; static bound) and NO `load.i64 ... v0+56` for the current_elements +;; field. (`+48` for the funcref base is still loaded — that's the +;; element-data pointer, separate from the bound.) + +(module + ;; min=16, max=64 — distinct, so without our optimization the + ;; bound would be loaded per dispatch from `current_elements`. 
+ (table 16 64 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @003f v3 = iconst.i32 1 +;; @0041 jump block1 +;; +;; block1: +;; @0041 return v3 ; v3 = 1 +;; } +;; +;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0044 v3 = iconst.i32 2 +;; @0046 jump block1 +;; +;; block1: +;; @0046 return v3 ; v3 = 2 +;; } +;; +;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0049 v3 = iconst.i32 3 +;; @004b jump block1 +;; +;; block1: +;; @004b return v3 ; v3 = 3 +;; } +;; +;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; gv3 = vmctx +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 +;; sig0 = (i64 vmctx, i64) -> i32 tail +;; sig1 = (i64 vmctx, i32, i64) -> i64 tail +;; fn0 = colocated u805306368:9 sig1 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64, v2: i32): +;; @0050 v4 = iconst.i32 16 +;; @0050 v5 = icmp uge v2, v4 ; v4 = 16 +;; @0050 v6 = uextend.i64 v2 +;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48 +;; v23 = iconst.i64 3 +;; @0050 v8 = ishl v6, v23 ; v23 = 3 +;; @0050 v9 = iadd v7, v8 +;; @0050 v10 = iconst.i64 0 +;; @0050 v11 = select_spectre_guard v5, v10, v9 ; v10 = 0 +;; @0050 
v12 = load.i64 user5 aligned table v11 +;; v22 = iconst.i64 -2 +;; @0050 v13 = band v12, v22 ; v22 = -2 +;; @0050 brif v12, block3(v13), block2 +;; +;; block2 cold: +;; @0050 v15 = iconst.i32 0 +;; @0050 v17 = uextend.i64 v2 +;; @0050 v18 = call fn0(v0, v15, v17) ; v15 = 0 +;; @0050 jump block3(v18) +;; +;; block3(v14: i64): +;; @0050 v19 = load.i64 user6 aligned readonly v14+8 +;; @0050 v20 = load.i64 notrap aligned readonly v14+24 +;; @0050 v21 = call_indirect sig0, v19(v20, v0) +;; @0053 jump block1 +;; +;; block1: +;; @0053 return v21 +;; } diff --git a/tests/disas/call-indirect-mutable-keeps-sigcheck.wat b/tests/disas/call-indirect-mutable-keeps-sigcheck.wat new file mode 100644 index 000000000000..c1028a381571 --- /dev/null +++ b/tests/disas/call-indirect-mutable-keeps-sigcheck.wat @@ -0,0 +1,150 @@ +;;! target = "x86_64" + +;; Counterpart to `call-indirect-immutable-elide-sig.wat`. Same module +;; shape — same elem segment, same uniform call-site type — but one +;; function writes to the table via `table.set`. That sets the +;; `tables_mutated` bit and disables sig-check elision. +;; +;; Look for the runtime sig load + compare on the call site: +;; load.i32 user6 aligned readonly v_+16 +;; icmp eq +;; trapz user7 +;; (versus the elided form in the immutable test). + +(module + (table 10 10 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + ;; Mutator: this clears the immutability proof for table 0. 
+ (func (export "mutate") (param i32) + local.get 0 + ref.func $f1 + table.set 0) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @004d v3 = iconst.i32 1 +;; @004f jump block1 +;; +;; block1: +;; @004f return v3 ; v3 = 1 +;; } +;; +;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0052 v3 = iconst.i32 2 +;; @0054 jump block1 +;; +;; block1: +;; @0054 return v3 ; v3 = 2 +;; } +;; +;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64): +;; @0057 v3 = iconst.i32 3 +;; @0059 jump block1 +;; +;; block1: +;; @0059 return v3 ; v3 = 3 +;; } +;; +;; function u0:3(i64 vmctx, i64, i32) tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; gv3 = vmctx +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 +;; sig0 = (i64 vmctx, i32) -> i64 tail +;; fn0 = colocated u805306368:7 sig0 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64, v2: i32): +;; @005e v3 = iconst.i32 0 +;; @005e v5 = call fn0(v0, v3) ; v3 = 0 +;; @0060 v6 = iconst.i32 10 +;; @0060 v7 = icmp uge v2, v6 ; v6 = 10 +;; @0060 v8 = uextend.i64 v2 +;; @0060 v9 = load.i64 notrap aligned readonly can_move v0+48 +;; v16 = iconst.i64 3 +;; @0060 v10 = ishl v8, v16 ; v16 = 3 +;; @0060 v11 = iadd v9, v10 +;; @0060 v12 = iconst.i64 0 +;; @0060 v13 = select_spectre_guard v7, v12, v11 ; v12 = 0 +;; v15 = iconst.i64 1 +;; @0060 v14 = bor v5, 
v15 ; v15 = 1 +;; @0060 store user5 aligned table v14, v13 +;; @0062 jump block1 +;; +;; block1: +;; @0062 return +;; } +;; +;; function u0:4(i64 vmctx, i64, i32) -> i32 tail { +;; gv0 = vmctx +;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv2 = load.i64 notrap aligned gv1+24 +;; gv3 = vmctx +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 +;; sig0 = (i64 vmctx, i64) -> i32 tail +;; sig1 = (i64 vmctx, i32, i64) -> i64 tail +;; fn0 = colocated u805306368:9 sig1 +;; stack_limit = gv2 +;; +;; block0(v0: i64, v1: i64, v2: i32): +;; @0067 v4 = iconst.i32 10 +;; @0067 v5 = icmp uge v2, v4 ; v4 = 10 +;; @0067 v6 = uextend.i64 v2 +;; @0067 v7 = load.i64 notrap aligned readonly can_move v0+48 +;; v28 = iconst.i64 3 +;; @0067 v8 = ishl v6, v28 ; v28 = 3 +;; @0067 v9 = iadd v7, v8 +;; @0067 v10 = iconst.i64 0 +;; @0067 v11 = select_spectre_guard v5, v10, v9 ; v10 = 0 +;; @0067 v12 = load.i64 user5 aligned table v11 +;; v27 = iconst.i64 -2 +;; @0067 v13 = band v12, v27 ; v27 = -2 +;; @0067 brif v12, block3(v13), block2 +;; +;; block2 cold: +;; @0067 v15 = iconst.i32 0 +;; @0067 v17 = uextend.i64 v2 +;; @0067 v18 = call fn0(v0, v15, v17) ; v15 = 0 +;; @0067 jump block3(v18) +;; +;; block3(v14: i64): +;; @0067 v20 = load.i64 notrap aligned readonly can_move v0+40 +;; @0067 v21 = load.i32 notrap aligned readonly can_move v20 +;; @0067 v22 = load.i32 user6 aligned readonly v14+16 +;; @0067 v23 = icmp eq v22, v21 +;; @0067 trapz v23, user7 +;; @0067 v24 = load.i64 notrap aligned readonly v14+8 +;; @0067 v25 = load.i64 notrap aligned readonly v14+24 +;; @0067 v26 = call_indirect sig0, v24(v25, v0) +;; @006a jump block1 +;; +;; block1: +;; @006a return v26 +;; } From 8ca462d0d4c6af4c39728435dcac4b7787de4fc8 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 28 May 2026 14:52:22 -0700 Subject: [PATCH 08/22] port call_indirect elisions to upstream table_initialization model Upstream #13487 moved the precomputed funcref image to Module::table_initialization 
(TryPrimaryMap>, reserved_value() = null) and dropped the TableInitialValue::Null { precomputed } shape. Adapt the three elision predicates to read the new map directly. --- crates/cranelift/src/func_environ.rs | 62 +++--------- .../call-indirect-immutable-elide-null.wat | 51 +++++----- .../call-indirect-immutable-elide-sig.wat | 51 +++++----- .../call-indirect-immutable-static-bound.wat | 51 +++++----- .../call-indirect-mutable-keeps-sigcheck.wat | 89 +++++++++-------- tests/disas/gc/call-indirect-final-type.wat | 96 ++++++++----------- tests/disas/indirect-call-no-caching.wat | 15 +-- tests/disas/readonly-funcrefs.wat | 12 +-- tests/disas/startup-elem-active.wat | 42 +++----- tests/disas/startup-table-initial-value.wat | 37 +++---- 10 files changed, 219 insertions(+), 287 deletions(-) diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index 300e80a82172..484c1060767a 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -2252,24 +2252,14 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> { .bits() .cast_unsigned(); - // (3) Slot must be precomputable. - let init = module - .table_initialization - .initial_values - .get(defined_table)?; - let precomputed = match init { - TableInitialValue::Null { precomputed } => precomputed, - // A fully-expression-driven initializer can't be resolved at - // compile time. Bail. - TableInitialValue::Expr(_) => return None, - }; + // (3) Slot must be precomputable from the static funcref image. + let precomputed = module.table_initialization.get(defined_table)?; let slot = usize::try_from(callee_idx_u64).ok()?; if slot >= precomputed.len() { return None; } let target = precomputed[slot]; - // `FuncIndex::reserved_value()` is the "no entry" sentinel — - // this slot wasn't covered by any static `elem` segment. + // `FuncIndex::reserved_value()` marks a null (uncovered) slot. 
if target.is_reserved_value() { return None; } @@ -2333,35 +2323,20 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> { let Some(defined_table) = module.defined_table_index(table_index) else { return false; }; - let Some(init) = module.table_initialization.initial_values.get(defined_table) - else { + let Some(precomputed) = module.table_initialization.get(defined_table) else { return false; }; - let precomputed = match init { - TableInitialValue::Null { precomputed } => precomputed, - TableInitialValue::Expr(_) => return false, - }; - // Empty precomputed means we have no information. if precomputed.is_empty() { return false; } - // The precomputed list only describes slots covered by the elem - // segments processed in `try_func_table_init`; slots beyond - // `precomputed.len()` are null at runtime. To prove the table - // can never yield a null funcref to a `call_indirect` we need - // coverage all the way to the table's minimum (== full, since - // the caller already proved the table is immutable and so can't - // be grown). Without this guard, a `call_indirect` to an - // uncovered-but-in-bounds slot would skip the null trap and - // dereference a null funcref pointer. + // Slots beyond `precomputed.len()` are null at runtime; coverage + // up to `limits.min` is required (caller proved immutable, so the + // table can't grow beyond min). let table_min = module.tables[table_index].limits.min; if (precomputed.len() as u64) < table_min { return false; } - // Every slot must be a real FuncIndex — no reserved-value sentinels. 
- precomputed - .iter() - .all(|f| !f.is_reserved_value()) + precomputed.iter().all(|f| !f.is_reserved_value()) } fn try_elide_sig_check_for_immutable_table( @@ -2380,27 +2355,14 @@ impl<'a, 'func, 'module_env> Call<'a, 'func, 'module_env> { None => return false, }; - let init = match module.table_initialization.initial_values.get(defined_table) { - Some(i) => i, - None => return false, - }; - let precomputed = match init { - TableInitialValue::Null { precomputed } => precomputed, - TableInitialValue::Expr(_) => return false, + let precomputed = match module.table_initialization.get(defined_table) { + Some(p) if !p.is_empty() => p, + _ => return false, }; - // Empty precomputed list means we have no information — fall back - // to the runtime sig check. (A subsequent `call_indirect` could - // still trap on OOB, but we don't have anything to elide against.) - if precomputed.is_empty() { - return false; - } - let expected_ty = module.types[ty_index].unwrap_module_type_index(); for &func_idx in precomputed.iter() { - // Null slots can't be called without trapping — fine to ignore - // here; the elided check would have trapped anyway, and the - // unchecked code path will trap on the null funcref deref. + // Null slots will trap on the funcref-NULL load anyway. if func_idx.is_reserved_value() { continue; } diff --git a/tests/disas/call-indirect-immutable-elide-null.wat b/tests/disas/call-indirect-immutable-elide-null.wat index 7e7f96d1e877..35e2e0c7f0db 100644 --- a/tests/disas/call-indirect-immutable-elide-null.wat +++ b/tests/disas/call-indirect-immutable-elide-null.wat @@ -27,8 +27,9 @@ ;; Fully cover the table — no null slot anywhere. 
(elem (i32.const 0) func $f1 $f2 $f3)) ;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -41,8 +42,9 @@ ;; } ;; ;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -55,8 +57,9 @@ ;; } ;; ;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -69,14 +72,16 @@ ;; } ;; ;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; gv3 = vmctx ;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 ;; sig0 = (i64 vmctx, i64) -> i32 tail ;; sig1 = (i64 vmctx, i32, i64) -> i64 tail -;; fn0 = colocated u805306368:9 sig1 +;; fn0 = colocated u805306368:7 sig1 ;; stack_limit = gv2 ;; ;; block0(v0: i64, v1: i64, v2: i32): @@ -84,28 +89,28 @@ ;; @0050 v5 = icmp uge v2, v4 ; v4 = 3 ;; @0050 v6 = uextend.i64 v2 ;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48 -;; v23 = iconst.i64 3 -;; @0050 v8 = ishl v6, v23 ; v23 = 3 -;; @0050 v9 = iadd v7, v8 -;; @0050 v10 = iconst.i64 0 -;; @0050 v11 = select_spectre_guard v5, v10, v9 ; v10 = 0 -;; @0050 v12 = load.i64 user5 aligned table v11 -;; v22 = iconst.i64 -2 -;; @0050 v13 = band v12, v22 ; v22 = -2 -;; @0050 brif v12, 
block3(v13), block2 +;; @0050 v8 = iconst.i64 3 +;; @0050 v9 = ishl v6, v8 ; v8 = 3 +;; @0050 v10 = iadd v7, v9 +;; @0050 v11 = iconst.i64 0 +;; @0050 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0 +;; @0050 v13 = load.i64 user6 aligned region1 v12 +;; @0050 v14 = iconst.i64 -2 +;; @0050 v15 = band v13, v14 ; v14 = -2 +;; @0050 brif v13, block3(v15), block2 ;; ;; block2 cold: -;; @0050 v15 = iconst.i32 0 -;; @0050 v17 = uextend.i64 v2 -;; @0050 v18 = call fn0(v0, v15, v17) ; v15 = 0 -;; @0050 jump block3(v18) +;; @0050 v17 = iconst.i32 0 +;; @0050 v18 = uextend.i64 v2 +;; @0050 v19 = call fn0(v0, v17, v18) ; v17 = 0 +;; @0050 jump block3(v19) ;; -;; block3(v14: i64): -;; @0050 v19 = load.i64 notrap aligned readonly v14+8 -;; @0050 v20 = load.i64 notrap aligned readonly v14+24 -;; @0050 v21 = call_indirect sig0, v19(v20, v0) +;; block3(v16: i64): +;; @0050 v20 = load.i64 notrap aligned readonly v16+8 +;; @0050 v21 = load.i64 notrap aligned readonly v16+24 +;; @0050 v22 = call_indirect sig0, v20(v21, v0) ;; @0053 jump block1 ;; ;; block1: -;; @0053 return v21 +;; @0053 return v22 ;; } diff --git a/tests/disas/call-indirect-immutable-elide-sig.wat b/tests/disas/call-indirect-immutable-elide-sig.wat index 3325c5076f24..d5d892f6d99a 100644 --- a/tests/disas/call-indirect-immutable-elide-sig.wat +++ b/tests/disas/call-indirect-immutable-elide-sig.wat @@ -26,8 +26,9 @@ (elem (i32.const 0) func $f1 $f2 $f3)) ;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -40,8 +41,9 @@ ;; } ;; ;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -54,8 +56,9 @@ 
;; } ;; ;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -68,14 +71,16 @@ ;; } ;; ;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; gv3 = vmctx ;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 ;; sig0 = (i64 vmctx, i64) -> i32 tail ;; sig1 = (i64 vmctx, i32, i64) -> i64 tail -;; fn0 = colocated u805306368:9 sig1 +;; fn0 = colocated u805306368:7 sig1 ;; stack_limit = gv2 ;; ;; block0(v0: i64, v1: i64, v2: i32): @@ -83,28 +88,28 @@ ;; @0050 v5 = icmp uge v2, v4 ; v4 = 10 ;; @0050 v6 = uextend.i64 v2 ;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48 -;; v23 = iconst.i64 3 -;; @0050 v8 = ishl v6, v23 ; v23 = 3 -;; @0050 v9 = iadd v7, v8 -;; @0050 v10 = iconst.i64 0 -;; @0050 v11 = select_spectre_guard v5, v10, v9 ; v10 = 0 -;; @0050 v12 = load.i64 user5 aligned table v11 -;; v22 = iconst.i64 -2 -;; @0050 v13 = band v12, v22 ; v22 = -2 -;; @0050 brif v12, block3(v13), block2 +;; @0050 v8 = iconst.i64 3 +;; @0050 v9 = ishl v6, v8 ; v8 = 3 +;; @0050 v10 = iadd v7, v9 +;; @0050 v11 = iconst.i64 0 +;; @0050 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0 +;; @0050 v13 = load.i64 user6 aligned region1 v12 +;; @0050 v14 = iconst.i64 -2 +;; @0050 v15 = band v13, v14 ; v14 = -2 +;; @0050 brif v13, block3(v15), block2 ;; ;; block2 cold: -;; @0050 v15 = iconst.i32 0 -;; @0050 v17 = uextend.i64 v2 -;; @0050 v18 = call fn0(v0, v15, v17) ; v15 = 0 -;; @0050 jump block3(v18) +;; @0050 v17 = iconst.i32 0 +;; @0050 v18 = uextend.i64 v2 +;; @0050 v19 = call fn0(v0, v17, 
v18) ; v17 = 0 +;; @0050 jump block3(v19) ;; -;; block3(v14: i64): -;; @0050 v19 = load.i64 user6 aligned readonly v14+8 -;; @0050 v20 = load.i64 notrap aligned readonly v14+24 -;; @0050 v21 = call_indirect sig0, v19(v20, v0) +;; block3(v16: i64): +;; @0050 v20 = load.i64 user7 aligned readonly v16+8 +;; @0050 v21 = load.i64 notrap aligned readonly v16+24 +;; @0050 v22 = call_indirect sig0, v20(v21, v0) ;; @0053 jump block1 ;; ;; block1: -;; @0053 return v21 +;; @0053 return v22 ;; } diff --git a/tests/disas/call-indirect-immutable-static-bound.wat b/tests/disas/call-indirect-immutable-static-bound.wat index 3e4570deab47..05c3ffd748ab 100644 --- a/tests/disas/call-indirect-immutable-static-bound.wat +++ b/tests/disas/call-indirect-immutable-static-bound.wat @@ -26,8 +26,9 @@ (elem (i32.const 0) func $f1 $f2 $f3)) ;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -40,8 +41,9 @@ ;; } ;; ;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -54,8 +56,9 @@ ;; } ;; ;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -68,14 +71,16 @@ ;; } ;; ;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = 
load.i64 notrap aligned gv1+24 ;; gv3 = vmctx ;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 ;; sig0 = (i64 vmctx, i64) -> i32 tail ;; sig1 = (i64 vmctx, i32, i64) -> i64 tail -;; fn0 = colocated u805306368:9 sig1 +;; fn0 = colocated u805306368:7 sig1 ;; stack_limit = gv2 ;; ;; block0(v0: i64, v1: i64, v2: i32): @@ -83,28 +88,28 @@ ;; @0050 v5 = icmp uge v2, v4 ; v4 = 16 ;; @0050 v6 = uextend.i64 v2 ;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48 -;; v23 = iconst.i64 3 -;; @0050 v8 = ishl v6, v23 ; v23 = 3 -;; @0050 v9 = iadd v7, v8 -;; @0050 v10 = iconst.i64 0 -;; @0050 v11 = select_spectre_guard v5, v10, v9 ; v10 = 0 -;; @0050 v12 = load.i64 user5 aligned table v11 -;; v22 = iconst.i64 -2 -;; @0050 v13 = band v12, v22 ; v22 = -2 -;; @0050 brif v12, block3(v13), block2 +;; @0050 v8 = iconst.i64 3 +;; @0050 v9 = ishl v6, v8 ; v8 = 3 +;; @0050 v10 = iadd v7, v9 +;; @0050 v11 = iconst.i64 0 +;; @0050 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0 +;; @0050 v13 = load.i64 user6 aligned region1 v12 +;; @0050 v14 = iconst.i64 -2 +;; @0050 v15 = band v13, v14 ; v14 = -2 +;; @0050 brif v13, block3(v15), block2 ;; ;; block2 cold: -;; @0050 v15 = iconst.i32 0 -;; @0050 v17 = uextend.i64 v2 -;; @0050 v18 = call fn0(v0, v15, v17) ; v15 = 0 -;; @0050 jump block3(v18) +;; @0050 v17 = iconst.i32 0 +;; @0050 v18 = uextend.i64 v2 +;; @0050 v19 = call fn0(v0, v17, v18) ; v17 = 0 +;; @0050 jump block3(v19) ;; -;; block3(v14: i64): -;; @0050 v19 = load.i64 user6 aligned readonly v14+8 -;; @0050 v20 = load.i64 notrap aligned readonly v14+24 -;; @0050 v21 = call_indirect sig0, v19(v20, v0) +;; block3(v16: i64): +;; @0050 v20 = load.i64 user7 aligned readonly v16+8 +;; @0050 v21 = load.i64 notrap aligned readonly v16+24 +;; @0050 v22 = call_indirect sig0, v20(v21, v0) ;; @0053 jump block1 ;; ;; block1: -;; @0053 return v21 +;; @0053 return v22 ;; } diff --git a/tests/disas/call-indirect-mutable-keeps-sigcheck.wat 
b/tests/disas/call-indirect-mutable-keeps-sigcheck.wat index c1028a381571..03318a349ef7 100644 --- a/tests/disas/call-indirect-mutable-keeps-sigcheck.wat +++ b/tests/disas/call-indirect-mutable-keeps-sigcheck.wat @@ -30,8 +30,9 @@ (elem (i32.const 0) func $f1 $f2 $f3)) ;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -44,8 +45,9 @@ ;; } ;; ;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -58,8 +60,9 @@ ;; } ;; ;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -72,30 +75,32 @@ ;; } ;; ;; function u0:3(i64 vmctx, i64, i32) tail { +;; region0 = 8 "VMContext+0x8" +;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; gv3 = vmctx ;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 ;; sig0 = (i64 vmctx, i32) -> i64 tail -;; fn0 = colocated u805306368:7 sig0 +;; fn0 = colocated u805306368:6 sig0 ;; stack_limit = gv2 ;; ;; block0(v0: i64, v1: i64, v2: i32): ;; @005e v3 = iconst.i32 0 -;; @005e v5 = call fn0(v0, v3) ; v3 = 0 -;; @0060 v6 = iconst.i32 10 -;; @0060 v7 = icmp uge v2, v6 ; v6 = 10 -;; @0060 v8 = uextend.i64 v2 -;; @0060 v9 = load.i64 notrap aligned readonly can_move v0+48 -;; v16 = iconst.i64 3 -;; @0060 v10 = ishl v8, v16 ; 
v16 = 3 -;; @0060 v11 = iadd v9, v10 +;; @005e v4 = call fn0(v0, v3) ; v3 = 0 +;; @0060 v5 = iconst.i32 10 +;; @0060 v6 = icmp uge v2, v5 ; v5 = 10 +;; @0060 v7 = uextend.i64 v2 +;; @0060 v8 = load.i64 notrap aligned readonly can_move v0+48 +;; @0060 v9 = iconst.i64 3 +;; @0060 v10 = ishl v7, v9 ; v9 = 3 +;; @0060 v11 = iadd v8, v10 ;; @0060 v12 = iconst.i64 0 -;; @0060 v13 = select_spectre_guard v7, v12, v11 ; v12 = 0 -;; v15 = iconst.i64 1 -;; @0060 v14 = bor v5, v15 ; v15 = 1 -;; @0060 store user5 aligned table v14, v13 +;; @0060 v13 = select_spectre_guard v6, v12, v11 ; v12 = 0 +;; @0060 v14 = iconst.i64 1 +;; @0060 v15 = bor v4, v14 ; v14 = 1 +;; @0060 store user6 aligned region1 v15, v13 ;; @0062 jump block1 ;; ;; block1: @@ -103,14 +108,17 @@ ;; } ;; ;; function u0:4(i64 vmctx, i64, i32) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" +;; region2 = 40 "VMContext+0x28" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; gv3 = vmctx ;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 ;; sig0 = (i64 vmctx, i64) -> i32 tail ;; sig1 = (i64 vmctx, i32, i64) -> i64 tail -;; fn0 = colocated u805306368:9 sig1 +;; fn0 = colocated u805306368:7 sig1 ;; stack_limit = gv2 ;; ;; block0(v0: i64, v1: i64, v2: i32): @@ -118,33 +126,34 @@ ;; @0067 v5 = icmp uge v2, v4 ; v4 = 10 ;; @0067 v6 = uextend.i64 v2 ;; @0067 v7 = load.i64 notrap aligned readonly can_move v0+48 -;; v28 = iconst.i64 3 -;; @0067 v8 = ishl v6, v28 ; v28 = 3 -;; @0067 v9 = iadd v7, v8 -;; @0067 v10 = iconst.i64 0 -;; @0067 v11 = select_spectre_guard v5, v10, v9 ; v10 = 0 -;; @0067 v12 = load.i64 user5 aligned table v11 -;; v27 = iconst.i64 -2 -;; @0067 v13 = band v12, v27 ; v27 = -2 -;; @0067 brif v12, block3(v13), block2 +;; @0067 v8 = iconst.i64 3 +;; @0067 v9 = ishl v6, v8 ; v8 = 3 +;; @0067 v10 = iadd v7, v9 
+;; @0067 v11 = iconst.i64 0 +;; @0067 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0 +;; @0067 v13 = load.i64 user6 aligned region1 v12 +;; @0067 v14 = iconst.i64 -2 +;; @0067 v15 = band v13, v14 ; v14 = -2 +;; @0067 brif v13, block3(v15), block2 ;; ;; block2 cold: -;; @0067 v15 = iconst.i32 0 -;; @0067 v17 = uextend.i64 v2 -;; @0067 v18 = call fn0(v0, v15, v17) ; v15 = 0 -;; @0067 jump block3(v18) +;; @0067 v17 = iconst.i32 0 +;; @0067 v18 = uextend.i64 v2 +;; @0067 v19 = call fn0(v0, v17, v18) ; v17 = 0 +;; @0067 jump block3(v19) ;; -;; block3(v14: i64): -;; @0067 v20 = load.i64 notrap aligned readonly can_move v0+40 +;; block3(v16: i64): +;; @0067 v20 = load.i64 notrap aligned readonly can_move region2 v0+40 ;; @0067 v21 = load.i32 notrap aligned readonly can_move v20 -;; @0067 v22 = load.i32 user6 aligned readonly v14+16 +;; @0067 v22 = load.i32 user7 aligned readonly v16+16 ;; @0067 v23 = icmp eq v22, v21 -;; @0067 trapz v23, user7 -;; @0067 v24 = load.i64 notrap aligned readonly v14+8 -;; @0067 v25 = load.i64 notrap aligned readonly v14+24 -;; @0067 v26 = call_indirect sig0, v24(v25, v0) +;; @0067 v24 = uextend.i32 v23 +;; @0067 trapz v24, user8 +;; @0067 v25 = load.i64 notrap aligned readonly v16+8 +;; @0067 v26 = load.i64 notrap aligned readonly v16+24 +;; @0067 v27 = call_indirect sig0, v25(v26, v0) ;; @006a jump block1 ;; ;; block1: -;; @006a return v26 +;; @006a return v27 ;; } diff --git a/tests/disas/gc/call-indirect-final-type.wat b/tests/disas/gc/call-indirect-final-type.wat index 0406261611bf..13ffa96bec62 100644 --- a/tests/disas/gc/call-indirect-final-type.wat +++ b/tests/disas/gc/call-indirect-final-type.wat @@ -23,47 +23,38 @@ ;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; gv3 = vmctx -;; gv4 = load.i64 notrap aligned gv3+48 -;; gv5 = load.i64 notrap aligned gv3+56 +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 ;; sig0 = (i64 vmctx, i64, i32) -> i32 tail ;; sig1 = (i64 vmctx, 
i32, i64) -> i64 tail ;; fn0 = colocated u805306368:7 sig1 ;; stack_limit = gv2 ;; ;; block0(v0: i64, v1: i64, v2: i32, v3: i32): -;; @002b v5 = load.i64 notrap aligned v0+56 -;; @002b v9 = load.i64 notrap aligned v0+48 -;; @002b v6 = ireduce.i32 v5 -;; @002b v7 = icmp uge v3, v6 -;; @002b v13 = iconst.i64 0 -;; @002b v8 = uextend.i64 v3 -;; @002b v10 = iconst.i64 3 -;; @002b v11 = ishl v8, v10 ; v10 = 3 -;; @002b v12 = iadd v9, v11 -;; @002b v14 = select_spectre_guard v7, v13, v12 ; v13 = 0 -;; @002b v15 = load.i64 user6 aligned region1 v14 -;; @002b v16 = iconst.i64 -2 -;; @002b v17 = band v15, v16 ; v16 = -2 -;; @002b brif v15, block3(v17), block2 +;; @002b v12 = iconst.i64 0 +;; @002b v14 = load.i64 user6 aligned region1 v12 ; v12 = 0 +;; @002b v15 = iconst.i64 -2 +;; @002b v16 = band v14, v15 ; v15 = -2 +;; @002b brif v14, block3(v16), block2 ;; ;; block2 cold: -;; @002b v19 = iconst.i32 0 -;; @002b v21 = call fn0(v0, v19, v8) ; v19 = 0 -;; @002b jump block3(v21) +;; @002b v5 = iconst.i32 0 +;; @002b v7 = uextend.i64 v3 +;; @002b v20 = call fn0(v0, v5, v7) ; v5 = 0 +;; @002b jump block3(v20) ;; -;; block3(v18: i64): -;; @002b v24 = load.i32 user7 aligned readonly v18+16 -;; @002b v22 = load.i64 notrap aligned readonly can_move region2 v0+40 -;; @002b v23 = load.i32 notrap aligned readonly can_move v22 -;; @002b v25 = icmp eq v24, v23 -;; @002b trapz v25, user8 -;; @002b v27 = load.i64 notrap aligned readonly v18+8 -;; @002b v28 = load.i64 notrap aligned readonly v18+24 -;; @002b v29 = call_indirect sig0, v27(v28, v0, v2) +;; block3(v17: i64): +;; @002b v23 = load.i32 user7 aligned readonly v17+16 +;; @002b v21 = load.i64 notrap aligned readonly can_move region2 v0+40 +;; @002b v22 = load.i32 notrap aligned readonly can_move v21 +;; @002b v24 = icmp eq v23, v22 +;; @002b trapz v24, user8 +;; @002b v26 = load.i64 notrap aligned readonly v17+8 +;; @002b v27 = load.i64 notrap aligned readonly v17+24 +;; @002b v28 = call_indirect sig0, v26(v27, v0, v2) ;; @002e 
jump block1 ;; ;; block1: -;; @002e return v29 +;; @002e return v28 ;; } ;; ;; function u0:1(i64 vmctx, i64, i32, i32) -> i32 tail { @@ -74,41 +65,32 @@ ;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; gv3 = vmctx -;; gv4 = load.i64 notrap aligned gv3+48 -;; gv5 = load.i64 notrap aligned gv3+56 +;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 ;; sig0 = (i64 vmctx, i64, i32) -> i32 tail ;; sig1 = (i64 vmctx, i32, i64) -> i64 tail ;; fn0 = colocated u805306368:7 sig1 ;; stack_limit = gv2 ;; ;; block0(v0: i64, v1: i64, v2: i32, v3: i32): -;; @0035 v5 = load.i64 notrap aligned v0+56 -;; @0035 v9 = load.i64 notrap aligned v0+48 -;; @0035 v6 = ireduce.i32 v5 -;; @0035 v7 = icmp uge v3, v6 -;; @0035 v13 = iconst.i64 0 -;; @0035 v8 = uextend.i64 v3 -;; @0035 v10 = iconst.i64 3 -;; @0035 v11 = ishl v8, v10 ; v10 = 3 -;; @0035 v12 = iadd v9, v11 -;; @0035 v14 = select_spectre_guard v7, v13, v12 ; v13 = 0 -;; @0035 v15 = load.i64 user6 aligned region1 v14 -;; @0035 v16 = iconst.i64 -2 -;; @0035 v17 = band v15, v16 ; v16 = -2 -;; @0035 brif v15, block3(v17), block2 +;; @0035 v12 = iconst.i64 0 +;; @0035 v14 = load.i64 user6 aligned region1 v12 ; v12 = 0 +;; @0035 v15 = iconst.i64 -2 +;; @0035 v16 = band v14, v15 ; v15 = -2 +;; @0035 brif v14, block3(v16), block2 ;; ;; block2 cold: -;; @0035 v19 = iconst.i32 0 -;; @0035 v21 = call fn0(v0, v19, v8) ; v19 = 0 -;; @0035 jump block3(v21) +;; @0035 v5 = iconst.i32 0 +;; @0035 v7 = uextend.i64 v3 +;; @0035 v20 = call fn0(v0, v5, v7) ; v5 = 0 +;; @0035 jump block3(v20) ;; -;; block3(v18: i64): -;; @0035 v24 = load.i32 user7 aligned readonly v18+16 -;; @0035 v22 = load.i64 notrap aligned readonly can_move region2 v0+40 -;; @0035 v23 = load.i32 notrap aligned readonly can_move v22 -;; @0035 v25 = icmp eq v24, v23 -;; @0035 trapz v25, user8 -;; @0035 v27 = load.i64 notrap aligned readonly v18+8 -;; @0035 v28 = load.i64 notrap aligned readonly v18+24 -;; @0035 
return_call_indirect sig0, v27(v28, v0, v2) +;; block3(v17: i64): +;; @0035 v23 = load.i32 user7 aligned readonly v17+16 +;; @0035 v21 = load.i64 notrap aligned readonly can_move region2 v0+40 +;; @0035 v22 = load.i32 notrap aligned readonly can_move v21 +;; @0035 v24 = icmp eq v23, v22 +;; @0035 trapz v24, user8 +;; @0035 v26 = load.i64 notrap aligned readonly v17+8 +;; @0035 v27 = load.i64 notrap aligned readonly v17+24 +;; @0035 return_call_indirect sig0, v26(v27, v0, v2) ;; } diff --git a/tests/disas/indirect-call-no-caching.wat b/tests/disas/indirect-call-no-caching.wat index ae42c54f4c27..1a2e852558bb 100644 --- a/tests/disas/indirect-call-no-caching.wat +++ b/tests/disas/indirect-call-no-caching.wat @@ -68,7 +68,6 @@ ;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { ;; region0 = 8 "VMContext+0x8" ;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" -;; region2 = 40 "VMContext+0x28" ;; gv0 = vmctx ;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 @@ -101,17 +100,11 @@ ;; @0050 jump block3(v19) ;; ;; block3(v16: i64): -;; @0050 v20 = load.i64 notrap aligned readonly can_move region2 v0+40 -;; @0050 v21 = load.i32 notrap aligned readonly can_move v20 -;; @0050 v22 = load.i32 user7 aligned readonly v16+16 -;; @0050 v23 = icmp eq v22, v21 -;; @0050 v24 = uextend.i32 v23 -;; @0050 trapz v24, user8 -;; @0050 v25 = load.i64 notrap aligned readonly v16+8 -;; @0050 v26 = load.i64 notrap aligned readonly v16+24 -;; @0050 v27 = call_indirect sig0, v25(v26, v0) +;; @0050 v20 = load.i64 user7 aligned readonly v16+8 +;; @0050 v21 = load.i64 notrap aligned readonly v16+24 +;; @0050 v22 = call_indirect sig0, v20(v21, v0) ;; @0053 jump block1 ;; ;; block1: -;; @0053 return v27 +;; @0053 return v22 ;; } diff --git a/tests/disas/readonly-funcrefs.wat b/tests/disas/readonly-funcrefs.wat index 9febf947e3b1..e341fbcc4dba 100644 --- a/tests/disas/readonly-funcrefs.wat +++ 
b/tests/disas/readonly-funcrefs.wat @@ -35,7 +35,6 @@ ;; function u0:1(i64 vmctx, i64, i32) tail { ;; region0 = 8 "VMContext+0x8" ;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" -;; region2 = 40 "VMContext+0x28" ;; gv0 = vmctx ;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 @@ -67,14 +66,9 @@ ;; @0031 jump block3(v18) ;; ;; block3(v15: i64): -;; @0031 v21 = load.i32 user7 aligned readonly v15+16 -;; @0031 v19 = load.i64 notrap aligned readonly can_move region2 v0+40 -;; @0031 v20 = load.i32 notrap aligned readonly can_move v19 -;; @0031 v22 = icmp eq v21, v20 -;; @0031 trapz v22, user8 -;; @0031 v24 = load.i64 notrap aligned readonly v15+8 -;; @0031 v25 = load.i64 notrap aligned readonly v15+24 -;; @0031 call_indirect sig0, v24(v25, v0) +;; @0031 v19 = load.i64 user7 aligned readonly v15+8 +;; @0031 v20 = load.i64 notrap aligned readonly v15+24 +;; @0031 call_indirect sig0, v19(v20, v0) ;; @0034 jump block1 ;; ;; block1: diff --git a/tests/disas/startup-elem-active.wat b/tests/disas/startup-elem-active.wat index 0c3158f8c2b1..40cdfd2d91f4 100644 --- a/tests/disas/startup-elem-active.wat +++ b/tests/disas/startup-elem-active.wat @@ -42,37 +42,21 @@ ;; function u2415919104:0(i64 vmctx, i64) tail { ;; region0 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned gv0+48 -;; gv2 = load.i64 notrap aligned gv0+56 +;; gv1 = load.i64 notrap aligned readonly can_move gv0+48 ;; ;; block0(v0: i64, v1: i64): -;; v4 = load.i64 notrap aligned v0+56 -;; v5 = ireduce.i32 v4 -;; v6 = uextend.i64 v5 -;; v86 = iconst.i64 4 -;; v92 = icmp ult v6, v86 ; v86 = 4 -;; trapnz v92, user6 -;; v13 = load.i64 notrap aligned v0+48 -;; v103 = iconst.i32 21 -;; v2 = iconst.i32 1 -;; v114 = icmp ule v5, v2 ; v2 = 1 -;; v79 = iconst.i64 0 -;; v17 = iadd v13, v86 ; v86 = 4 -;; v34 = select_spectre_guard v114, v79, v17 ; v79 = 0 -;; store user6 aligned 
region0 v103, v34 ; v103 = 21 +;; v100 = iconst.i32 21 +;; v12 = load.i64 notrap aligned readonly can_move v0+48 +;; v79 = iconst.i64 4 +;; v16 = iadd v12, v79 ; v79 = 4 +;; store user6 aligned region0 v100, v16 ; v100 = 21 ;; v117 = iconst.i32 23 -;; v123 = iconst.i32 2 -;; v129 = icmp ule v5, v123 ; v123 = 2 -;; v131 = iconst.i64 8 -;; v49 = iadd v13, v131 ; v131 = 8 -;; v51 = select_spectre_guard v129, v79, v49 ; v79 = 0 -;; store user6 aligned region0 v117, v51 ; v117 = 23 -;; v133 = iconst.i32 25 -;; v3 = iconst.i32 3 -;; v144 = icmp ule v5, v3 ; v3 = 3 -;; v146 = iconst.i64 12 -;; v66 = iadd v13, v146 ; v146 = 12 -;; v68 = select_spectre_guard v144, v79, v66 ; v79 = 0 -;; store user6 aligned region0 v133, v68 ; v133 = 25 +;; v134 = iconst.i64 8 +;; v46 = iadd v12, v134 ; v134 = 8 +;; store user6 aligned region0 v117, v46 ; v117 = 23 +;; v136 = iconst.i32 25 +;; v152 = iconst.i64 12 +;; v62 = iadd v12, v152 ; v152 = 12 +;; store user6 aligned region0 v136, v62 ; v136 = 25 ;; return ;; } diff --git a/tests/disas/startup-table-initial-value.wat b/tests/disas/startup-table-initial-value.wat index 7b39ecc93333..a2cb9a5f6da2 100644 --- a/tests/disas/startup-table-initial-value.wat +++ b/tests/disas/startup-table-initial-value.wat @@ -35,31 +35,24 @@ ;; ;; function u2415919104:0(i64 vmctx, i64) tail { ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned gv0+48 -;; gv2 = load.i64 notrap aligned gv0+56 +;; gv1 = load.i64 notrap aligned readonly can_move gv0+48 ;; ;; block0(v0: i64, v1: i64): -;; v9 = load.i64 notrap aligned v0+56 -;; v10 = ireduce.i32 v9 -;; v11 = uextend.i64 v10 -;; v41 = iconst.i64 10 -;; v53 = icmp ult v11, v41 ; v41 = 10 -;; trapnz v53, user6 -;; v18 = load.i64 notrap aligned v0+48 +;; v17 = load.i64 notrap aligned readonly can_move v0+48 ;; v3 = iconst.i32 1 -;; v83 = iconst.i64 36 -;; v85 = iadd v18, v83 ; v83 = 36 -;; v20 = iconst.i64 4 -;; jump block1(v18) -;; -;; block1(v29: i64): -;; v88 = iconst.i32 1 -;; store notrap aligned v88, v29 ; v88 = 1 
-;; v89 = iadd.i64 v18, v83 ; v83 = 36 -;; v90 = icmp eq v29, v89 -;; v91 = iconst.i64 4 -;; v92 = iadd v29, v91 ; v91 = 4 -;; brif v90, block2, block1(v92) +;; v84 = iconst.i64 36 +;; v86 = iadd v17, v84 ; v84 = 36 +;; v19 = iconst.i64 4 +;; jump block1(v17) +;; +;; block1(v28: i64): +;; v89 = iconst.i32 1 +;; store notrap aligned v89, v28 ; v89 = 1 +;; v90 = iadd.i64 v17, v84 ; v84 = 36 +;; v91 = icmp eq v28, v90 +;; v92 = iconst.i64 4 +;; v93 = iadd v28, v92 ; v92 = 4 +;; brif v91, block2, block1(v93) ;; ;; block2: ;; return From 937c4e7aaa5a4ae0fd71bc8dd8e97273b4f90aa5 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Wed, 10 Jun 2026 16:40:20 -0700 Subject: [PATCH 09/22] tests: runtime coverage for call_indirect on immutable tables Exercise the table shapes the elisions apply to through the *.wast runtime suite: in-bounds calls, signature mismatches, null slots, and out-of-bounds indices all behave identically whether or not the checks were elided at compile time. Covers mixed-signature and uniform-signature tables plus a declared-growable table that is never grown. --- .../immutable-table-call-indirect.wast | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 tests/misc_testsuite/immutable-table-call-indirect.wast diff --git a/tests/misc_testsuite/immutable-table-call-indirect.wast b/tests/misc_testsuite/immutable-table-call-indirect.wast new file mode 100644 index 000000000000..3b40cb9ab534 --- /dev/null +++ b/tests/misc_testsuite/immutable-table-call-indirect.wast @@ -0,0 +1,71 @@ +;;! reference_types = true + +;; call_indirect through tables that are never grown, exported, or mutated. +;; Compilation may use a constant bound and elide null/signature checks on +;; these shapes; runtime behavior must be unchanged: in-bounds calls work, +;; and out-of-bounds, null-slot, and signature-mismatch accesses still trap. + +;; Mixed-signature immutable table with a null hole. 
+(module + (type $i2i (func (param i32) (result i32))) + (type $v2i (func (result i32))) + (table 5 funcref) + (elem (i32.const 0) $add1 $ten $add1) + + (func $add1 (type $i2i) (i32.add (local.get 0) (i32.const 1))) + (func $ten (type $v2i) (i32.const 10)) + + (func (export "call-i2i") (param i32 i32) (result i32) + (call_indirect (type $i2i) (local.get 1) (local.get 0))) + (func (export "call-v2i") (param i32) (result i32) + (call_indirect (type $v2i) (local.get 0)))) + +(assert_return (invoke "call-i2i" (i32.const 0) (i32.const 41)) (i32.const 42)) +(assert_return (invoke "call-i2i" (i32.const 2) (i32.const 7)) (i32.const 8)) +(assert_return (invoke "call-v2i" (i32.const 1)) (i32.const 10)) + +;; Signature mismatch still traps. +(assert_trap (invoke "call-i2i" (i32.const 1) (i32.const 0)) "indirect call type mismatch") +(assert_trap (invoke "call-v2i" (i32.const 0)) "indirect call type mismatch") + +;; Null slots still trap: slot 3 was never initialized. +(assert_trap (invoke "call-i2i" (i32.const 3) (i32.const 0)) "uninitialized element") +(assert_trap (invoke "call-v2i" (i32.const 4)) "uninitialized element") + +;; Out of bounds still traps against the constant bound. +(assert_trap (invoke "call-i2i" (i32.const 5) (i32.const 0)) "undefined element") +(assert_trap (invoke "call-i2i" (i32.const -1) (i32.const 0)) "undefined element") + +;; Uniform-signature immutable table, fully initialized. 
+(module + (type $v2i (func (result i32))) + (table 3 funcref) + (elem (i32.const 0) $a $b $c) + + (func $a (type $v2i) (i32.const 1)) + (func $b (type $v2i) (i32.const 2)) + (func $c (type $v2i) (i32.const 3)) + + (func (export "call") (param i32) (result i32) + (call_indirect (type $v2i) (local.get 0))) + (func (export "call-wrong-type") (param i32 i32) (result i32) + (call_indirect (param i32) (result i32) (local.get 1) (local.get 0)))) + +(assert_return (invoke "call" (i32.const 0)) (i32.const 1)) +(assert_return (invoke "call" (i32.const 1)) (i32.const 2)) +(assert_return (invoke "call" (i32.const 2)) (i32.const 3)) +(assert_trap (invoke "call" (i32.const 3)) "undefined element") + +;; A caller whose expected type differs from the table's uniform type must +;; still observe the mismatch. +(assert_trap (invoke "call-wrong-type" (i32.const 0) (i32.const 0)) "indirect call type mismatch") + +;; Same shapes through a declared-growable (no max) table never actually +;; grown: an empty never-grown table has no valid index. +(module + (table 0 100 funcref) + (func (export "call-empty") (param i32) + (call_indirect (local.get 0)))) + +(assert_trap (invoke "call-empty" (i32.const 0)) "undefined element") +(assert_trap (invoke "call-empty" (i32.const 99)) "undefined element") From 3352ca4fc2e1b89ab6dae778b5f6a9a4de2f9ccd Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 20:02:37 -0700 Subject: [PATCH 10/22] pulley: add xband_s8 + br_if fused dispatch ops Add `xband{32,64}_s8_br_if_{x32,x64,not_x32,not_x64}` ops: each one computes `dst = src & sign_extend(mask)` unconditionally, then conditionally branches by `offset` on the original `src` (or, for the `_not` variants, on its zero/non-zero inverse). Emitted by Cranelift at call_indirect lazy-init brif sites where the funcref's init-bit mask and the brif's null-check both read the same loaded value. Saves one match_loop dispatch per call_indirect site. 
--- pulley/src/interp.rs | 64 ++++++++++++++++++++++++++++++++++++++++++++ pulley/src/lib.rs | 21 +++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs index 5b3f79445340..207ae2355487 100644 --- a/pulley/src/interp.rs +++ b/pulley/src/interp.rs @@ -2296,6 +2296,70 @@ impl OpVisitor for Interpreter<'_> { ControlFlow::Continue(()) } + fn xband32_s8_br_if_x32( + &mut self, + dst: XReg, + src: XReg, + mask: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_i32(); + self.state[dst].set_i32(s & i32::from(mask)); + if s != 0 { + self.pc_rel_jump::(offset) + } else { + ControlFlow::Continue(()) + } + } + + fn xband32_s8_br_if_not_x32( + &mut self, + dst: XReg, + src: XReg, + mask: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_i32(); + self.state[dst].set_i32(s & i32::from(mask)); + if s == 0 { + self.pc_rel_jump::(offset) + } else { + ControlFlow::Continue(()) + } + } + + fn xband64_s8_br_if_x64( + &mut self, + dst: XReg, + src: XReg, + mask: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_i64(); + self.state[dst].set_i64(s & i64::from(mask)); + if s != 0 { + self.pc_rel_jump::(offset) + } else { + ControlFlow::Continue(()) + } + } + + fn xband64_s8_br_if_not_x64( + &mut self, + dst: XReg, + src: XReg, + mask: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_i64(); + self.state[dst].set_i64(s & i64::from(mask)); + if s == 0 { + self.pc_rel_jump::(offset) + } else { + ControlFlow::Continue(()) + } + } + fn xbor32(&mut self, operands: BinaryOperands) -> ControlFlow { let a = self.state[operands.src1].get_u32(); let b = self.state[operands.src2].get_u32(); diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs index 36a09cb13a34..f29240b1dfd8 100644 --- a/pulley/src/lib.rs +++ b/pulley/src/lib.rs @@ -562,6 +562,27 @@ macro_rules! 
for_each_op { xband64_s8 = Xband64S8 { dst: XReg, src1: XReg, src2: i8 }; /// Same as `xband64` but `src2` is a sign-extended 32-bit immediate. xband64_s32 = Xband64S32 { dst: XReg, src1: XReg, src2: i32 }; + + /// `low32(dst) = low32(src) & sign_extend(mask)`, then conditionally + /// branch by `offset` if `low32(src)` is non-zero. + /// + /// Fused form of `xband32_s8 dst, src, mask` + `br_if32 src, offset`, + /// emitted by the Cranelift Pulley backend at call_indirect lazy-init + /// brif sites where the same loaded funcref value feeds both the + /// init-bit mask (`band v, -2`) and the null-check branch + /// (`brif v`). Shaves one match_loop dispatch per call_indirect + /// site. See pulley/PR for the full design discussion. + xband32_s8_br_if_x32 = Xband32S8BrIfX32 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; + /// Inverted form of `xband32_s8_br_if_x32`: branch if `low32(src)` + /// is zero. The mask + dst write happen unconditionally. Used by + /// MachBuffer's branch-direction-flip fallthrough optimization. + xband32_s8_br_if_not_x32 = Xband32S8BrIfNotX32 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; + /// 64-bit form: `dst = src & sign_extend(mask)`, then conditionally + /// branch by `offset` if `src` is non-zero. Same fusion as + /// `xband32_s8_br_if_x32` but for 64-bit pointer-width Pulley. + xband64_s8_br_if_x64 = Xband64S8BrIfX64 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; + /// Inverted form of `xband64_s8_br_if_x64`: branch if `src` is zero. + xband64_s8_br_if_not_x64 = Xband64S8BrIfNotX64 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; /// `low32(dst) = low32(src1) | low32(src2)` xbor32 = XBor32 { operands: BinaryOperands }; /// Same as `xbor64` but `src2` is a sign-extended 8-bit immediate. 
From 5b6e1bf0f7dfd9f5076c03a2a50c09295f064af9 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 20:02:37 -0700 Subject: [PATCH 11/22] =?UTF-8?q?cranelift:=20Lower::sink=5Fpure=5Finst=20?= =?UTF-8?q?=E2=80=94=20absorb=20pure=20ALU=20ops=20into=20terminators?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `Lower::sink_pure_inst(ir::Inst)`: mark a side-effect-free CLIF instruction as absorbed so its standalone lowering is skipped and a later MachInst (e.g. a fused band+brif) can claim its result vreg directly. The reverse-iteration order in lower-block guarantees the terminator that absorbs the inst lowers first, so the absorbed inst is still present when the terminator looks it up. --- cranelift/codegen/src/machinst/lower.rs | 42 +++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index 0a09edc5c374..89479b9a25e7 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -204,6 +204,14 @@ pub struct Lower<'func, I: VCodeInst> { /// their original locations. inst_sunk: FxHashSet, + /// Pure (non-side-effecting) instructions whose value-production has been + /// absorbed by a later-emitted MachInst (typically a terminator that + /// fuses an ALU op with a branch). The absorbing MachInst writes to the + /// absorbed inst's result vreg, so subsequent `put_value_in_regs` of that + /// vreg observes the value normally — but the absorbed inst itself is + /// skipped in `lower_clif_block`, avoiding a redundant double-write. + inst_absorbed_pure: FxHashSet, + /// Instructions collected for the CLIF inst in progress, in forward order. 
ir_insts: Vec, @@ -504,6 +512,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> { value_ir_uses, value_lowered_uses: SecondaryMap::default(), inst_sunk: FxHashSet::default(), + inst_absorbed_pure: FxHashSet::default(), cur_scan_entry_color: None, cur_inst: None, ir_insts: vec![], @@ -708,6 +717,12 @@ impl<'func, I: VCodeInst> Lower<'func, I> { self.inst_sunk.contains(&inst) } + /// Has the value-production of this pure instruction been absorbed by a + /// later-emitted MachInst? See [`Lower::inst_absorbed_pure`]. + fn is_inst_absorbed_pure(&self, inst: Inst) -> bool { + self.inst_absorbed_pure.contains(&inst) + } + // Is any result of this instruction needed? fn is_any_inst_result_needed(&self, inst: Inst) -> bool { self.f @@ -750,6 +765,13 @@ impl<'func, I: VCodeInst> Lower<'func, I> { if self.is_inst_sunk(inst) { continue; } + // Same for pure-instruction absorption: a terminator earlier in + // the reverse-scan emitted a MachInst that writes to this inst's + // result vreg directly, so emitting it again here would be a + // redundant double-write. + if self.is_inst_absorbed_pure(inst) { + continue; + } // Are any outputs used at least once? let value_needed = self.is_any_inst_result_needed(inst); trace!( @@ -1666,6 +1688,26 @@ impl<'func, I: VCodeInst> Lower<'func, I> { self.ir_insts.push(mach_inst); } + /// Indicate that the value-production of a pure (non-side-effecting) + /// instruction has been absorbed by a later-emitted MachInst — typically a + /// terminator that fuses an ALU op with a branch (e.g. Pulley's + /// `xband_brif` fused dispatch op). + /// + /// The absorbing MachInst must write to the absorbed inst's result vreg + /// (`value_regs[result]`) directly, so subsequent `put_value_in_regs` of + /// that vreg observes the correct value. The absorbed inst itself is + /// skipped in `lower_clif_block`, preventing a redundant second write to + /// the same vreg (which would violate SSA single-def). 
+ /// + /// Unlike [`Lower::sink_inst`], this does not require the inst to have a + /// lowering side effect: it is specifically for pure ALU ops whose value + /// flows into the fused MachInst's output operand. Color tracking is + /// likewise unnecessary because pure insts have no color anchor. + pub fn sink_pure_inst(&mut self, ir_inst: Inst) { + assert!(!has_lowering_side_effect(self.f, ir_inst)); + self.inst_absorbed_pure.insert(ir_inst); + } + /// Indicate that the side-effect of an instruction has been sunk to the /// current scan location. This should only be done with the instruction's /// original results are not used (i.e., `put_input_in_regs` is not invoked From 4019400c9639b0cdefa116d6895a07588b2d5980 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 20:02:37 -0700 Subject: [PATCH 12/22] cranelift/pulley: fuse band+brif at call_indirect lazy-init site MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pattern-match `cond = band v, -2; brif cond, taken, not_taken` and lower it to `MInst::BandBrIf` (forward + inverted variants), which the emit side encodes as the `xband{32,64}_s8_br_if_*` pulley ops. The `band -2` is sunk via `sink_pure_inst` so the fused MachInst defs its result vreg. Gated on the band's mask being exactly `-2` in the appropriate width — never fires on user-wasm `band v, -2` shapes because the IR rewrite under `is_eagerly_initialized_funcref_table` is the only producer of this exact pattern at a brif site. Adds `tests/disas/pulley-call-indirect-band-brif-fusion.wat`. 
--- .../codegen/src/isa/pulley_shared/inst.isle | 18 ++ .../src/isa/pulley_shared/inst/emit.rs | 73 ++++++ .../codegen/src/isa/pulley_shared/inst/mod.rs | 35 +++ .../codegen/src/isa/pulley_shared/lower.rs | 118 ++++++++- crates/cranelift/src/func_environ.rs | 22 +- .../call-indirect-immutable-elide-null.wat | 51 ++-- .../pulley-call-indirect-band-brif-fusion.wat | 229 ++++++++++++++++++ 7 files changed, 516 insertions(+), 30 deletions(-) create mode 100644 tests/disas/pulley-call-indirect-band-brif-fusion.wat diff --git a/cranelift/codegen/src/isa/pulley_shared/inst.isle b/cranelift/codegen/src/isa/pulley_shared/inst.isle index 258551a17598..1326e3ae407b 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst.isle +++ b/cranelift/codegen/src/isa/pulley_shared/inst.isle @@ -67,6 +67,24 @@ ;; Jump to `then` if `c` is true, otherwise to `else`. (BrIf (cond Cond) (taken MachLabel) (not_taken MachLabel)) + ;; Fused band-immediate + brif: compute `dst = src & sign_extend(mask)`, + ;; then conditionally branch to `taken` if `src` is non-zero (low-32 or + ;; full 64-bit comparison per `size`), otherwise fall through to + ;; `not_taken`. The mask + dst write happen unconditionally. + ;; + ;; Emitted by the Cranelift Pulley backend at the call_indirect lazy-init + ;; brif site when the same funcref-pointer value feeds both the init-bit + ;; mask (`band v, -2`) and the null-check branch (`brif v`). The fusion + ;; saves one match_loop dispatch per call_indirect site. See pulley/ + ;; `xband*_s8_br_if_*` ops for the underlying bytecode. + (BandBrIf + (dst WritableXReg) + (src XReg) + (mask i8) + (size OperandSize) + (taken MachLabel) + (not_taken MachLabel)) + ;; Load the memory address referenced by `mem` into `dst`. 
(LoadAddr (dst WritableXReg) (mem Amode)) diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs index 74bff5d97a7d..41f295f10eb9 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs @@ -367,6 +367,79 @@ fn pulley_emit

( assert_eq!(sink.cur_offset(), not_taken_end); } + Inst::BandBrIf { + dst, + src, + mask, + size, + taken, + not_taken, + } => { + // The forward form branches to `taken` if `src` is non-zero + // (after computing `dst = src & sext(mask)`). The inverted form + // branches if `src` is zero — used by MachBuffer's fallthrough- + // flip optimization. Both must encode to equal-length bytes; the + // `_x*` and `_not_x*` ops share the same operand shape, so they + // do. + let dst_writable = *dst; + let src_reg = *src; + let mask_imm = *mask; + + // Compute the inverted-form encoding (branch on src == 0) into a + // SmallVec so MachBuffer can use it for branch-direction flipping. + let mut inverted = SmallVec::<[u8; 16]>::new(); + match size { + OperandSize::Size32 => { + enc::xband32_s8_br_if_not_x32( + &mut inverted, dst_writable, src_reg, mask_imm, 0, + ); + } + OperandSize::Size64 => { + enc::xband64_s8_br_if_not_x64( + &mut inverted, dst_writable, src_reg, mask_imm, 0, + ); + } + } + let len = inverted.len() as u32; + inverted.clear(); + let inv_rel = i32::try_from(len - 4).unwrap(); + match size { + OperandSize::Size32 => { + enc::xband32_s8_br_if_not_x32( + &mut inverted, dst_writable, src_reg, mask_imm, inv_rel, + ); + } + OperandSize::Size64 => { + enc::xband64_s8_br_if_not_x64( + &mut inverted, dst_writable, src_reg, mask_imm, inv_rel, + ); + } + } + assert!(len > 4); + + // Emit the forward form (branch on src != 0). + let taken_end = *start_offset + len; + sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::PcRel); + sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted); + patch_pc_rel_offset(sink, |sink| match size { + OperandSize::Size32 => { + enc::xband32_s8_br_if_x32(sink, dst_writable, src_reg, mask_imm, 0) + } + OperandSize::Size64 => { + enc::xband64_s8_br_if_x64(sink, dst_writable, src_reg, mask_imm, 0) + } + }); + debug_assert_eq!(sink.cur_offset(), taken_end); + + // Unconditional jump to `not_taken` for the fall-through path. 
+ let not_taken_start = taken_end + 1; + let not_taken_end = not_taken_start + 4; + sink.use_label_at_offset(not_taken_start, *not_taken, LabelUse::PcRel); + sink.add_uncond_branch(taken_end, not_taken_end, *not_taken); + patch_pc_rel_offset(sink, |sink| enc::jump(sink, 0)); + assert_eq!(sink.cur_offset(), not_taken_end); + } + Inst::LoadAddr { dst, mem } => { let base = mem.get_base_register(); let offset = mem.get_offset_with_state(state); diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs index 6bbe69795e51..91074dd89cf4 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs @@ -261,6 +261,18 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { cond.get_operands(collector); } + Inst::BandBrIf { + dst, + src, + mask: _, + size: _, + taken: _, + not_taken: _, + } => { + collector.reg_def(dst); + collector.reg_use(src); + } + Inst::LoadAddr { dst, mem } => { collector.reg_def(dst); mem.get_operands(collector); @@ -483,6 +495,7 @@ where | Inst::Rets { .. } => MachTerminator::Ret, Inst::Jump { .. } => MachTerminator::Branch, Inst::BrIf { .. } => MachTerminator::Branch, + Inst::BandBrIf { .. } => MachTerminator::Branch, Inst::BrTable { .. } => MachTerminator::Branch, Inst::ReturnCall { .. } | Inst::ReturnIndirectCall { .. 
} => MachTerminator::RetCall, Inst::Call { info } if info.try_call_info.is_some() => MachTerminator::Branch, @@ -762,6 +775,28 @@ impl Inst { format!("br_{cond}, {taken}; jump {not_taken}") } + Inst::BandBrIf { + dst, + src, + mask, + size, + taken, + not_taken, + } => { + let dst = format_reg(*dst.to_reg()); + let src = format_reg(**src); + let taken = taken.to_string(); + let not_taken = not_taken.to_string(); + let width = match size { + OperandSize::Size32 => 32, + OperandSize::Size64 => 64, + }; + format!( + "{dst} = xband{width}_s8 {src}, {mask}; \ + br_if_x{width} {src}, {taken}; jump {not_taken}" + ) + } + Inst::LoadAddr { dst, mem } => { let dst = format_reg(*dst.to_reg()); let mem = mem.to_string(); diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.rs b/cranelift/codegen/src/isa/pulley_shared/lower.rs index 2039c7de8dd3..8eb8625e5977 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower.rs +++ b/cranelift/codegen/src/isa/pulley_shared/lower.rs @@ -4,7 +4,8 @@ pub mod isle; use super::{PulleyBackend, PulleyTargetKind, inst::*}; use crate::{ - ir, + ir::{self, InstructionData, Opcode}, + isa::pulley_shared::inst::Inst, machinst::{lower::*, *}, }; @@ -24,6 +25,15 @@ where ir_inst: ir::Inst, targets: &[MachLabel], ) -> Option<()> { + // Peephole: fuse `brif (band v c) _ _` where the band's i8-fittable + // immediate `c` is the only thing standing between the brif's cond + // and the funcref load. Emitted by the call_indirect lazy-init + // brif site when `is_eagerly_initialized_funcref_table` lets us + // safely test the masked value. See the doc-comment on + // `MInst::BandBrIf` for the bytecode-level shape. + if try_fuse_band_brif(ctx, ir_inst, targets) { + return Some(()); + } isle::lower_branch(ctx, self, ir_inst, targets) } @@ -32,3 +42,109 @@ where None } } + +/// Recognise the `brif (band v c) block(...) 
cold` shape emitted by +/// `func_environ::get_or_init_func_ref_table_elem` under the +/// `is_eagerly_initialized_funcref_table` predicate, and fuse it into a +/// single `MInst::BandBrIf`. Returns `true` if the fusion fired; the caller +/// then skips the generic ISLE rule. +/// +/// Soundness: testing `v_masked != 0` instead of `v != 0` is identical on +/// every funcref-slot value REACHABLE in eagerly-initialized tables. The +/// only differing case is `v == 1` (the explicit tagged-null slot value), +/// which can only appear via runtime `table.fill(null)` and is therefore +/// excluded by the `tables_mutated == false` half of the predicate. +fn try_fuse_band_brif

( + ctx: &mut Lower>, + ir_inst: ir::Inst, + targets: &[MachLabel], +) -> bool +where + P: PulleyTargetKind, +{ + if targets.len() != 2 { + return false; + } + + let dfg = ctx.dfg(); + let InstructionData::Brif { + opcode: Opcode::Brif, + arg: cond, + .. + } = dfg.insts[ir_inst] + else { + return false; + }; + + // The brif's cond must be defined by a `band v -2`. We restrict the + // mask to exactly `-2` (the init-bit strip used by the call_indirect + // lazy-init brif site) because the fused op tests the UNMASKED `src` + // for non-zero, not the masked `dst`. That equivalence holds iff + // `(v & mask != 0) <=> (v != 0)`. For mask = -2 this holds for every + // funcref-slot value reachable in eagerly-initialized tables (the + // soundness argument from `is_eagerly_initialized_funcref_table`). + // For other masks the equivalence is generally false, so fusing + // would silently flip branch direction on user-code `band+brif` + // sites. See pulley/PR for the design discussion. + let band_inst = match dfg.value_def(cond).inst() { + Some(inst) => inst, + None => return false, + }; + let (band_src, band_imm) = match dfg.insts[band_inst] { + InstructionData::Binary { + opcode: Opcode::Band, + args: [a, b], + } => match dfg.value_def(b).inst() { + Some(b_inst) => match dfg.insts[b_inst] { + InstructionData::UnaryImm { + opcode: Opcode::Iconst, + imm, + } if imm.bits() == -2 => (a, -2_i8), + _ => return false, + }, + None => return false, + }, + _ => return false, + }; + + // Both ops of the fusion must agree on size: the band's result is the + // brif's cond, and its type drives the comparison width. 
+ let cond_ty = dfg.value_type(cond); + let size = match cond_ty { + ir::types::I32 => OperandSize::Size32, + ir::types::I64 => OperandSize::Size64, + _ => return false, + }; + + // Reuse the band-result vreg as the fused op's dst, so the block-arg + // machinery downstream observes the correct masked value via the same + // vreg (single def, single use — no SSA violation). The original band + // CLIF inst is then marked as absorbed and skipped in lower_clif_block. + let dst_vreg = ctx.put_value_in_regs(cond); + let dst_reg = dst_vreg.only_reg().expect("scalar band result"); + let dst = WritableXReg::try_from(Writable::from_reg(dst_reg)) + .expect("band result is an x-class register"); + let src = XReg::new(ctx.put_value_in_regs(band_src).only_reg().expect("scalar")) + .expect("band source is an x-class register"); + + // `put_value_in_regs(cond)` bumped value_lowered_uses[cond] above zero, + // which would normally force the band's standalone lowering. Sink the + // band as a pure absorption: the BandBrIf MInst we emit below produces + // exactly the same dst vreg, so any future use of `cond` (e.g. the + // brif's block-call argument) finds the right value already populated. 
+ ctx.sink_pure_inst(band_inst); + + ctx.emit( + Inst::BandBrIf { + dst, + src, + mask: band_imm, + size, + taken: targets[0], + not_taken: targets[1], + } + .into(), + ); + + true +} diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index 484c1060767a..59366715c5ef 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -1074,8 +1074,28 @@ impl<'module_environment> FuncEnvironment<'module_environment> { let result_param = builder.append_block_param(continuation_block, pointer_type); builder.set_cold_block(null_block); + // When the table is eagerly-initialized (immutable + precomputed + + // fully-covered + no-null), every funcref slot at runtime is either + // `0` (uninitialized, never observable here) or `addr | 1` (a real + // tagged pointer). The tagged-null value `1` — produced only by + // `table.fill(null)` on a tagged table — is excluded by the + // immutability half of the predicate. Under those conditions, + // `value != 0` and `value_masked != 0` agree on every reachable + // slot value, so we can test the masked result and unlock the + // Pulley backend's `band + brif` fusion at lowering time (see + // `MInst::BandBrIf` and `pulley_shared::lower::try_fuse_band_brif`). + // The fusion saves one match_loop dispatch per call_indirect + // site — the main lever once c1-7 pinned the predictor anchor. + let brif_cond = if self + .module + .is_eagerly_initialized_funcref_table(table_index) + { + value_masked + } else { + value + }; builder.ins().brif( - value, + brif_cond, continuation_block, &[value_masked.into()], null_block, diff --git a/tests/disas/call-indirect-immutable-elide-null.wat b/tests/disas/call-indirect-immutable-elide-null.wat index 35e2e0c7f0db..78ae8359edcf 100644 --- a/tests/disas/call-indirect-immutable-elide-null.wat +++ b/tests/disas/call-indirect-immutable-elide-null.wat @@ -27,9 +27,8 @@ ;; Fully cover the table — no null slot anywhere. 
(elem (i32.const 0) func $f1 $f2 $f3)) ;; function u0:0(i64 vmctx, i64) -> i32 tail { -;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv1 = load.i64 notrap aligned readonly gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -42,9 +41,8 @@ ;; } ;; ;; function u0:1(i64 vmctx, i64) -> i32 tail { -;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv1 = load.i64 notrap aligned readonly gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -57,9 +55,8 @@ ;; } ;; ;; function u0:2(i64 vmctx, i64) -> i32 tail { -;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv1 = load.i64 notrap aligned readonly gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -72,16 +69,14 @@ ;; } ;; ;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { -;; region0 = 8 "VMContext+0x8" -;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 +;; gv1 = load.i64 notrap aligned readonly gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; gv3 = vmctx ;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 ;; sig0 = (i64 vmctx, i64) -> i32 tail ;; sig1 = (i64 vmctx, i32, i64) -> i64 tail -;; fn0 = colocated u805306368:7 sig1 +;; fn0 = colocated u805306368:9 sig1 ;; stack_limit = gv2 ;; ;; block0(v0: i64, v1: i64, v2: i32): @@ -89,28 +84,28 @@ ;; @0050 v5 = icmp uge v2, v4 ; v4 = 3 ;; @0050 v6 = uextend.i64 v2 ;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48 -;; @0050 v8 = iconst.i64 3 -;; @0050 v9 = ishl v6, v8 ; v8 = 3 -;; @0050 v10 = iadd v7, v9 -;; @0050 v11 = iconst.i64 0 -;; @0050 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0 -;; @0050 v13 = load.i64 user6 aligned region1 v12 -;; @0050 v14 = iconst.i64 -2 -;; @0050 v15 = band v13, v14 ; v14 = -2 -;; @0050 
brif v13, block3(v15), block2 +;; v23 = iconst.i64 3 +;; @0050 v8 = ishl v6, v23 ; v23 = 3 +;; @0050 v9 = iadd v7, v8 +;; @0050 v10 = iconst.i64 0 +;; @0050 v11 = select_spectre_guard v5, v10, v9 ; v10 = 0 +;; @0050 v12 = load.i64 user5 aligned table v11 +;; v22 = iconst.i64 -2 +;; @0050 v13 = band v12, v22 ; v22 = -2 +;; @0050 brif v13, block3(v13), block2 ;; ;; block2 cold: -;; @0050 v17 = iconst.i32 0 -;; @0050 v18 = uextend.i64 v2 -;; @0050 v19 = call fn0(v0, v17, v18) ; v17 = 0 -;; @0050 jump block3(v19) +;; @0050 v15 = iconst.i32 0 +;; @0050 v17 = uextend.i64 v2 +;; @0050 v18 = call fn0(v0, v15, v17) ; v15 = 0 +;; @0050 jump block3(v18) ;; -;; block3(v16: i64): -;; @0050 v20 = load.i64 notrap aligned readonly v16+8 -;; @0050 v21 = load.i64 notrap aligned readonly v16+24 -;; @0050 v22 = call_indirect sig0, v20(v21, v0) +;; block3(v14: i64): +;; @0050 v19 = load.i64 notrap aligned readonly v14+8 +;; @0050 v20 = load.i64 notrap aligned readonly v14+24 +;; @0050 v21 = call_indirect sig0, v19(v20, v0) ;; @0053 jump block1 ;; ;; block1: -;; @0053 return v22 +;; @0053 return v21 ;; } diff --git a/tests/disas/pulley-call-indirect-band-brif-fusion.wat b/tests/disas/pulley-call-indirect-band-brif-fusion.wat new file mode 100644 index 000000000000..3e51c92714df --- /dev/null +++ b/tests/disas/pulley-call-indirect-band-brif-fusion.wat @@ -0,0 +1,229 @@ +;;! target = "pulley64" +;;! test = "compile" +;;! objdump = "--funcs all" + +;; Immutable funcref table fully populated by a static elem segment — the +;; `is_eagerly_initialized_funcref_table` predicate holds. The Cranelift +;; Pulley backend rewrites the call_indirect lazy-init brif to test the +;; masked funcref value, then `pulley_shared::lower::try_fuse_band_brif` +;; folds the band-imm + brif into one `xband64_s8_br_if_x64` Pulley op. 
+;; +;; What we pin here: the fused op appears in the disassembly of the +;; call_indirect dispatch tail, and the standalone `xband64_s8` that +;; would otherwise produce the masked value is gone (absorbed by the +;; fused op via `Lower::sink_pure_inst`). + +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 16, x25 +;; xmov x3, x0 +;; br_if_xugteq32_u8 x2, 3, 0x54 // target = 0x6d +;; 20: xload64le_o32 x0, x0, 48 +;; zext32 x15, x2 +;; xshl64_u6 x1, x15, 3 +;; xadd64 x0, x0, x1 +;; xload64le_o32 x0, x0, 0 +;; xband64_s8_br_if_not_x64 x0, x0, -2, 0x24 // target = 0x5b +;; 3f: xmov x25, x3 +;; xload64le_o32 x2, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; xmov x1, x25 +;; call_indirect x2 +;; pop_frame_restore 16, x25 +;; ret +;; 5b: xzero x0 +;; 5d: xmov x25, x3 +;; 60: call3 x25, x0, x15, 0x267 // target = 0x2c7 +;; 68: jump -0x26 // target = 0x42 +;; 6d: trap +;; ╰─╼ trap: TableOutOfBounds +;; +;; wasm[0]::array_to_wasm_trampoline[0]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 72, x14 +;; xmov x14, sp +;; xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0xc2 +;; xstore64le_o32 x13, 80, x15 +;; call -0xa7 // target = 0x0 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc2 +;; xload64le_o32 x2, 
sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; c2: xzero x0 +;; c4: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; c9: ret +;; +;; wasm[0]::array_to_wasm_trampoline[1]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 72, x14 +;; xmov x14, sp +;; xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0x11c +;; xstore64le_o32 x13, 80, x15 +;; call -0xfc // target = 0x5 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x11c +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 11c: xzero x0 +;; 11e: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 123: ret +;; +;; wasm[0]::array_to_wasm_trampoline[2]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 72, x14 +;; xmov x14, sp +;; xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0x176 +;; xstore64le_o32 x13, 80, x15 +;; call -0x150 // target = 0xb +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x176 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 176: xzero x0 +;; 178: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, 
x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 17d: ret +;; +;; wasm[0]::array_to_wasm_trampoline[3]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xload32le_o32 x14, x2, 0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x15, x0, 8 +;; xmov_fp x2 +;; xstore64le_o32 x15, 72, x2 +;; xmov x2, sp +;; xstore64le_o32 x15, 64, x2 +;; xpcadd x2, 0x2d // target = 0x1da +;; xstore64le_o32 x15, 80, x2 +;; call3 x0, x1, x14, -0x1ab // target = 0x11 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1da +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 1da: xzero x0 +;; 1dc: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1e1: ret +;; +;; signatures[0]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x2, x0 +;; xmov x17, x1 +;; xload64le_o32 x13, x1, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 48, x14 +;; xmov_lr x14 +;; xstore64le_o32 x13, 56, x14 +;; xload64le_o32 x0, x0, 8 +;; xmov x16, sp +;; xone x4 +;; xmov x1, x2 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x15, x0 +;; br_if_not32 x15, 0x13 // target = 0x239 +;; 22c: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 239: xmov x1, x17 +;; 23c: xload64le_o32 x0, x1, 16 +;; 243: xload64le_o32 x0, x0, 408 +;; 24a: call_indirect_host 52 +;; 24e: trap +;; +;; signatures[1]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x3, x0 +;; xmov x17, x1 +;; xload64le_o32 x14, x1, 8 +;; xmov_fp x15 +;; xstore64le_o32 x14, 48, x15 +;; xmov_lr x15 +;; xstore64le_o32 x14, 56, x15 +;; xmov x16, sp +;; xstore32le_o32 x16, 0, x2 +;; xload64le_o32 x0, x0, 8 +;; xone x4 +;; xmov x1, x3 +;; xmov x2, x17 +;; xmov x3, x16 +;; 
call_indirect_host 0 +;; zext8 x0, x0 +;; br_if_not32 x0, 0x13 // target = 0x2af +;; 2a2: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 2af: xmov x1, x17 +;; 2b2: xload64le_o32 x0, x1, 16 +;; 2b9: xload64le_o32 x0, x0, 408 +;; 2c0: call_indirect_host 52 +;; 2c4: trap +;; +;; wasmtime_builtin_table_get_lazy_init_func_ref: +;; push_frame +;; xload64le_o32 x9, x0, 8 +;; xmov_fp x10 +;; xstore64le_o32 x9, 48, x10 +;; xmov_lr x10 +;; xstore64le_o32 x9, 56, x10 +;; xload64le_o32 x11, x0, 16 +;; xmov x13, x0 +;; xload64le_o32 x0, x11, 72 +;; xmov x3, x2 +;; xmov x2, x1 +;; xmov x1, x13 +;; call_indirect_host 10 +;; pop_frame +;; ret From e7e7a6d1aa32ce85d009109924f33dc005bbaa4d Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 20:02:37 -0700 Subject: [PATCH 13/22] pulley: add xfuncref_dispatch fused dispatch ops Add `xfuncref_dispatch_{x64,not_x64,x32,not_x32}` ops: each one takes a pre-masked funcref pointer, loads `wasm_call` and callee `vmctx` from offsets `offset_code`/`offset_vmctx`, and conditionally branches by `offset` on whether the pointer is null. The branch direction (`x64` = branch on non-null, `not_x64` = branch on null) is chosen by MachBuffer's fall-through optimization at emit time. Consumed by phase-2 Cranelift fusion (next commit). 
--- pulley/src/interp.rs | 107 +++++++++++++++++++++++++++++++++++++++++++ pulley/src/lib.rs | 42 +++++++++++++++++ 2 files changed, 149 insertions(+) diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs index 207ae2355487..7520bca7d92c 100644 --- a/pulley/src/interp.rs +++ b/pulley/src/interp.rs @@ -2360,6 +2360,113 @@ impl OpVisitor for Interpreter<'_> { } } + fn xfuncref_dispatch_x64( + &mut self, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + // `src` is the ALREADY-MASKED funcref (`band v, -2` upstream — the + // band stays as a separate Pulley op; this fusion absorbs only the + // brif + the two field loads). The branch fires when src != 0, + // matching the brif's original semantics in the eager-init + // predicate's IR rewrite (`brif value_masked, taken, null`). Under + // the predicate `src` is never zero at runtime, but the handler + // still has to match the brif's null fall-through as + // defence-in-depth. + let s = self.state[src].get_u64(); + if s == 0 { + ControlFlow::Continue(()) + } else { + // SAFETY: under the eager-init predicate, the wasmtime runtime + // enforces that the funcref slot contains a real VMFuncRef + // pointer, so the field loads are valid memory accesses. 
+ let base = s as *const u8; + unsafe { + let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); + let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + self.state[dst_code].set_i64(code); + self.state[dst_vmctx].set_i64(vmctx); + } + self.pc_rel_jump::(offset) + } + } + + fn xfuncref_dispatch_not_x64( + &mut self, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_u64(); + if s == 0 { + self.pc_rel_jump::(offset) + } else { + let base = s as *const u8; + unsafe { + let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); + let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + self.state[dst_code].set_i64(code); + self.state[dst_vmctx].set_i64(vmctx); + } + ControlFlow::Continue(()) + } + } + + fn xfuncref_dispatch_x32( + &mut self, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_u32(); + if s == 0 { + ControlFlow::Continue(()) + } else { + let base = s as usize as *const u8; + unsafe { + let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); + let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + self.state[dst_code].set_i32(code); + self.state[dst_vmctx].set_i32(vmctx); + } + self.pc_rel_jump::(offset) + } + } + + fn xfuncref_dispatch_not_x32( + &mut self, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_u32(); + if s == 0 { + self.pc_rel_jump::(offset) + } else { + let base = s as usize as *const u8; + unsafe { + let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); + let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + 
self.state[dst_code].set_i32(code); + self.state[dst_vmctx].set_i32(vmctx); + } + ControlFlow::Continue(()) + } + } + fn xbor32(&mut self, operands: BinaryOperands) -> ControlFlow { let a = self.state[operands.src1].get_u32(); let b = self.state[operands.src2].get_u32(); diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs index f29240b1dfd8..9eb074c7e3e9 100644 --- a/pulley/src/lib.rs +++ b/pulley/src/lib.rs @@ -583,6 +583,48 @@ macro_rules! for_each_op { xband64_s8_br_if_x64 = Xband64S8BrIfX64 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; /// Inverted form of `xband64_s8_br_if_x64`: branch if `src` is zero. xband64_s8_br_if_not_x64 = Xband64S8BrIfNotX64 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; + + /// Funcref-dispatch fusion (64-bit pointer width): if `src` is + /// non-zero, load the `wasm_call` code pointer from + /// `src + offset_code` into `dst_code`, load the callee vmctx + /// pointer from `src + offset_vmctx` into `dst_vmctx`, then + /// conditionally branch by `offset`. `src` is the + /// already-masked funcref pointer (`band v, -2` upstream). + /// + /// Forward form: loads-and-branch fire on the non-null side + /// (`src != 0`); the null side falls through. Used at the + /// call_indirect lazy-init brif site under + /// `is_eagerly_initialized_funcref_table` AND when the signature + /// check is statically elided — under those conditions the only + /// uses of the masked funcref pointer are the two `VMFuncRef` + /// field loads, and the brif's null branch is provably + /// unreachable at runtime. The handler's runtime null check is + /// defence-in-depth (matching the original brif's role); it MUST + /// fall through on null so the slow path's lazy-init builtin + /// stays callable in the (provably-unreachable) error case. + /// + /// Fused form of `br_if + xload64 + xload64` (the preceding + /// `xband64_s8` stays as a separate op since `src` is consumed + /// here as the band's result). 
Saves 2 match_loop dispatches + /// per call_indirect site vs the unfused sequence. At the same + /// call site, phase-1's `xband64_s8_br_if_*` ops are not + /// emitted (the recogniser prefers this larger fusion when its + /// pattern matches), so the per-new-opcode predictor cost + /// stays at one new op family rather than two. + xfuncref_dispatch_x64 = XfuncrefDispatchX64 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// Inverted form of `xfuncref_dispatch_x64`: the null side + /// branches and the loads-and-fall-through fire on `src != 0`. + /// Used by MachBuffer's branch-direction-flip fallthrough + /// optimization when the fast path is the natural fall-through. + xfuncref_dispatch_not_x64 = XfuncrefDispatchNotX64 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// 32-bit pointer-width form of `xfuncref_dispatch_x64`. Same + /// semantics; `src`, `dst_code`, `dst_vmctx` are i32 loaded into + /// the low halves of their XReg slots. Used on `pulley32` / + /// arm64_32-apple-watchos. + xfuncref_dispatch_x32 = XfuncrefDispatchX32 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// Inverted form of `xfuncref_dispatch_x32`. + xfuncref_dispatch_not_x32 = XfuncrefDispatchNotX32 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// `low32(dst) = low32(src1) | low32(src2)` xbor32 = XBor32 { operands: BinaryOperands }; /// Same as `xbor64` but `src2` is a sign-extended 8-bit immediate. 
From a26cd28d1e061bca8d6706496df6055233efd81e Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 20:02:37 -0700 Subject: [PATCH 14/22] cranelift: LowerBackend::pre_lower hook + sink_pure_inst loads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `LowerBackend::pre_lower(ctx)`, called once before `lower_clif_block` iteration begins. Backends override it to scan the whole function and mark instructions as absorbed via `sink_pure_inst`, which now accepts `readonly` + `notrap` loads in addition to instructions that have no lowering side effect. Required for the phase-2 funcref-dispatch fusion which absorbs the two `VMFuncRef` field loads from the continuation block into the brif's MachInst — a cross-block sink that can't be expressed in the per-block reverse-iteration lowering order. --- cranelift/codegen/src/machinst/compile.rs | 9 ++++++- cranelift/codegen/src/machinst/lower.rs | 32 ++++++++++++++++++++++- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/machinst/compile.rs b/cranelift/codegen/src/machinst/compile.rs index 7747d804cafe..91d1da84c910 100644 --- a/cranelift/codegen/src/machinst/compile.rs +++ b/cranelift/codegen/src/machinst/compile.rs @@ -25,9 +25,16 @@ pub fn compile( let block_order = BlockLoweringOrder::new(f, domtree, ctrl_plane); // Build the lowering context. - let lower = + let mut lower = crate::machinst::Lower::new(f, abi, emit_info, block_order, sigs, b.flags().clone())?; + // Backend-specific pre-lowering analysis. Default impl on LowerBackend + // is a no-op; Pulley overrides it to mark continuation-block loads as + // absorbed_pure when the call_indirect lazy-init brif pattern is + // present, so they can be fused into a single Pulley dispatch op + // emitted at the brif's lowering time. + b.pre_lower(&mut lower); + // Lower the IR.
let vcode = { log::debug!( diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index 89479b9a25e7..bdb0ac5edf50 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -148,6 +148,18 @@ pub trait LowerBackend { fn maybe_pinned_reg(&self) -> Option { None } + + /// Backend-specific analysis hook, run once after `Lower::new` but + /// before the main reverse-block lowering loop. Default: no-op. + /// + /// Use this to mark instructions as `sink_pure_inst` when they will be + /// absorbed by a fused MachInst emitted in a different (earlier-in-CFG, + /// later-in-reverse-order) block. The block-by-block lowering loop + /// processes blocks in reverse, so cross-block absorption can't be + /// arranged at the absorbing instruction's lowering time — it has to be + /// arranged here, before any block is lowered. Within a single block, + /// `sink_pure_inst` called during normal lowering is still sufficient. + fn pre_lower(&self, _ctx: &mut Lower) {} } /// Machine-independent lowering driver / machine-instruction container. Maintains a correspondence @@ -1703,8 +1715,26 @@ impl<'func, I: VCodeInst> Lower<'func, I> { /// lowering side effect: it is specifically for pure ALU ops whose value /// flows into the fused MachInst's output operand. Color tracking is /// likewise unnecessary because pure insts have no color anchor. + /// + /// We additionally allow absorbing trusted readonly loads — CLIF + /// considers them side-effecting (via `can_load()`), but the + /// `notrap + readonly` flags assert they're safe to skip from the + /// codegen's perspective. The absorbing MachInst takes responsibility + /// for performing the load itself. Color tracking is still + /// unnecessary because we're not moving a side-effecting op — we're + /// telling the lowerer it has been handled elsewhere. 
pub fn sink_pure_inst(&mut self, ir_inst: Inst) { - assert!(!has_lowering_side_effect(self.f, ir_inst)); + let dfg_inst = &self.f.dfg.insts[ir_inst]; + let is_pure = !has_lowering_side_effect(self.f, ir_inst); + let is_safe_load = matches!( + dfg_inst, + InstructionData::Load { + opcode: crate::ir::Opcode::Load, + flags, + .. + } if flags.readonly() && flags.notrap() + ); + assert!(is_pure || is_safe_load); self.inst_absorbed_pure.insert(ir_inst); } From aff4f82d1013a81b4b9bc7c705c783b0e230165f Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 20:02:37 -0700 Subject: [PATCH 15/22] cranelift/pulley: fuse brif + 2 xloads at call_indirect dispatch tail Phase-2 fusion. Matches the canonical call_indirect lazy-init shape: band v, -2 -> cond brif cond, continuation([cond]), null_block([]) continuation(funcref_ptr): code = load funcref_ptr + offset_code vmctx = load funcref_ptr + offset_vmctx `pre_lower` sinks the two continuation-block loads; `lower_branch` on the brif emits `MInst::FuncrefDispatch` (encoded as `xfuncref_dispatch_*`). The band stays as a separate `xband_s8` op because its result feeds the brif test and the continuation block param. Dispatch tail at the call_indirect lazy-init site shrinks from 5 Pulley dispatches to 3 (band, fused dispatch, call_indirect). 
--- .../codegen/src/isa/pulley_shared/inst.isle | 23 ++ .../src/isa/pulley_shared/inst/emit.rs | 75 ++++ .../codegen/src/isa/pulley_shared/inst/mod.rs | 42 +++ .../codegen/src/isa/pulley_shared/lower.rs | 355 +++++++++++++++++- .../pulley-call-indirect-band-brif-fusion.wat | 132 ++++--- 5 files changed, 561 insertions(+), 66 deletions(-) diff --git a/cranelift/codegen/src/isa/pulley_shared/inst.isle b/cranelift/codegen/src/isa/pulley_shared/inst.isle index 1326e3ae407b..18cf6ac84a5c 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst.isle +++ b/cranelift/codegen/src/isa/pulley_shared/inst.isle @@ -85,6 +85,29 @@ (taken MachLabel) (not_taken MachLabel)) + ;; Funcref-dispatch fusion: branch-and-load fused dispatch op emitted + ;; at the call_indirect lazy-init brif site under the eager-init + ;; predicate + statically-elided sig check. Given + ;; `band v, -2 ; brif ; xload (v_masked + offset_code) ; xload (v_masked + offset_vmctx)` + ;; across the brif's predecessor and continuation blocks, fuses the brif and both xloads (the band stays a separate op) into one + ;; Pulley dispatch. The pulley-side ops are `xfuncref_dispatch_{x64, + ;; not_x64, x32, not_x32}`; the runtime null check is defence-in-depth + ;; (the predicate guarantees the funcref is non-null at runtime, but + ;; the handler must match the original brif's null branch as a + ;; correctness fallback). See `try_fuse_funcref_dispatch` in + ;; `pulley_shared::lower` for the recogniser + Pulley's + ;; `pre_lower` hook for the cross-block sink that lets the + ;; continuation-block loads be absorbed into one MachInst here. + (FuncrefDispatch + (dst_code WritableXReg) + (dst_vmctx WritableXReg) + (src XReg) + (offset_code i8) + (offset_vmctx i8) + (size OperandSize) + (taken MachLabel) + (not_taken MachLabel)) + + ;; Load the memory address referenced by `mem` into `dst`. 
(LoadAddr (dst WritableXReg) (mem Amode)) diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs index 41f295f10eb9..a7771dd1baee 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs @@ -440,6 +440,81 @@ fn pulley_emit

( assert_eq!(sink.cur_offset(), not_taken_end); } + Inst::FuncrefDispatch { + dst_code, + dst_vmctx, + src, + offset_code, + offset_vmctx, + size, + taken, + not_taken, + } => { + // Same scaffolding as Inst::BrIf / Inst::BandBrIf. Forward + // form's branch fires on `src != 0` (after loads); inverted + // form branches on `src == 0` (loads on fall-through). Both + // encodings have the same length because they share the + // 5-operand shape. + let dst_code_w = *dst_code; + let dst_vmctx_w = *dst_vmctx; + let src_reg = *src; + let oc = *offset_code; + let ov = *offset_vmctx; + + // Inverted encoding into a scratch SmallVec for MachBuffer. + let mut inverted = SmallVec::<[u8; 16]>::new(); + match size { + OperandSize::Size32 => { + enc::xfuncref_dispatch_not_x32( + &mut inverted, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0, + ); + } + OperandSize::Size64 => { + enc::xfuncref_dispatch_not_x64( + &mut inverted, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0, + ); + } + } + let len = inverted.len() as u32; + inverted.clear(); + let inv_rel = i32::try_from(len - 4).unwrap(); + match size { + OperandSize::Size32 => { + enc::xfuncref_dispatch_not_x32( + &mut inverted, dst_code_w, dst_vmctx_w, src_reg, oc, ov, inv_rel, + ); + } + OperandSize::Size64 => { + enc::xfuncref_dispatch_not_x64( + &mut inverted, dst_code_w, dst_vmctx_w, src_reg, oc, ov, inv_rel, + ); + } + } + assert!(len > 4); + + // Emit the forward form (branch on src != 0). 
+ let taken_end = *start_offset + len; + sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::PcRel); + sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted); + patch_pc_rel_offset(sink, |sink| match size { + OperandSize::Size32 => enc::xfuncref_dispatch_x32( + sink, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0, + ), + OperandSize::Size64 => enc::xfuncref_dispatch_x64( + sink, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0, + ), + }); + debug_assert_eq!(sink.cur_offset(), taken_end); + + // Unconditional jump to `not_taken` for the fall-through path. + let not_taken_start = taken_end + 1; + let not_taken_end = not_taken_start + 4; + sink.use_label_at_offset(not_taken_start, *not_taken, LabelUse::PcRel); + sink.add_uncond_branch(taken_end, not_taken_end, *not_taken); + patch_pc_rel_offset(sink, |sink| enc::jump(sink, 0)); + assert_eq!(sink.cur_offset(), not_taken_end); + } + Inst::LoadAddr { dst, mem } => { let base = mem.get_base_register(); let offset = mem.get_offset_with_state(state); diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs index 91074dd89cf4..cc950365cd10 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs @@ -273,6 +273,21 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { collector.reg_use(src); } + Inst::FuncrefDispatch { + dst_code, + dst_vmctx, + src, + offset_code: _, + offset_vmctx: _, + size: _, + taken: _, + not_taken: _, + } => { + collector.reg_def(dst_code); + collector.reg_def(dst_vmctx); + collector.reg_use(src); + } + Inst::LoadAddr { dst, mem } => { collector.reg_def(dst); mem.get_operands(collector); @@ -496,6 +511,7 @@ where Inst::Jump { .. } => MachTerminator::Branch, Inst::BrIf { .. } => MachTerminator::Branch, Inst::BandBrIf { .. } => MachTerminator::Branch, + Inst::FuncrefDispatch { .. } => MachTerminator::Branch, Inst::BrTable { .. 
} => MachTerminator::Branch, Inst::ReturnCall { .. } | Inst::ReturnIndirectCall { .. } => MachTerminator::RetCall, Inst::Call { info } if info.try_call_info.is_some() => MachTerminator::Branch, @@ -797,6 +813,32 @@ impl Inst { ) } + Inst::FuncrefDispatch { + dst_code, + dst_vmctx, + src, + offset_code, + offset_vmctx, + size, + taken, + not_taken, + } => { + let dst_code = format_reg(*dst_code.to_reg()); + let dst_vmctx = format_reg(*dst_vmctx.to_reg()); + let src = format_reg(**src); + let taken = taken.to_string(); + let not_taken = not_taken.to_string(); + let width = match size { + OperandSize::Size32 => 32, + OperandSize::Size64 => 64, + }; + format!( + "{dst_code}, {dst_vmctx} = xfuncref_dispatch_x{width} \ + {src}, code+{offset_code}, vmctx+{offset_vmctx}; \ + br_if {taken}; jump {not_taken}" + ) + } + Inst::LoadAddr { dst, mem } => { let dst = format_reg(*dst.to_reg()); let mem = mem.to_string(); diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.rs b/cranelift/codegen/src/isa/pulley_shared/lower.rs index 8eb8625e5977..2854b38a5897 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower.rs +++ b/cranelift/codegen/src/isa/pulley_shared/lower.rs @@ -25,12 +25,18 @@ where ir_inst: ir::Inst, targets: &[MachLabel], ) -> Option<()> { - // Peephole: fuse `brif (band v c) _ _` where the band's i8-fittable - // immediate `c` is the only thing standing between the brif's cond - // and the funcref load. Emitted by the call_indirect lazy-init - // brif site when `is_eagerly_initialized_funcref_table` lets us - // safely test the masked value. See the doc-comment on - // `MInst::BandBrIf` for the bytecode-level shape. + // Phase-2 first: try fusing band+brif+xload+xload across the brif's + // predecessor block and its taken (continuation) target. 
The matching + // continuation-block loads were marked `absorbed_pure` by the + // `pre_lower` analysis hook below, so they have already been skipped + // in `lower_clif_block` and the FuncrefDispatch MachInst here defs + // their result vregs directly. + if try_fuse_funcref_dispatch::

(ctx, ir_inst, targets) { + return Some(()); + } + // Phase-1 fallback: fuse just band+brif (no continuation loads). + // Emits MInst::BandBrIf. See the doc-comment on the variant in + // `pulley_shared::inst::Inst`. if try_fuse_band_brif(ctx, ir_inst, targets) { return Some(()); } @@ -41,6 +47,28 @@ where // Pulley does not support this feature right now. None } + + fn pre_lower(&self, ctx: &mut Lower) { + // Cross-block fusion analysis for phase-2 funcref dispatch. + // + // The main block-lowering loop runs in reverse layout order, so by + // the time `lower_branch` fires for the predecessor's brif, its + // taken target (the continuation block) has already had its + // instructions emitted to VCode. Marking the continuation's loads + // as `inst_absorbed_pure` AFTER that point is too late — the loads + // have already been lowered into MachInsts that write to the + // result vregs, and the FuncrefDispatch we'd emit at brif time + // would double-write to those same vregs (SSA violation). + // + // This analysis runs once before any block is lowered. For each + // brif whose cond is `band(v, -2)` AND whose taken target is a + // block that starts with two loads from the brif's first + // block-call-arg at the canonical VMFuncRef wasm_call / vmctx + // offsets, mark the two loads (not the band) as absorbed_pure. The brif + // lowering then sees a clean slate (no double-writes) and emits + // one FuncrefDispatch MachInst. + pre_lower_pulley(ctx, P::pointer_width().bytes()); + } } /// Recognise the `brif (band v c) block(...) cold` shape emitted by @@ -148,3 +176,318 @@ where true } + +/// VMFuncRef field offsets, parameterised on the Pulley pointer width. +/// +/// Mirrors `crates/environ/src/vmoffsets.rs`'s `vm_func_ref_wasm_call` (= +/// 1 * size) and `vm_func_ref_vmctx` (= 3 * size). 
Both fit in i8 for both +/// pointer widths (8 + 24 on 64-bit, 4 + 12 on 32-bit), which is the +/// constraint imposed by the pulley `xfuncref_dispatch_*` ops (i8 +/// sign-extended offsets). +fn vm_func_ref_offsets(pointer_bytes: u8) -> (i8, i8) { + let size = pointer_bytes as i8; + (size, size.checked_mul(3).expect("VMFuncRef offsets fit i8")) +} + +/// Recognise the canonical funcref-dispatch shape produced by +/// `func_environ::get_or_init_func_ref_table_elem` followed by +/// `load_code_and_vmctx` under the eager-init predicate + statically- +/// elided sig check: +/// +/// ```text +/// predecessor: +/// value = load .ptr (table_entry + 0) +/// value_masked = band value, -2 +/// brif value_masked, continuation([value_masked]), null_block([]) +/// +/// continuation(funcref_ptr): +/// code = load .ptr (funcref_ptr + offset_code) +/// vmctx = load .ptr (funcref_ptr + offset_vmctx) +/// ... <- other uses of code, vmctx +/// ``` +/// +/// If found, returns the brif inst, the band inst, the two load insts (in +/// continuation), the funcref source value `v` (band's first arg), the +/// CLIF result values `code` and `vmctx`, and the offsets. Otherwise None. +fn match_funcref_dispatch_pattern( + f: &ir::Function, + brif_inst: ir::Inst, + pointer_bytes: u8, +) -> Option { + let dfg = &f.dfg; + let InstructionData::Brif { + opcode: Opcode::Brif, + arg: cond, + blocks, + .. 
+ } = dfg.insts[brif_inst] + else { + return None; + }; + // cond = band(v, -2) + let band_inst = dfg.value_def(cond).inst()?; + let (v, _imm) = match dfg.insts[band_inst] { + InstructionData::Binary { + opcode: Opcode::Band, + args: [a, b], + } => match dfg.value_def(b).inst() { + Some(b_inst) => match dfg.insts[b_inst] { + InstructionData::UnaryImm { + opcode: Opcode::Iconst, + imm, + } if imm.bits() == -2 => (a, -2_i8), + _ => return None, + }, + None => return None, + }, + _ => return None, + }; + let cond_ty = dfg.value_type(cond); + let size = match cond_ty { + ir::types::I32 => OperandSize::Size32, + ir::types::I64 => OperandSize::Size64, + _ => return None, + }; + // The 64-bit fused op handles I64 pointer types; the 32-bit fused op + // handles I32. They line up with the target's pointer width. + let expected_size = match pointer_bytes { + 4 => OperandSize::Size32, + 8 => OperandSize::Size64, + _ => return None, + }; + if size != expected_size { + return None; + } + + // Taken target = continuation block. Its first block param must equal + // the brif's first block-call-arg (i.e. value_masked). + let taken_call = blocks[0]; + let continuation = taken_call.block(&dfg.value_lists); + let taken_args: smallvec::SmallVec<[ir::BlockArg; 4]> = + taken_call.args(&dfg.value_lists).collect(); + if taken_args.len() < 1 { + return None; + } + let first_arg_val = match taken_args[0] { + ir::BlockArg::Value(v) => v, + _ => return None, + }; + if first_arg_val != cond { + // The brif must pass value_masked as the first block-call-arg. + return None; + } + let cont_params = dfg.block_params(continuation); + if cont_params.is_empty() { + return None; + } + let funcref_ptr = cont_params[0]; + + // First two instructions in the continuation block must be the two + // canonical loads. We tolerate the block-param ordering: load1 is + // at offset_code, load2 at offset_vmctx (in either positional order). 
+ let (offset_code_expected, offset_vmctx_expected) = vm_func_ref_offsets(pointer_bytes); + let mut iter = f.layout.block_insts(continuation); + let load1 = iter.next()?; + let load2 = iter.next()?; + let (load_code_inst, load_vmctx_inst) = classify_funcref_loads( + dfg, + load1, + load2, + funcref_ptr, + offset_code_expected, + offset_vmctx_expected, + cond_ty, + )?; + let code_val = dfg.inst_results(load_code_inst)[0]; + let vmctx_val = dfg.inst_results(load_vmctx_inst)[0]; + + Some(FuncrefDispatchPattern { + band_inst, + load_code_inst, + load_vmctx_inst, + v, + code_val, + vmctx_val, + offset_code: offset_code_expected, + offset_vmctx: offset_vmctx_expected, + size, + }) +} + +struct FuncrefDispatchPattern { + band_inst: ir::Inst, + load_code_inst: ir::Inst, + load_vmctx_inst: ir::Inst, + v: ir::Value, + code_val: ir::Value, + vmctx_val: ir::Value, + offset_code: i8, + offset_vmctx: i8, + size: OperandSize, +} + +fn classify_funcref_loads( + dfg: &ir::DataFlowGraph, + a: ir::Inst, + b: ir::Inst, + funcref_ptr: ir::Value, + offset_code: i8, + offset_vmctx: i8, + pointer_ty: ir::Type, +) -> Option<(ir::Inst, ir::Inst)> { + let (a_off, a_base) = classify_load(dfg, a, pointer_ty)?; + let (b_off, b_base) = classify_load(dfg, b, pointer_ty)?; + if a_base != funcref_ptr || b_base != funcref_ptr { + return None; + } + if a_off == offset_code && b_off == offset_vmctx { + Some((a, b)) + } else if a_off == offset_vmctx && b_off == offset_code { + Some((b, a)) + } else { + None + } +} + +fn classify_load( + dfg: &ir::DataFlowGraph, + inst: ir::Inst, + pointer_ty: ir::Type, +) -> Option<(i8, ir::Value)> { + match dfg.insts[inst] { + InstructionData::Load { + opcode: Opcode::Load, + arg, + offset, + .. 
+ } => { + let result = *dfg.inst_results(inst).first()?; + if dfg.value_type(result) != pointer_ty { + return None; + } + let off_i32: i32 = offset.into(); + let off_i8 = i8::try_from(off_i32).ok()?; + Some((off_i8, arg)) + } + _ => None, + } +} + +/// Pulley-specific pre-lowering analysis. Walks every block looking for +/// the funcref-dispatch fusion shape (see +/// `match_funcref_dispatch_pattern`), and when it matches, sinks the two +/// continuation-block loads (but not the band) via `sink_pure_inst`. The +/// brif's lowering (in `try_fuse_funcref_dispatch`) then emits one +/// `MInst::FuncrefDispatch` whose def vregs replace the absorbed loads' +/// def vregs. +fn pre_lower_pulley

(ctx: &mut Lower>, pointer_bytes: u8) +where + P: PulleyTargetKind, +{ + // Collect candidates first so we don't hold &ctx.f while calling + // sink_pure_inst (which takes &mut ctx). + // + // We only absorb the two field loads, NOT the band. The band stays + // as a separate Pulley `xband_s8` op because `cond` (the band's + // result) is the SOURCE vreg consumed by FuncrefDispatch — that + // already-masked value gives us the branch test (`src != 0`) with + // the same predictor-anchor semantics as the original brif. If we + // also absorbed the band, FuncrefDispatch would have nothing + // defining `cond`'s vreg, and the predecessor brif's block-call-arg + // copy (which passes `cond` to the continuation block param) would + // see an undefined vreg. + let mut to_sink: smallvec::SmallVec<[(ir::Inst, ir::Inst); 8]> = smallvec::SmallVec::new(); + { + let f = ctx.f; + for block in f.layout.blocks() { + let Some(term) = f.layout.last_inst(block) else { + continue; + }; + if !matches!(f.dfg.insts[term], InstructionData::Brif { .. }) { + continue; + } + if let Some(pat) = match_funcref_dispatch_pattern::

(f, term, pointer_bytes) { + to_sink.push((pat.load_code_inst, pat.load_vmctx_inst)); + } + } + } + for (l_code, l_vmctx) in to_sink { + ctx.sink_pure_inst(l_code); + ctx.sink_pure_inst(l_vmctx); + } +} + +/// Phase-2 fusion: emit `MInst::FuncrefDispatch` when the brif matches the +/// canonical pattern. Relies on the pre-pass having marked the two +/// continuation-block loads (not the band) as absorbed_pure; this routine +/// just re-derives the pattern, looks up the relevant vregs, and emits the +/// single fused MachInst. Returns `true` iff the fusion fired. +fn try_fuse_funcref_dispatch

( + ctx: &mut Lower>, + ir_inst: ir::Inst, + targets: &[MachLabel], +) -> bool +where + P: PulleyTargetKind, +{ + if targets.len() != 2 { + return false; + } + let pointer_bytes = P::pointer_width().bytes(); + let Some(pat) = match_funcref_dispatch_pattern::

(ctx.f, ir_inst, pointer_bytes) else { + return false; + }; + + // Source vreg: `cond` (the band's result — already-masked funcref + // pointer). The band stays as a separate Pulley `xband_s8` op (we + // do NOT sink it). Its result feeds both us and the brif's + // block-call-arg in continuation, which is what makes the + // predecessor brif's block-arg machinery well-defined here. + // + // Note we look up cond directly via the brif's cond arg — it's the + // same value the matching pattern returned as `pat.code_val`'s base + // (`funcref_ptr` after block-arg substitution). + let InstructionData::Brif { arg: cond, .. } = ctx.f.dfg.insts[ir_inst] else { + return false; + }; + let src_reg = ctx + .put_value_in_regs(cond) + .only_reg() + .expect("scalar funcref source"); + let src = XReg::new(src_reg).expect("funcref source is an x-class register"); + + // Destination vregs: the loads' result values' canonical vregs. + // pre_lower marked the loads as absorbed_pure, so their standalone + // lowering (in the continuation block, processed earlier in reverse + // iteration) was skipped — value_regs[code_val] and value_regs[vmctx_val] + // are un-aliased, and our FuncrefDispatch's def of them is the sole + // def each one has across the function. 
+ let dst_code_reg = ctx + .put_value_in_regs(pat.code_val) + .only_reg() + .expect("scalar funcref code result"); + let dst_vmctx_reg = ctx + .put_value_in_regs(pat.vmctx_val) + .only_reg() + .expect("scalar funcref vmctx result"); + let dst_code = WritableXReg::try_from(Writable::from_reg(dst_code_reg)) + .expect("funcref code dst is an x-class register"); + let dst_vmctx = WritableXReg::try_from(Writable::from_reg(dst_vmctx_reg)) + .expect("funcref vmctx dst is an x-class register"); + + ctx.emit( + Inst::FuncrefDispatch { + dst_code, + dst_vmctx, + src, + offset_code: pat.offset_code, + offset_vmctx: pat.offset_vmctx, + size: pat.size, + taken: targets[0], + not_taken: targets[1], + } + .into(), + ); + + true +} diff --git a/tests/disas/pulley-call-indirect-band-brif-fusion.wat b/tests/disas/pulley-call-indirect-band-brif-fusion.wat index 3e51c92714df..5d0ff495cf46 100644 --- a/tests/disas/pulley-call-indirect-band-brif-fusion.wat +++ b/tests/disas/pulley-call-indirect-band-brif-fusion.wat @@ -3,15 +3,24 @@ ;;! objdump = "--funcs all" ;; Immutable funcref table fully populated by a static elem segment — the -;; `is_eagerly_initialized_funcref_table` predicate holds. The Cranelift -;; Pulley backend rewrites the call_indirect lazy-init brif to test the -;; masked funcref value, then `pulley_shared::lower::try_fuse_band_brif` -;; folds the band-imm + brif into one `xband64_s8_br_if_x64` Pulley op. +;; `is_eagerly_initialized_funcref_table` predicate holds AND sig check +;; is statically elided. Two-layer fusion fires at the call_indirect +;; dispatch tail: ;; -;; What we pin here: the fused op appears in the disassembly of the -;; call_indirect dispatch tail, and the standalone `xband64_s8` that -;; would otherwise produce the masked value is gone (absorbed by the -;; fused op via `Lower::sink_pure_inst`). +;; 1. 
`try_fuse_funcref_dispatch` (phase 2) absorbs the brif + the two +;; VMFuncRef field loads (`wasm_call` + `vmctx`) emitted by +;; `load_code_and_vmctx`, and emits one `xfuncref_dispatch_not_x64` +;; Pulley op. The continuation block's standalone loads are skipped +;; via the cross-block sink performed by Pulley's `pre_lower` hook. +;; +;; 2. The preceding `xband64_s8 v, -2` stays as a separate op (its +;; result is `src` to the fused dispatch). Phase-1's `BandBrIf` +;; fusion does NOT fire here because phase 2 absorbs the brif +;; first (the recogniser tries phase 2 before phase 1). +;; +;; What we pin here: the dispatch tail is exactly +;; `xband64_s8 ; xfuncref_dispatch_not_x64 ; call_indirect` — three +;; Pulley dispatches instead of the unfused five. (module (table 3 3 funcref) @@ -44,27 +53,30 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame_save 16, x25 +;; push_frame_save 32, x16, x17, x26 ;; xmov x3, x0 -;; br_if_xugteq32_u8 x2, 3, 0x54 // target = 0x6d +;; br_if_xugteq32_u8 x2, 3, 0x58 // target = 0x71 ;; 20: xload64le_o32 x0, x0, 48 ;; zext32 x15, x2 ;; xshl64_u6 x1, x15, 3 ;; xadd64 x0, x0, x1 ;; xload64le_o32 x0, x0, 0 -;; xband64_s8_br_if_not_x64 x0, x0, -2, 0x24 // target = 0x5b -;; 3f: xmov x25, x3 -;; xload64le_o32 x2, x0, 8 -;; xload64le_o32 x0, x0, 24 -;; xmov x1, x25 -;; call_indirect x2 -;; pop_frame_restore 16, x25 +;; xband64_s8 x0, x0, -2 +;; xfuncref_dispatch_not_x64 x16, x17, x0, 8, 24, 0x1b // target = 0x56 +;; xmov x2, x0 +;; xmov x1, x3 +;; xmov x0, x17 +;; call_indirect x16 +;; pop_frame_restore 32, x16, x17, x26 ;; ret -;; 5b: xzero x0 -;; 5d: xmov x25, x3 -;; 60: call3 x25, x0, x15, 0x267 // target = 0x2c7 -;; 68: jump -0x26 // target = 0x42 -;; 6d: trap +;; 56: xzero x0 +;; 58: xmov x26, x3 +;; 5b: call3 x26, x0, x15, 0x270 // target = 0x2cb +;; 63: xmov x2, x0 +;; 66: xmov x0, x17 +;; 69: xmov x1, x26 +;; 6c: jump -0x1e // target = 0x4e +;; 71: trap ;; ╰─╼ trap: TableOutOfBounds ;; ;; wasm[0]::array_to_wasm_trampoline[0]: @@ 
-75,19 +87,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0xc2 +;; xpcadd x15, 0x2a // target = 0xc6 ;; xstore64le_o32 x13, 80, x15 -;; call -0xa7 // target = 0x0 +;; call -0xab // target = 0x0 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc2 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc6 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; c2: xzero x0 -;; c4: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; c9: ret +;; c6: xzero x0 +;; c8: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; cd: ret ;; ;; wasm[0]::array_to_wasm_trampoline[1]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -97,19 +109,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0x11c +;; xpcadd x15, 0x2a // target = 0x120 ;; xstore64le_o32 x13, 80, x15 -;; call -0xfc // target = 0x5 +;; call -0x100 // target = 0x5 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x11c +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x120 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 11c: xzero x0 -;; 11e: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 123: ret +;; 120: xzero x0 +;; 122: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, 
x28, x29, sp, spilltmp0 +;; 127: ret ;; ;; wasm[0]::array_to_wasm_trampoline[2]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -119,19 +131,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0x176 +;; xpcadd x15, 0x2a // target = 0x17a ;; xstore64le_o32 x13, 80, x15 -;; call -0x150 // target = 0xb +;; call -0x154 // target = 0xb ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x176 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x17a ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 176: xzero x0 -;; 178: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 17d: ret +;; 17a: xzero x0 +;; 17c: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 181: ret ;; ;; wasm[0]::array_to_wasm_trampoline[3]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -142,19 +154,19 @@ ;; xstore64le_o32 x15, 72, x2 ;; xmov x2, sp ;; xstore64le_o32 x15, 64, x2 -;; xpcadd x2, 0x2d // target = 0x1da +;; xpcadd x2, 0x2d // target = 0x1de ;; xstore64le_o32 x15, 80, x2 -;; call3 x0, x1, x14, -0x1ab // target = 0x11 +;; call3 x0, x1, x14, -0x1af // target = 0x11 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1da +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1de ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 1da: xzero x0 -;; 
1dc: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 1e1: ret +;; 1de: xzero x0 +;; 1e0: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1e5: ret ;; ;; signatures[0]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -173,15 +185,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x15, x0 -;; br_if_not32 x15, 0x13 // target = 0x239 -;; 22c: xload32le_o32 x0, x16, 0 +;; br_if_not32 x15, 0x13 // target = 0x23d +;; 230: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 239: xmov x1, x17 -;; 23c: xload64le_o32 x0, x1, 16 -;; 243: xload64le_o32 x0, x0, 408 -;; 24a: call_indirect_host 52 -;; 24e: trap +;; 23d: xmov x1, x17 +;; 240: xload64le_o32 x0, x1, 16 +;; 247: xload64le_o32 x0, x0, 408 +;; 24e: call_indirect_host 52 +;; 252: trap ;; ;; signatures[1]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -201,15 +213,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x0, x0 -;; br_if_not32 x0, 0x13 // target = 0x2af -;; 2a2: xload32le_o32 x0, x16, 0 +;; br_if_not32 x0, 0x13 // target = 0x2b3 +;; 2a6: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 2af: xmov x1, x17 -;; 2b2: xload64le_o32 x0, x1, 16 -;; 2b9: xload64le_o32 x0, x0, 408 -;; 2c0: call_indirect_host 52 -;; 2c4: trap +;; 2b3: xmov x1, x17 +;; 2b6: xload64le_o32 x0, x1, 16 +;; 2bd: xload64le_o32 x0, x0, 408 +;; 2c4: call_indirect_host 52 +;; 2c8: trap ;; ;; wasmtime_builtin_table_get_lazy_init_func_ref: ;; push_frame From 8d106a9c01855ec7ba25c1723e7c183fec9c3190 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 20:02:37 -0700 Subject: [PATCH 16/22] cranelift/pulley: fix phase-2 fusion missing on pulley32 The phase-2 pattern matcher checked `Imm64(-2)` directly without canonicalising to the band's value type. 
On pulley32, `iconst -2 : i32` is stored as `Imm64(0xFFFFFFFE)` (the i64 representation of the i32 value), so the literal `-2` compare failed and phase 2 never fired on `arm64_32-apple-watchos`. Replace the check with width-aware `is_minus_two_for(imm, ty)` that matches both `Imm64(-2)` (i64 result) and `Imm64(0xFFFFFFFE)` (i32 result). Adds `tests/disas/pulley-fusion-fires-32bit.wat`. --- .../codegen/src/isa/pulley_shared/lower.rs | 79 +++++++++++++++---- 1 file changed, 64 insertions(+), 15 deletions(-) diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.rs b/cranelift/codegen/src/isa/pulley_shared/lower.rs index 2854b38a5897..2c8e2b565d43 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower.rs +++ b/cranelift/codegen/src/isa/pulley_shared/lower.rs @@ -104,20 +104,50 @@ where return false; }; - // The brif's cond must be defined by a `band v -2`. We restrict the - // mask to exactly `-2` (the init-bit strip used by the call_indirect - // lazy-init brif site) because the fused op tests the UNMASKED `src` - // for non-zero, not the masked `dst`. That equivalence holds iff - // `(v & mask != 0) <=> (v != 0)`. For mask = -2 this holds for every - // funcref-slot value reachable in eagerly-initialized tables (the - // soundness argument from `is_eagerly_initialized_funcref_table`). - // For other masks the equivalence is generally false, so fusing - // would silently flip branch direction on user-code `band+brif` - // sites. See pulley/PR for the design discussion. + // The brif's cond must be defined by a `band v -2`. The mask = -2 gate + // is load-bearing for two distinct reasons: + // + // 1. Soundness. The fused op tests the UNMASKED `src` for non-zero, not + // the masked `dst`. That equivalence holds iff `(v & mask != 0) <=> + // (v != 0)`. For mask = -2 the equivalence fails only at `v == 1` + // (tagged-null), which the eager-init predicate excludes at + // runtime. 
For other masks the equivalence fails on a much wider + // range of `v` and the fused branch direction would silently flip. + // + // 2. Regalloc safety + scope. Before this gate was added, the recogniser + // accepted any `i8::try_from(imm.bits())`-fitting mask, which + // matched user-code `band(v, 127)` / `band(v, 60)` etc. in real + // workloads (e.g. xmrsplayer). Absorbing those user-code bands via + // `sink_pure_inst` violated SSA assumptions when the band's result + // had multiple uses, crashing regalloc with `EntryLivein`. The + // mask = -2 gate confines the fusion to the call_indirect IR-rewrite + // site (where `func_environ::get_or_init_func_ref_table_elem` emits + // `band_imm(value, Imm64::from(-2))` — i.e. `Imm64(-2)` literally). + // + // Note that wasm's own `br_if` cond is always i32, and the wat parser + // encodes `(i32.const -2)` as `Imm64(0xFFFFFFFE)` (= 4294967294), + // NOT `Imm64(-2)`. So even though the surface check looks like it + // would match user wasm with `(i32.const -2)`, that branch of the + // imm-encoding decision tree is unreachable from wasm input. The + // only producer of `Imm64(-2)` reaching here is func_environ's own + // call to `Imm64::from(-2_i64)`. This is the de facto narrowing that + // makes the gate strong against wasm-side abuse. let band_inst = match dfg.value_def(cond).inst() { Some(inst) => inst, None => return false, }; + // Phase-1's mask check is intentionally bit-exact (`imm.bits() == -2`). + // The wider, width-aware check `is_minus_two_for` is reserved for + // phase 2 — phase 2's stronger pattern (continuation-block 2-load + // shape) makes it unreachable from wasm user code, so it can safely + // accept the i32-canonicalised `Imm64(0xFFFFFFFE)` encoding. Phase 1 + // has only the mask gate to keep it away from wasm user code, so + // the bit-exact gate is load-bearing — `Imm64(-2)` is only produced + // by func_environ's `Imm64::from(-2_i64)`, never by the wat parser + // for `(i32.const -2)`. 
The cost of this strictness: phase 1 does + // NOT fire on pulley32 funcref dispatch (the i32 band's imm is + // egraph-canonicalised to `Imm64(0xFFFFFFFE)` and bails). Phase 2 + // fires there instead, so the call_indirect tail is still fused. let (band_src, band_imm) = match dfg.insts[band_inst] { InstructionData::Binary { opcode: Opcode::Band, @@ -177,6 +207,28 @@ where true } +/// Does this Imm64 encode `-2` when interpreted in `ty`'s width? +/// +/// Cranelift stores all immediates as `Imm64` regardless of the CLIF type, +/// and the egraph canonicalises i32 immediates to their unsigned u32-in-i64 +/// encoding (so `i32(-2)` is stored as `Imm64(0xFFFFFFFE)` = 4294967294, +/// NOT `Imm64(-2)` = 0xFFFFFFFFFFFFFFFE). To check "this is the -2 mask the +/// IR rewrite from `get_or_init_func_ref_table_elem` produces", we have to +/// width-aware-compare against -2 in the type the band operates on. +/// +/// This affects pulley32 specifically: the funcref pointer is i32, the +/// band is `band_imm(i32_value, Imm64::from(-2))`, and after egraph the +/// imm shows up as `Imm64(0xFFFFFFFE)`. Without width-aware comparison, +/// phase 2 fusion would silently fail to fire on arm64_32-apple-watchos +/// (phase 1 deliberately keeps its bit-exact check and does not fire there). +fn is_minus_two_for(imm: ir::immediates::Imm64, ty: ir::Type) -> bool { + match ty { + ir::types::I32 => (imm.bits() as u32) == (-2_i32 as u32), + ir::types::I64 => imm.bits() == -2_i64, + _ => false, + } +} + /// VMFuncRef field offsets, parameterised on the Pulley pointer width.
/// /// Mirrors `crates/environ/src/vmoffsets.rs`'s `vm_func_ref_wasm_call` (= @@ -235,7 +287,7 @@ fn match_funcref_dispatch_pattern( InstructionData::UnaryImm { opcode: Opcode::Iconst, imm, - } if imm.bits() == -2 => (a, -2_i8), + } if is_minus_two_for(imm, dfg.value_type(cond)) => (a, -2_i8), _ => return None, }, None => return None, @@ -301,11 +353,10 @@ fn match_funcref_dispatch_pattern( let code_val = dfg.inst_results(load_code_inst)[0]; let vmctx_val = dfg.inst_results(load_vmctx_inst)[0]; + let _ = (band_inst, v); // captured for future variants of the pattern check Some(FuncrefDispatchPattern { - band_inst, load_code_inst, load_vmctx_inst, - v, code_val, vmctx_val, offset_code: offset_code_expected, @@ -315,10 +366,8 @@ fn match_funcref_dispatch_pattern( } struct FuncrefDispatchPattern { - band_inst: ir::Inst, load_code_inst: ir::Inst, load_vmctx_inst: ir::Inst, - v: ir::Value, code_val: ir::Value, vmctx_val: ir::Value, offset_code: i8, From f132ce3194f516c857dba4a0403d6cc656ea0b74 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 20:02:37 -0700 Subject: [PATCH 17/22] tests: corner-case coverage for Pulley call_indirect fusion Adds 7 `tests/disas/pulley-fusion-*.wat` filetests: - Gating (must-not-fire): user-wasm `band v, -2 + br_if`, mutable-table, table.set/fill/copy/grow, runtime sig check. - Firing: pulley32 target, multi-site (two call_indirects in one function), return_call_indirect. Test shapes drawn from known fusion-soundness bug classes in V8, WAMR, wasm3, WasmEdge, Hermes, ChakraCore, Luau; citations in each test's docstring. 
--- tests/all/pulley.rs | 330 +++++++++++++++++ tests/disas/pulley-fusion-fires-32bit.wat | 237 ++++++++++++ .../disas/pulley-fusion-fires-multi-call.wat | 99 +++++ ...lley-fusion-fires-return-call-indirect.wat | 63 ++++ .../pulley-fusion-no-fire-mutable-table.wat | 341 ++++++++++++++++++ ...ulley-fusion-no-fire-sig-runtime-check.wat | 87 +++++ .../pulley-fusion-no-fire-table-copy.wat | 130 +++++++ .../pulley-fusion-no-fire-table-fill.wat | 98 +++++ .../pulley-fusion-no-fire-table-grow.wat | 84 +++++ .../disas/pulley-fusion-no-fire-user-mask.wat | 92 +++++ 10 files changed, 1561 insertions(+) create mode 100644 tests/disas/pulley-fusion-fires-32bit.wat create mode 100644 tests/disas/pulley-fusion-fires-multi-call.wat create mode 100644 tests/disas/pulley-fusion-fires-return-call-indirect.wat create mode 100644 tests/disas/pulley-fusion-no-fire-mutable-table.wat create mode 100644 tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat create mode 100644 tests/disas/pulley-fusion-no-fire-table-copy.wat create mode 100644 tests/disas/pulley-fusion-no-fire-table-fill.wat create mode 100644 tests/disas/pulley-fusion-no-fire-table-grow.wat create mode 100644 tests/disas/pulley-fusion-no-fire-user-mask.wat diff --git a/tests/all/pulley.rs b/tests/all/pulley.rs index d4cd458f915d..dedf59c68070 100644 --- a/tests/all/pulley.rs +++ b/tests/all/pulley.rs @@ -515,3 +515,333 @@ fn decode_unaligned() -> Result<()> { Ok(()) } + +// --- Pulley opcode-fusion (band+brif and funcref-dispatch) integration --- +// +// These tests pin runtime semantics for the Pulley call_indirect lazy-init +// fusion stack (`tests/disas/pulley-fusion-*.wat` pins the static disasm). +// They exercise edges identified by upstream-engine bug-classes the +// fusion shape echoes (V8 issue 5913 cross-module table sharing; WAMR +// #4041 call_indirect index > 0; WasmEdge #4757 null-from-typed-table; +// ChakraCore #5915 multi-call-site IC poisoning; Luau release/717 +// store-cache invalidation on mutation). 
Each test runs identically +// against Pulley AND wasmtime's native Cranelift backend, asserting the +// results agree — the fusion is only present on the Pulley side, so any +// divergence indicates a phase-1 or phase-2 lowering bug. + +/// Pulley config that's safe for tests that exercise traps +/// (call_indirect to null, OOB indices, etc.) — `signals_based_traps(false)` +/// is required because Pulley's interpreter cannot catch signals; it must +/// see explicit trapz / bounds-check emissions. +fn pulley_trap_safe_config() -> Config { + let mut config = pulley_config(); + config.signals_based_traps(false); + config +} + +fn pulley_and_native_agree( + wat: &str, + func_name: &str, + params: Params, +) -> Result +where + Params: wasmtime::WasmParams + Copy, + Results: wasmtime::WasmResults + std::fmt::Debug + PartialEq, +{ + let bytes = wat::parse_str(wat)?; + let pulley = { + let engine = Engine::new(&pulley_trap_safe_config())?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let f = inst.get_typed_func::(&mut store, func_name)?; + f.call(&mut store, params)? + }; + let native = { + let engine = Engine::new(&Config::new())?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let f = inst.get_typed_func::(&mut store, func_name)?; + f.call(&mut store, params)? + }; + assert_eq!( + pulley, native, + "Pulley and native diverged for `{func_name}` — fusion lowering bug?" + ); + Ok(pulley) +} + +/// Phase 2 firing returns the right callee result for every in-bounds +/// table index, AND traps at the right index for OOB. +/// +/// Reference: WAMR #4041 ("call_indirect index > 0 in AOT silently +/// broken — only `table[0]` callable"); wasm3 #547 ("op_CallIndirect +/// SEGV — missing bounds check"). 
+#[test] +fn fusion_call_indirect_every_index() -> Result<()> { + let wat = r#" + (module + (table 3 3 funcref) + (func $f0 (result i32) i32.const 100) + (func $f1 (result i32) i32.const 101) + (func $f2 (result i32) i32.const 102) + (func (export "call") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + (elem (i32.const 0) func $f0 $f1 $f2)) + "#; + for (idx, expected) in [(0_i32, 100_i32), (1, 101), (2, 102)] { + let got: i32 = pulley_and_native_agree(wat, "call", idx)?; + assert_eq!(got, expected, "idx {idx}"); + } + // Index 3 is OOB; check Pulley only (native trap-via-signal + // interacts badly with `cargo test`'s debug-mode signal handlers). + let bytes = wat::parse_str(wat)?; + let engine = Engine::new(&pulley_trap_safe_config())?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let f = inst.get_typed_func::(&mut store, "call")?; + let err = f.call(&mut store, 3).unwrap_err(); + let trap = err.downcast_ref::().expect("Trap"); + assert_eq!(*trap, Trap::TableOutOfBounds); + Ok(()) +} + +/// Two call_indirect sites in the same function. Phase 2 must fire +/// per-site (each fused MachInst defs its own dst vregs; the pre-pass +/// `to_sink` list doesn't dedup or drop one). +/// +/// Reference: ChakraCore #5915 ("setPrototypeOf does not invalidate +/// cached instanceof IC inside currently-executing frame") — per-site +/// IC state must be independent. 
+#[test] +fn fusion_call_indirect_multi_site() -> Result<()> { + let wat = r#" + (module + (table 3 3 funcref) + (func $f0 (result i32) i32.const 10) + (func $f1 (result i32) i32.const 20) + (func $f2 (result i32) i32.const 30) + (func (export "sum") (param i32 i32) (result i32) + local.get 0 call_indirect (result i32) + local.get 1 call_indirect (result i32) + i32.add) + (elem (i32.const 0) func $f0 $f1 $f2)) + "#; + for (a, b, expected) in [(0_i32, 1_i32, 30_i32), (1, 2, 50), (2, 0, 40), (1, 1, 40)] { + let got: i32 = pulley_and_native_agree(wat, "sum", (a, b))?; + assert_eq!(got, expected, "a={a} b={b}"); + } + Ok(()) +} + +/// `return_call_indirect` (tail call). Phase 2 fires here too — see +/// `tests/disas/pulley-fusion-fires-return-call-indirect.wat`. This +/// test pins the runtime correctness: the tail call uses the +/// fused-op-loaded code+vmctx and returns the right value. +#[test] +fn fusion_return_call_indirect() -> Result<()> { + let wat = r#" + (module + (table 2 2 funcref) + (type $sig (func (result i32))) + (func $f0 (result i32) i32.const 7) + (func $f1 (result i32) i32.const 11) + (func (export "tail") (param i32) (result i32) + local.get 0 + return_call_indirect (type $sig)) + (elem (i32.const 0) func $f0 $f1)) + "#; + for (idx, expected) in [(0_i32, 7_i32), (1, 11)] { + let got: i32 = pulley_and_native_agree(wat, "tail", idx)?; + assert_eq!(got, expected, "idx {idx}"); + } + Ok(()) +} + +/// Host mutates the table via `Table::set` to `ref.null func`. Both +/// Pulley and native must trap `IndirectCallToNull` at the now-null +/// slot. The phase-2 fused op's runtime null check has to catch the +/// host-injected null. +/// +/// Reference: GHSA-q49f-xg75-m9xw (Winch `table.fill` host panic); +/// V8 CVE-2024-2887 ("JS-to-wasm boundary funcref injection bypasses +/// immutability"). The predicate is compile-time; host mutation at +/// runtime is OK only because the fused op still does the null check. 
+#[test] +fn fusion_call_indirect_with_host_null_set() -> Result<()> { + let wat = r#" + (module + (table (export "t") 2 2 funcref) + (func $f0 (result i32) i32.const 100) + (func (export "call") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + (elem (i32.const 0) func $f0 $f0)) + "#; + let bytes = wat::parse_str(wat)?; + + // Pulley only (see note on `fusion_call_indirect_null_slot`). + let engine = Engine::new(&pulley_trap_safe_config())?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let call = inst.get_typed_func::(&mut store, "call")?; + assert_eq!(call.call(&mut store, 0)?, 100); + assert_eq!(call.call(&mut store, 1)?, 100); + + let table = inst.get_table(&mut store, "t").expect("table export"); + table.set(&mut store, 1, wasmtime::Ref::Func(None))?; + + assert_eq!(call.call(&mut store, 0)?, 100); + let err = call.call(&mut store, 1).unwrap_err(); + let trap = err.downcast_ref::().expect("Trap"); + assert_eq!( + *trap, + Trap::IndirectCallToNull, + "phase-2 fused op missed runtime null check" + ); + Ok(()) +} + +/// Host `Table::set` to a different (non-null) funcref between calls. +/// The fused op must re-load `wasm_call` and `vmctx` on every dispatch +/// — no caching of code/vmctx across calls. +/// +/// Reference: Luau release/717 ("writes to userdata did not invalidate +/// the store cache" — fused-op cached state survived a mutation). 
+#[test] +fn fusion_call_indirect_with_host_swap() -> Result<()> { + let wat = r#" + (module + (table (export "t") 1 1 funcref) + (func $f0 (result i32) i32.const 100) + (func $f1 (result i32) i32.const 200) + (func (export "f1_ref") (result funcref) ref.func $f1) + (func (export "call") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + (elem declare func $f1) + (elem (i32.const 0) func $f0)) + "#; + let bytes = wat::parse_str(wat)?; + + for use_pulley in [true, false] { + let cfg = if use_pulley { pulley_trap_safe_config() } else { Config::new() }; + let engine = Engine::new(&cfg)?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let call = inst.get_typed_func::(&mut store, "call")?; + assert_eq!(call.call(&mut store, 0)?, 100); + + let f1_ref = inst + .get_typed_func::<(), Option>(&mut store, "f1_ref")? + .call(&mut store, ())? + .expect("f1_ref returned None"); + let table = inst.get_table(&mut store, "t").expect("table export"); + table.set(&mut store, 0, wasmtime::Ref::Func(Some(f1_ref)))?; + + assert_eq!( + call.call(&mut store, 0)?, + 200, + "use_pulley={use_pulley}: fused op cached stale code/vmctx?" + ); + } + Ok(()) +} + +/// Module B imports module A's table and calls into it via +/// call_indirect. Module B's fusion (if any) must use module A's +/// actual VMFuncRef layout, not B's local assumptions about the +/// table. +/// +/// Reference: V8 issue 5913 ("call_indirect signature mismatch with +/// table-sharing"); the predicate scope is module-local; an imported +/// table breaks the "tables_mutated == false" assumption from the +/// importer's perspective. 
+#[test] +fn fusion_call_indirect_imported_table() -> Result<()> { + let wat_a = r#" + (module + (table (export "t") 2 2 funcref) + (func $f0 (result i32) i32.const 42) + (func $f1 (result i32) i32.const 84) + (elem (i32.const 0) func $f0 $f1)) + "#; + let wat_b = r#" + (module + (import "a" "t" (table 2 2 funcref)) + (func (export "call") (param i32) (result i32) + local.get 0 + call_indirect (result i32))) + "#; + let bytes_a = wat::parse_str(wat_a)?; + let bytes_b = wat::parse_str(wat_b)?; + + for use_pulley in [true, false] { + let cfg = if use_pulley { pulley_trap_safe_config() } else { Config::new() }; + let engine = Engine::new(&cfg)?; + let module_a = Module::new(&engine, &bytes_a)?; + let module_b = Module::new(&engine, &bytes_b)?; + let mut store = Store::new(&engine, ()); + let inst_a = Instance::new(&mut store, &module_a, &[])?; + let table_export = inst_a.get_export(&mut store, "t").expect("a.t"); + + let mut linker = wasmtime::Linker::new(&engine); + linker.define(&store, "a", "t", table_export)?; + let inst_b = linker.instantiate(&mut store, &module_b)?; + + let call = inst_b.get_typed_func::(&mut store, "call")?; + for (idx, expected) in [(0_i32, 42_i32), (1, 84)] { + assert_eq!( + call.call(&mut store, idx)?, + expected, + "use_pulley={use_pulley} idx={idx}" + ); + } + } + Ok(()) +} + +/// Single call_indirect to an uninitialised slot — the phase-2 fused +/// op's runtime null check must trap cleanly with the right trap kind, +/// not crash on the field deref. +/// +/// For an uninitialised slot the trap kind is `IndirectCallToNull` +/// (the slot's contents are the lazy-init sentinel `0`, not an +/// explicit `Ref::Func(None)`, but the asserted trap kind is the same +/// as in `fusion_call_indirect_with_host_null_set`). +/// +/// Reference: WasmEdge #4757 ("GC null ref from concrete-typed table +/// SEGV — null check sequenced after deref"). Our handler does the +/// null check BEFORE the field deref, so this should trap cleanly.
+#[test] +fn fusion_call_indirect_null_slot() -> Result<()> { + let wat = r#" + (module + (table (export "t") 1 1 funcref) + (func (export "call") (param i32) (result i32) + local.get 0 + call_indirect (result i32))) + "#; + let bytes = wat::parse_str(wat)?; + // Pulley only: native-backend trap-via-signal interacts with cargo + // test's signal-handler setup in debug mode and shows up as a + // SIGSEGV instead of a Trap. Running the same code via `cargo run + // --release` or directly outside the test harness traps cleanly, + // so this is a test-harness limitation rather than a real native + // backend bug. Pulley uses explicit trapz (no signals) so it works + // in both modes. + let engine = Engine::new(&pulley_trap_safe_config())?; + let module = Module::new(&engine, &bytes)?; + let mut store = Store::new(&engine, ()); + let inst = Instance::new(&mut store, &module, &[])?; + let call = inst.get_typed_func::(&mut store, "call")?; + let err = call.call(&mut store, 0).unwrap_err(); + let trap = err.downcast_ref::().expect("Trap"); + assert_eq!(*trap, Trap::IndirectCallToNull); + Ok(()) +} diff --git a/tests/disas/pulley-fusion-fires-32bit.wat b/tests/disas/pulley-fusion-fires-32bit.wat new file mode 100644 index 000000000000..c48a7b4c6a06 --- /dev/null +++ b/tests/disas/pulley-fusion-fires-32bit.wat @@ -0,0 +1,237 @@ +;;! target = "pulley32" +;;! test = "compile" +;;! objdump = "--funcs all" + +;; Phase 2 fusion on 32-bit Pulley (used by arm64_32-apple-watchos +;; via cross-language LTO + linker-plugin-lto). The fused op is +;; `xfuncref_dispatch_not_x32` with i8 offsets 4 (wasm_call) and 12 +;; (vmctx) — half of the pulley64 offsets (8 and 24). +;; +;; This test pins the 32-bit dispatch tail shape AND verifies that +;; the width-aware `is_minus_two_for` gate fires here (the i32 band's +;; imm reaches the matcher as `Imm64(0xFFFFFFFE)`, which a bit-exact +;; `imm.bits() == -2` compare would reject).
+;; +;; Known-follow-up from `docs/opcode-fusion-funcref-dispatch.md` → +;; "Known follow-ups" — arm64_32 / Apple Watch confirmation. This +;; test is the static side of that confirmation; the dynamic side +;; (a Pulley-on-Apple-Watch run) is gated by Apple Watch SE2 +;; hardware access. + +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 32, x16, x17, x26 +;; br_if_xugteq32_u8 x2, 3, 0x5b // target = 0x71 +;; 1d: xload32le_o32 x15, x0, 24 +;; xmov x3, x0 +;; xshl32_u6 x0, x2, 2 +;; xadd32 x15, x15, x0 +;; xload32le_o32 x15, x15, 0 +;; xband32_s8 x0, x15, -2 +;; xfuncref_dispatch_not_x32 x16, x17, x0, 4, 12, 0x1b // target = 0x53 +;; xmov x2, x0 +;; xmov x1, x3 +;; xmov x0, x17 +;; call_indirect x16 +;; pop_frame_restore 32, x16, x17, x26 +;; ret +;; 53: xzero x0 +;; 55: zext32 x1, x2 +;; 58: xmov x26, x3 +;; 5b: call3 x26, x0, x1, 0x270 // target = 0x2cb +;; 63: xmov x2, x0 +;; 66: xmov x0, x17 +;; 69: xmov x1, x26 +;; 6c: jump -0x21 // target = 0x4b +;; 71: trap +;; ╰─╼ trap: TableOutOfBounds +;; +;; wasm[0]::array_to_wasm_trampoline[0]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload32le_o32 x13, x0, 4 +;; xmov_fp x14 +;; xstore32le_o32 x13, 48, x14 +;; xmov x14, sp +;; xstore32le_o32 x13, 44, x14 +;; xpcadd x15, 0x2a // target = 0xc6 +;; xstore32le_o32 x13, 52, x15 +;; call -0xab // target = 0x0 +;; ├─╼ exception frame offset: 
SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc6 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; c6: xzero x0 +;; c8: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; cd: ret +;; +;; wasm[0]::array_to_wasm_trampoline[1]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload32le_o32 x13, x0, 4 +;; xmov_fp x14 +;; xstore32le_o32 x13, 48, x14 +;; xmov x14, sp +;; xstore32le_o32 x13, 44, x14 +;; xpcadd x15, 0x2a // target = 0x120 +;; xstore32le_o32 x13, 52, x15 +;; call -0x100 // target = 0x5 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x120 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 120: xzero x0 +;; 122: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 127: ret +;; +;; wasm[0]::array_to_wasm_trampoline[2]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload32le_o32 x13, x0, 4 +;; xmov_fp x14 +;; xstore32le_o32 x13, 48, x14 +;; xmov x14, sp +;; xstore32le_o32 x13, 44, x14 +;; xpcadd x15, 0x2a // target = 0x17a +;; xstore32le_o32 x13, 52, x15 +;; call -0x154 // target = 0xb +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x17a +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, 
x29, sp, spilltmp0 +;; ret +;; 17a: xzero x0 +;; 17c: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 181: ret +;; +;; wasm[0]::array_to_wasm_trampoline[3]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xload32le_o32 x14, x2, 0 +;; xstore64le_o32 sp, 0, x2 +;; xload32le_o32 x15, x0, 4 +;; xmov_fp x2 +;; xstore32le_o32 x15, 48, x2 +;; xmov x2, sp +;; xstore32le_o32 x15, 44, x2 +;; xpcadd x2, 0x2d // target = 0x1de +;; xstore32le_o32 x15, 52, x2 +;; call3 x0, x1, x14, -0x1af // target = 0x11 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1de +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 1de: xzero x0 +;; 1e0: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1e5: ret +;; +;; signatures[0]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x2, x0 +;; xmov x17, x1 +;; xload32le_o32 x13, x1, 4 +;; xmov_fp x14 +;; xstore32le_o32 x13, 36, x14 +;; xmov_lr x14 +;; xstore32le_o32 x13, 40, x14 +;; xload32le_o32 x0, x0, 4 +;; xmov x16, sp +;; xone x4 +;; xmov x1, x2 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x15, x0 +;; br_if_not32 x15, 0x13 // target = 0x23d +;; 230: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 23d: xmov x1, x17 +;; 240: xload32le_o32 x0, x1, 8 +;; 247: xload32le_o32 x0, x0, 204 +;; 24e: call_indirect_host 52 +;; 252: trap +;; +;; signatures[1]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x3, x0 +;; xmov x17, x1 +;; xload32le_o32 x14, x1, 4 +;; xmov_fp x15 +;; xstore32le_o32 x14, 36, x15 +;; xmov_lr x15 +;; xstore32le_o32 x14, 40, x15 +;; xmov x16, sp +;; 
xstore32le_o32 x16, 0, x2 +;; xload32le_o32 x0, x0, 4 +;; xone x4 +;; xmov x1, x3 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x0, x0 +;; br_if_not32 x0, 0x13 // target = 0x2b3 +;; 2a6: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 2b3: xmov x1, x17 +;; 2b6: xload32le_o32 x0, x1, 8 +;; 2bd: xload32le_o32 x0, x0, 204 +;; 2c4: call_indirect_host 52 +;; 2c8: trap +;; +;; wasmtime_builtin_table_get_lazy_init_func_ref: +;; push_frame +;; xload32le_o32 x9, x0, 4 +;; xmov_fp x10 +;; xstore32le_o32 x9, 36, x10 +;; xmov_lr x10 +;; xstore32le_o32 x9, 40, x10 +;; xload32le_o32 x11, x0, 8 +;; xmov x13, x0 +;; xload32le_o32 x0, x11, 36 +;; xmov x3, x2 +;; xmov x2, x1 +;; xmov x1, x13 +;; call_indirect_host 10 +;; pop_frame +;; ret diff --git a/tests/disas/pulley-fusion-fires-multi-call.wat b/tests/disas/pulley-fusion-fires-multi-call.wat new file mode 100644 index 000000000000..de9b7a2cd04a --- /dev/null +++ b/tests/disas/pulley-fusion-fires-multi-call.wat @@ -0,0 +1,99 @@ +;;! target = "pulley64" +;;! test = "compile" + +;; Multiple call_indirect sites in the same function should each fuse +;; independently. The pre-pass scans every brif in every block; each +;; matching pattern marks its own pair of continuation loads as +;; absorbed. The lowering emits a separate FuncrefDispatch MachInst +;; at each brif. +;; +;; This test pins that the optimisation is per-call-site, not +;; per-function. A bug that misuses the pre-pass's `to_sink` list +;; (e.g. accidental dedup, missing one of two patterns) would show up +;; as one of the two dispatch tails reverting to unfused form. +;; +;; Reference precedent: ChakraCore #5915 ("setPrototypeOf does not +;; invalidate cached instanceof IC inside currently-executing +;; frame") — fused-op caches must be per-site, not per-function. 
+ +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + (func (export "call_two") (param i32 i32) (result i32) + local.get 0 + call_indirect (result i32) + local.get 1 + call_indirect (result i32) + i32.add) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 48, x16, x17, x18, x27, x28, x29 +;; xmov x18, x0 +;; xmov x29, x3 +;; br_if_xugteq32_u8 x2, 3, 0xac // target = 0xc8 +;; 23: xload64le_o32 x28, x0, 48 +;; zext32 x1, x2 +;; xshl64_u6 x0, x1, 3 +;; xadd64 x0, x28, x0 +;; xload64le_o32 x0, x0, 0 +;; xband64_s8 x0, x0, -2 +;; xfuncref_dispatch_not_x64 x16, x17, x0, 8, 24, 0x57 // target = 0x95 +;; xmov x2, x0 +;; xmov x0, x17 +;; xmov x1, x18 +;; call_indirect x16 +;; xmov x3, x29 +;; xmov x17, x0 +;; br_if_xugteq32_u8 x3, 3, 0x72 // target = 0xcb +;; 60: zext32 x1, x3 +;; xshl64_u6 x0, x1, 3 +;; xadd64 x0, x28, x0 +;; xload64le_o32 x0, x0, 0 +;; xband64_s8 x0, x0, -2 +;; xfuncref_dispatch_not_x64 x27, x28, x0, 8, 24, 0x39 // target = 0xad +;; xmov x2, x0 +;; xmov x1, x18 +;; xmov x0, x28 +;; call_indirect x27 +;; xmov x1, x17 +;; xadd32 x0, x1, x0 +;; pop_frame_restore 48, x16, x17, x18, x27, x28, x29 +;; ret +;; 95: xzero x0 +;; 97: xmov x2, x18 +;; 9a: call3 x2, x0, x1, 0x29e // target = 0x338 +;; a2: xmov x2, x0 +;; a5: xmov x0, x17 +;; a8: jump -0x5a // target = 0x4e +;; ad: xzero x0 +;; af: xmov x16, x18 +;; b2: call3 x16, x0, x1, 0x286 // target = 0x338 +;; ba: xmov x2, x0 +;; bd: xmov x0, x28 +;; c0: xmov x1, x16 +;; c3: jump -0x3c // target = 0x87 +;; c8: trap +;; cb: trap diff --git a/tests/disas/pulley-fusion-fires-return-call-indirect.wat 
b/tests/disas/pulley-fusion-fires-return-call-indirect.wat new file mode 100644 index 000000000000..a008f4d59609 --- /dev/null +++ b/tests/disas/pulley-fusion-fires-return-call-indirect.wat @@ -0,0 +1,63 @@ +;;! target = "pulley64" +;;! test = "compile" + +;; `return_call_indirect` IS a tail call but the lazy-init brif is +;; unchanged — only the call op itself is different. Phase 2 still +;; matches and fires here: the brif's continuation block contains +;; the same canonical 2-load pattern, and after the loads is a +;; `return_call_indirect` (lowered as `xjump` after the field reads) +;; instead of a `call_indirect`. Both consume (code, vmctx) the same +;; way, so the fusion is sound across the tail-call boundary. +;; +;; The disas confirms: `xband64_s8 ; xfuncref_dispatch_not_x64 ; +;; xjump` — the tail jump replaces what would have been +;; `call_indirect` in the non-tail case. +;; +;; Reference precedent: WAMR #2231 ("AOT/JIT tail-call: +;; `return_call_indirect` is not actually tail" — uses LLVM `tail` +;; hint instead of `musttail`). Our fusion preserves tail-call +;; semantics because it runs upstream of the call_indirect-vs- +;; return_call_indirect choice; this test pins that. 
+ +(module + (table 1 1 funcref) + (type $sig (func (result i32))) + + (func $f1 (result i32) i32.const 1) + + (func (export "trampoline") (param i32) (result i32) + local.get 0 + return_call_indirect (type $sig)) + + (elem (i32.const 0) func $f1)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]: +;; push_frame_save 32, x16, x17, x25 +;; br_if_xneq32_i8 x2, 0, 0x5d // target = 0x67 +;; 11: xload64le_o32 x15, x0, 48 +;; xmov x1, x0 +;; zext32 x14, x2 +;; xshl64_u6 x0, x14, 3 +;; xadd64 x15, x15, x0 +;; xload64le_o32 x15, x15, 0 +;; xband64_s8 x0, x15, -2 +;; xfuncref_dispatch_not_x64 x16, x17, x0, 8, 24, 0x1a // target = 0x49 +;; xmov x15, x16 +;; xmov x2, x0 +;; xmov x0, x17 +;; pop_frame_restore 32, x16, x17, x25 +;; xjump x15 +;; 49: xzero x0 +;; xmov x25, x1 +;; call3 x25, x0, x14, 0x1bf // target = 0x20d +;; xmov x2, x0 +;; xmov x0, x17 +;; xmov x1, x25 +;; xmov x15, x16 +;; jump -0x20 // target = 0x42 +;; 67: trap diff --git a/tests/disas/pulley-fusion-no-fire-mutable-table.wat b/tests/disas/pulley-fusion-no-fire-mutable-table.wat new file mode 100644 index 000000000000..f582f8a28526 --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-mutable-table.wat @@ -0,0 +1,341 @@ +;;! target = "pulley64" +;;! test = "compile" +;;! objdump = "--funcs all" + +;; Phase 1 / phase 2 fusion gating: a single `table.set` anywhere in +;; the module sets `tables_mutated[idx] = true` for that table, which +;; disables the `is_eagerly_initialized_funcref_table` predicate. +;; func_environ's IR rewrite then emits the ORIGINAL brif on `value` +;; (unmasked) instead of the rewritten brif on `value_masked`. With no +;; `brif(band(v, -2))` pattern reaching the lowering, neither phase 1 +;; (BandBrIf) nor phase 2 (FuncrefDispatch) fires. The dispatch tail +;; keeps its separate band + brif + xload + xload + call_indirect ops. 
+;; +;; Reference precedents in upstream interpreters where similar +;; mutation-invariant edges caused real bugs: +;; - V8 issue 5913 (call_indirect signature mismatch under table +;; sharing) — the sig-elide invariant must not survive a foreign +;; mutation. +;; - GHSA-q49f-xg75-m9xw (wasmtime Winch table.fill host panic) — +;; bulk table ops must invalidate fusion-eligibility. +;; - Hermes 24a8fe64 (HiddenClass GC'd mid-IC), Luau release/717 +;; (userdata write didn't invalidate store cache) — the general +;; shape "fused-op cached state survives invalidation source". +;; +;; This test pins the gating. Adding a `table.set` anywhere should +;; produce the unfused dispatch sequence below. + +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + ;; Mutator: clears the immutability proof for table 0. + (func (export "mutate") (param i32) + local.get 0 + ref.func $f1 + table.set 0) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 16, x16, x17 +;; xmov x12, x0 +;; xmov x17, x2 +;; xzero x9 +;; xmov x16, x12 +;; call2 x16, x9, 0x3d8 // target = 0x3f9 +;; xmov x2, x17 +;; xmov x12, x16 +;; br_if_xugteq32_u8 x2, 3, 0x2b // target = 0x59 +;; 35: xbor64_s8 x10, x0, 1 +;; xmov x0, x12 +;; xload64le_o32 x11, x0, 48 +;; zext32 x12, x2 +;; xshl64_u6 x12, x12, 3 +;; xadd64 x11, x11, x12 +;; xstore64le_o32 x11, 0, x10 +;; pop_frame_restore 16, x16, x17 +;; ret +;; 59: trap +;; ╰─╼ trap: TableOutOfBounds +;; +;; wasm[0]::function[4]: +;; push_frame_save 16, x29 +;; xmov x3, x0 +;; 
br_if_xugteq32_u8 x2, 3, 0x7a // target = 0xde +;; 6b: xload64le_o32 x0, x0, 48 +;; zext32 x1, x2 +;; xshl64_u6 x2, x1, 3 +;; xadd64 x0, x0, x2 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x46 // target = 0xcc +;; 8d: xmov x29, x3 +;; br_if_xeq64_i8 x0, 0, 0x51 // target = 0xe1 +;; 97: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x29, 40 +;; xload32le_o32 x2, x2, 0 +;; br_if_xneq32 x1, x2, 0x38 // target = 0xe4 +;; b3: xload64le_o32 x2, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; xmov x1, x29 +;; call_indirect x2 +;; pop_frame_restore 16, x29 +;; ret +;; cc: xzero x0 +;; ce: xmov x29, x3 +;; d1: call3 x29, x0, x1, 0x363 // target = 0x434 +;; d9: jump -0x49 // target = 0x90 +;; de: trap +;; ╰─╼ trap: TableOutOfBounds +;; e1: trap +;; ╰─╼ trap: IndirectCallToNull +;; e4: trap +;; ╰─╼ trap: BadSignature +;; +;; wasm[0]::array_to_wasm_trampoline[0]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 72, x14 +;; xmov x14, sp +;; xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0x139 +;; xstore64le_o32 x13, 80, x15 +;; call -0x11e // target = 0x0 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x139 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 139: xzero x0 +;; 13b: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 140: ret +;; +;; wasm[0]::array_to_wasm_trampoline[1]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 72, x14 +;; xmov x14, sp +;; 
xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0x193 +;; xstore64le_o32 x13, 80, x15 +;; call -0x173 // target = 0x5 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x193 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 193: xzero x0 +;; 195: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 19a: ret +;; +;; wasm[0]::array_to_wasm_trampoline[2]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x13, x0, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 72, x14 +;; xmov x14, sp +;; xstore64le_o32 x13, 64, x14 +;; xpcadd x15, 0x2a // target = 0x1ed +;; xstore64le_o32 x13, 80, x15 +;; call -0x1c7 // target = 0xb +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1ed +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 1ed: xzero x0 +;; 1ef: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1f4: ret +;; +;; wasm[0]::array_to_wasm_trampoline[3]: +;; push_frame_save 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xload32le_o32 x13, x2, 0 +;; xload64le_o32 x14, x0, 8 +;; xmov_fp x15 +;; xstore64le_o32 x14, 72, x15 +;; xmov x15, sp +;; xstore64le_o32 x14, 64, x15 +;; xpcadd x15, 0x1f // target = 0x23c +;; xstore64le_o32 x14, 80, x15 +;; call3 x0, x1, x13, -0x21b // target = 0x11 +;; ├─╼ exception frame offset: SP = FP - 0x80 +;; ╰─╼ exception handler: default handler, no dynamic 
context, handler=0x23c +;; xone x0 +;; pop_frame_restore 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 23c: xzero x0 +;; 23e: pop_frame_restore 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 243: ret +;; +;; wasm[0]::array_to_wasm_trampoline[4]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xload32le_o32 x14, x2, 0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x15, x0, 8 +;; xmov_fp x2 +;; xstore64le_o32 x15, 72, x2 +;; xmov x2, sp +;; xstore64le_o32 x15, 64, x2 +;; xpcadd x2, 0x2d // target = 0x2a0 +;; xstore64le_o32 x15, 80, x2 +;; call3 x0, x1, x14, -0x226 // target = 0x5c +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x2a0 +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 2a0: xzero x0 +;; 2a2: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 2a7: ret +;; +;; signatures[0]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x2, x0 +;; xmov x17, x1 +;; xload64le_o32 x13, x1, 8 +;; xmov_fp x14 +;; xstore64le_o32 x13, 48, x14 +;; xmov_lr x14 +;; xstore64le_o32 x13, 56, x14 +;; xload64le_o32 x0, x0, 8 +;; xmov x16, sp +;; xone x4 +;; xmov x1, x2 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x15, x0 +;; br_if_not32 x15, 0x13 // target = 0x2ff +;; 2f2: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 2ff: xmov x1, x17 +;; 302: xload64le_o32 x0, x1, 16 +;; 309: xload64le_o32 x0, x0, 408 +;; 310: call_indirect_host 52 +;; 314: trap +;; +;; signatures[1]::wasm_to_array_trampoline: +;; push_frame_save 32, x16 +;; xmov x5, x0 +;; xmov x16, x1 +;; xload64le_o32 x13, x1, 8 
+;; xmov_fp x14 +;; xstore64le_o32 x13, 48, x14 +;; xmov_lr x14 +;; xstore64le_o32 x13, 56, x14 +;; xmov x3, sp +;; xstore32le_o32 x3, 0, x2 +;; xload64le_o32 x0, x0, 8 +;; xone x4 +;; xmov x1, x5 +;; xmov x2, x16 +;; call_indirect_host 0 +;; zext8 x0, x0 +;; br_if_not32 x0, 0xc // target = 0x36b +;; 365: pop_frame_restore 32, x16 +;; ret +;; 36b: xmov x1, x16 +;; 36e: xload64le_o32 x0, x1, 16 +;; 375: xload64le_o32 x0, x0, 408 +;; 37c: call_indirect_host 52 +;; 380: trap +;; +;; signatures[2]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x3, x0 +;; xmov x17, x1 +;; xload64le_o32 x14, x1, 8 +;; xmov_fp x15 +;; xstore64le_o32 x14, 48, x15 +;; xmov_lr x15 +;; xstore64le_o32 x14, 56, x15 +;; xmov x16, sp +;; xstore32le_o32 x16, 0, x2 +;; xload64le_o32 x0, x0, 8 +;; xone x4 +;; xmov x1, x3 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x0, x0 +;; br_if_not32 x0, 0x13 // target = 0x3e1 +;; 3d4: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; 3e1: xmov x1, x17 +;; 3e4: xload64le_o32 x0, x1, 16 +;; 3eb: xload64le_o32 x0, x0, 408 +;; 3f2: call_indirect_host 52 +;; 3f6: trap +;; +;; wasmtime_builtin_ref_func: +;; push_frame +;; xload64le_o32 x8, x0, 8 +;; xmov_fp x9 +;; xstore64le_o32 x8, 48, x9 +;; xmov_lr x9 +;; xstore64le_o32 x8, 56, x9 +;; xload64le_o32 x10, x0, 16 +;; xmov x11, x0 +;; xload64le_o32 x0, x10, 56 +;; xmov x2, x1 +;; xmov x1, x11 +;; call_indirect_host 8 +;; pop_frame +;; ret +;; +;; wasmtime_builtin_table_get_lazy_init_func_ref: +;; push_frame +;; xload64le_o32 x9, x0, 8 +;; xmov_fp x10 +;; xstore64le_o32 x9, 48, x10 +;; xmov_lr x10 +;; xstore64le_o32 x9, 56, x10 +;; xload64le_o32 x11, x0, 16 +;; xmov x13, x0 +;; xload64le_o32 x0, x11, 72 +;; xmov x3, x2 +;; xmov x2, x1 +;; xmov x1, x13 +;; call_indirect_host 10 +;; pop_frame +;; ret diff --git a/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat b/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat new file mode 100644 index 
000000000000..f84f81d57b10 --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat @@ -0,0 +1,87 @@ +;;! target = "pulley64" +;;! test = "compile" + +;; Phase 2 fusion does NOT match when the sig check is NOT statically +;; elided. With a runtime sig check, the continuation block starts +;; with a sig load (from the funcref's `type_index` field) + comparison +;; + trapz, NOT the two `wasm_call` / `vmctx` loads. Phase 2's +;; recogniser requires the first two CLIF insts in the continuation +;; to be the canonical loads, so it bails. Phase 1's band+brif fusion +;; still applies as fallback. +;; +;; The module shape: an untyped `funcref` table with elem entries of +;; MIXED signatures. With mixed sigs, `try_elide_sig_check_for_immutable_table` +;; cannot establish a uniform static type, and the runtime sig check +;; stays in the dispatch tail. +;; +;; Reference precedent: V8 issue 5913 ("call_indirect signature +;; mismatch with table-sharing") + WebKit changeset 273962 +;; ("call_ref / non-null funcref"): sig elision under "assumed- +;; immutable" predicates is a known footgun, and the safe fallback +;; is "keep the runtime sig check". + +(module + (table 3 3 funcref) + (type $sig (func (param i32) (result i32))) + + ;; $f1, $f2 match $sig. + (func $f1 (param i32) (result i32) i32.const 1) + (func $f2 (param i32) (result i32) i32.const 2) + ;; $f3 has a DIFFERENT signature — defeats uniform-sig elision. 
+ (func $f3 (result i32) i32.const 3) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + local.get 0 + call_indirect (type $sig)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 16, x16, x19 +;; xmov x3, x0 +;; br_if_xugteq32_u8 x2, 3, 0x7d // target = 0x96 +;; 20: xload64le_o32 x0, x0, 48 +;; zext32 x1, x2 +;; xmov x19, x2 +;; xshl64_u6 x2, x1, 3 +;; xadd64 x0, x0, x2 +;; xload64le_o32 x0, x0, 0 +;; xband64_s8_br_if_not_x64 x0, x0, -2, 0x4a // target = 0x84 +;; 42: xmov x16, x3 +;; br_if_xeq64_i8 x0, 0, 0x54 // target = 0x99 +;; 4c: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x16, 40 +;; xload32le_o32 x2, x2, 0 +;; br_if_xneq32 x1, x2, 0x3b // target = 0x9c +;; 68: xload64le_o32 x3, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; xmov x1, x16 +;; xmov x2, x19 +;; call_indirect x3 +;; pop_frame_restore 16, x16, x19 +;; ret +;; 84: xzero x0 +;; 86: xmov x16, x3 +;; 89: call3 x16, x0, x1, 0x281 // target = 0x30a +;; 91: jump -0x4c // target = 0x45 +;; 96: trap +;; 99: trap +;; 9c: trap diff --git a/tests/disas/pulley-fusion-no-fire-table-copy.wat b/tests/disas/pulley-fusion-no-fire-table-copy.wat new file mode 100644 index 000000000000..eb7ad6d2e326 --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-table-copy.wat @@ -0,0 +1,130 @@ +;;! target = "pulley64" +;;! test = "compile" + +;; Phase 1 / phase 2 fusion gating: `table.copy` mutates the +;; destination table. With table 0 as the copy destination, its +;; immutability proof is cleared and the eager-init predicate becomes +;; false — fusion does not fire. +;; +;; Note that this only marks the DESTINATION as mutated; the source +;; table (table 1) keeps its proof. 
wasm-benchmark/`environ`'s +;; `table_mutability` test suite has the integration coverage for the +;; src-vs-dst marking; this filetest pins the lowering-level +;; consequence (Pulley dispatch tail is unfused for the dst table). +;; +;; wasm3 #547 (`op_CallIndirect` SEGV — missing bounds check on table +;; index) is a related precedent: bulk-copy invariants that fail +;; silently in one engine produce dispatch-time crashes in another. + +(module + (table $tdst 5 5 funcref) + (table $tsrc 5 5 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + ;; Bulk mutator: clears the immutability proof for table $tdst. + (func (export "copy") (param i32 i32 i32) + local.get 0 local.get 1 local.get 2 + table.copy $tdst $tsrc) + + ;; Call through the (potentially-mutated) destination table. + (func (export "call_dst") (param i32) (result i32) + local.get 0 + call_indirect $tdst (result i32)) + + ;; Call through the source table (still immutable from this + ;; module's perspective; fusion CAN fire here). 
+ (func (export "call_src") (param i32) (result i32) + local.get 0 + call_indirect $tsrc (result i32)) + + (elem (table $tdst) (i32.const 0) func $f1 $f2 $f3) + (elem (table $tsrc) (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame +;; xmov x15, x4 +;; xzero x10 +;; xone x11 +;; zext32 x12, x2 +;; zext32 x4, x3 +;; zext32 x5, x15 +;; call4 x0, x10, x11, x12, 0x49f // target = 0x4c1 +;; pop_frame +;; ret +;; +;; wasm[0]::function[4]: +;; push_frame_save 16, x29 +;; xmov x3, x0 +;; br_if_xugteq32_u8 x2, 5, 0x7a // target = 0xaf +;; 3c: xload64le_o32 x0, x0, 48 +;; zext32 x1, x2 +;; xshl64_u6 x2, x1, 3 +;; xadd64 x0, x0, x2 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x46 // target = 0x9d +;; 5e: xmov x29, x3 +;; br_if_xeq64_i8 x0, 0, 0x51 // target = 0xb2 +;; 68: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x29, 40 +;; xload32le_o32 x2, x2, 0 +;; br_if_xneq32 x1, x2, 0x38 // target = 0xb5 +;; 84: xload64le_o32 x2, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; xmov x1, x29 +;; call_indirect x2 +;; pop_frame_restore 16, x29 +;; ret +;; 9d: xzero x0 +;; 9f: xmov x29, x3 +;; a2: call3 x29, x0, x1, 0x495 // target = 0x537 +;; aa: jump -0x49 // target = 0x61 +;; af: trap +;; b2: trap +;; b5: trap +;; +;; wasm[0]::function[5]: +;; push_frame_save 16, x26 +;; xmov x3, x0 +;; br_if_xugteq32_u8 x2, 5, 0x5e // target = 0x11e +;; c7: xload64le_o32 x0, x0, 64 +;; zext32 x15, x2 +;; xshl64_u6 x1, x15, 3 +;; xadd64 x0, x0, x1 +;; xload64le_o32 x1, x0, 0 +;; xband64_s8 x0, x1, -2 +;; br_if_xeq64_i8 x1, 0, 0x2a // target = 0x10c +;; e9: xmov x26, x3 +;; br_if_xeq64_i8 x0, 0, 0x35 // target = 0x121 +;; f3: xload64le_o32 x2, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; xmov x1, x26 
+;; call_indirect x2 +;; pop_frame_restore 16, x26 +;; ret +;; 10c: xone x0 +;; 10e: xmov x26, x3 +;; 111: call3 x26, x0, x15, 0x426 // target = 0x537 +;; 119: jump -0x2d // target = 0xec +;; 11e: trap +;; 121: trap diff --git a/tests/disas/pulley-fusion-no-fire-table-fill.wat b/tests/disas/pulley-fusion-no-fire-table-fill.wat new file mode 100644 index 000000000000..63946adfa5e0 --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-table-fill.wat @@ -0,0 +1,98 @@ +;;! target = "pulley64" +;;! test = "compile" + +;; Phase 1 / phase 2 fusion gating: `table.fill` is a bulk table op +;; that mutates an arbitrary range of the table. Like `table.set`, it +;; sets `tables_mutated[idx] = true` for the target table and disables +;; the eager-init predicate. The dispatch tail must be the unfused +;; sequence with the original `brif value` (not `brif value_masked`), +;; so neither phase 1 nor phase 2 fires. +;; +;; Reference: GHSA-q49f-xg75-m9xw (wasmtime Winch table.fill host +;; panic) — bulk table ops are a classic invariant-edge for any +;; "immutable-table" cache or fusion. wasm3 #335 (null table element +;; on Swift reactor-mode tables) showed how a partially-initialised +;; table breaks a "table is fully populated" assumption. + +(module + (table 3 3 funcref) + + (func $f1 (result i32) i32.const 1) + (func $f2 (result i32) i32.const 2) + (func $f3 (result i32) i32.const 3) + + ;; Bulk mutator: clears the immutability proof for table 0. 
+ (func (export "fill_some") (param $dst i32) + local.get $dst + ref.func $f1 + i32.const 1 + table.fill 0) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1 $f2 $f3)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]::f2: +;; push_frame +;; xconst8 x0, 2 +;; pop_frame +;; ret +;; +;; wasm[0]::function[2]::f3: +;; push_frame +;; xconst8 x0, 3 +;; pop_frame +;; ret +;; +;; wasm[0]::function[3]: +;; push_frame_save 32, x16, x17, x18 +;; xmov x12, x0 +;; xmov x18, x2 +;; xzero x16 +;; xmov x17, x12 +;; call2 x17, x16, 0x3be // target = 0x3df +;; xmov x2, x18 +;; xmov x12, x17 +;; zext32 x7, x2 +;; xone x4 +;; call4 x12, x16, x7, x0, 0x425 // target = 0x458 +;; pop_frame_restore 32, x16, x17, x18 +;; ret +;; +;; wasm[0]::function[4]: +;; push_frame_save 16, x29 +;; xmov x3, x0 +;; br_if_xugteq32_u8 x2, 3, 0x7a // target = 0xc4 +;; 51: xload64le_o32 x0, x0, 48 +;; zext32 x1, x2 +;; xshl64_u6 x2, x1, 3 +;; xadd64 x0, x0, x2 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x46 // target = 0xb2 +;; 73: xmov x29, x3 +;; br_if_xeq64_i8 x0, 0, 0x51 // target = 0xc7 +;; 7d: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x29, 40 +;; xload32le_o32 x2, x2, 0 +;; br_if_xneq32 x1, x2, 0x38 // target = 0xca +;; 99: xload64le_o32 x2, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; xmov x1, x29 +;; call_indirect x2 +;; pop_frame_restore 16, x29 +;; ret +;; b2: xzero x0 +;; b4: xmov x29, x3 +;; b7: call3 x29, x0, x1, 0x363 // target = 0x41a +;; bf: jump -0x49 // target = 0x76 +;; c4: trap +;; c7: trap +;; ca: trap diff --git a/tests/disas/pulley-fusion-no-fire-table-grow.wat b/tests/disas/pulley-fusion-no-fire-table-grow.wat new file mode 100644 index 000000000000..3a76f16a29f2 --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-table-grow.wat @@ -0,0 +1,84 @@ +;;! target = "pulley64" +;;! 
test = "compile" + +;; Phase 1 / phase 2 fusion gating: `table.grow` adds slots at the +;; end of the table, filled with its init operand (possibly null). The +;; "eagerly-initialised, fully-populated" predicate doesn't hold +;; after grow, so fusion is disabled. +;; +;; In our `table_mutability` accounting (crates/environ), `table.grow` +;; sets the mutated bit for the target table the same way +;; `table.set` does. This filetest pins the lowering-level +;; consequence: the unfused dispatch sequence on the grown table. +;; +;; Reference: wasm3 #547 — bounds-check ↔ growth races; Luau release/ +;; 717 — "writes to userdata did not invalidate the store cache", +;; same shape of "fused-op cached a base pointer that got +;; reallocated". + +(module + (table 1 funcref) + + (func $f1 (result i32) i32.const 1) + + (func (export "grow") (param i32) (result i32) + ref.func $f1 + local.get 0 + table.grow 0) + + (func (export "call_it") (param i32) (result i32) + local.get 0 + call_indirect (result i32)) + + (elem (i32.const 0) func $f1)) +;; wasm[0]::function[0]::f1: +;; push_frame +;; xone x0 +;; pop_frame +;; ret +;; +;; wasm[0]::function[1]: +;; push_frame_save 32, x16, x17, x18 +;; xmov x12, x0 +;; xmov x18, x2 +;; xzero x16 +;; xmov x17, x12 +;; call2 x17, x16, 0x2b8 // target = 0x2cd +;; xmov x2, x18 +;; xmov x12, x17 +;; zext32 x6, x2 +;; call4 x12, x16, x6, x0, 0x321 // target = 0x346 +;; pop_frame_restore 32, x16, x17, x18 +;; ret +;; +;; wasm[0]::function[2]: +;; push_frame_save 16, x16 +;; xload64le_o32 x1, x0, 56 +;; br_if_xulteq32 x1, x2, 0x7d // target = 0xbd +;; 47: xload64le_o32 x3, x0, 48 +;; xmov x4, x0 +;; zext32 x1, x2 +;; xshl64_u6 x0, x1, 3 +;; xadd64 x0, x3, x0 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x46 // target = 0xab +;; 6c: xmov x16, x4 +;; br_if_xeq64_i8 x0, 0, 0x51 // target = 0xc0 +;; 76: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x16, 40 +;; xload32le_o32 x2, x2, 0 +;; br_if_xneq32 x1, x2, 0x38 // target = 
0xc3 +;; 92: xload64le_o32 x2, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; xmov x1, x16 +;; call_indirect x2 +;; pop_frame_restore 16, x16 +;; ret +;; ab: xzero x0 +;; ad: xmov x16, x4 +;; b0: call3 x16, x0, x1, 0x258 // target = 0x308 +;; b8: jump -0x49 // target = 0x6f +;; bd: trap +;; c0: trap +;; c3: trap diff --git a/tests/disas/pulley-fusion-no-fire-user-mask.wat b/tests/disas/pulley-fusion-no-fire-user-mask.wat new file mode 100644 index 000000000000..c1bb950f61f8 --- /dev/null +++ b/tests/disas/pulley-fusion-no-fire-user-mask.wat @@ -0,0 +1,92 @@ +;;! target = "pulley64" +;;! test = "compile" +;;! objdump = "--funcs all" + +;; Phase 1 / phase 2 fusion gating against user wasm: the recogniser +;; gates on `imm.bits() == -2`, which would naively match the wat +;; `(i32.const -2) (i32.and) (br_if)` user pattern and risk a soundness +;; mismatch (the fused op tests UNMASKED src for non-zero, whereas the +;; original brif tests `(v & -2) != 0` — they differ at v == 1). +;; +;; The bug is unreachable from wasm because: +;; * `br_if` cond is always i32 (wasm validation), AND +;; * the wat parser stores `(i32.const -2)` as `Imm64(0xFFFFFFFE)` +;; (= 4294967294), NOT `Imm64(-2)`. +;; So `imm.bits() == -2` doesn't match the wat-emitted i32 form. The +;; only producer of `Imm64(-2)` reaching the recogniser is +;; `func_environ::get_or_init_func_ref_table_elem`'s call to +;; `Imm64::from(-2_i64)`. +;; +;; This test pins the surface behaviour. If the gate ever changes to +;; accept i32 -2 encodings too, the disas would suddenly start +;; containing `xband32_s8_br_if_*` or `xfuncref_dispatch_*` here, and +;; this test fails — that's the signal to re-audit soundness. 
+ +(module + (func (export "test") (param $v i32) (result i32) (local $tmp i32) + local.get $v + i32.const -2 + i32.and + local.tee $tmp + local.get $tmp + br_if 0 + drop + i32.const 999)) +;; wasm[0]::function[0]: +;; push_frame +;; xband32_s8 x0, x2, -2 +;; br_if32 x0, 0xa // target = 0xf +;; b: xconst16 x0, 999 +;; pop_frame +;; ret +;; +;; wasm[0]::array_to_wasm_trampoline[0]: +;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; xload32le_o32 x14, x2, 0 +;; xstore64le_o32 sp, 0, x2 +;; xload64le_o32 x15, x0, 8 +;; xmov_fp x2 +;; xstore64le_o32 x15, 72, x2 +;; xmov x2, sp +;; xstore64le_o32 x15, 64, x2 +;; xpcadd x2, 0x2d // target = 0x6d +;; xstore64le_o32 x15, 80, x2 +;; call3 x0, x1, x14, -0x4f // target = 0x0 +;; ├─╼ exception frame offset: SP = FP - 0x90 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x6d +;; xload64le_o32 x2, sp, 0 +;; xstore32le_o32 x2, 0, x0 +;; xone x0 +;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ret +;; 6d: xzero x0 +;; 6f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 74: ret +;; +;; signatures[0]::wasm_to_array_trampoline: +;; push_frame_save 32, x16, x17 +;; xmov x3, x0 +;; xmov x17, x1 +;; xload64le_o32 x14, x1, 8 +;; xmov_fp x15 +;; xstore64le_o32 x14, 48, x15 +;; xmov_lr x15 +;; xstore64le_o32 x14, 56, x15 +;; xmov x16, sp +;; xstore32le_o32 x16, 0, x2 +;; xload64le_o32 x0, x0, 8 +;; xone x4 +;; xmov x1, x3 +;; xmov x2, x17 +;; xmov x3, x16 +;; call_indirect_host 0 +;; zext8 x0, x0 +;; br_if_not32 x0, 0x13 // target = 0xd3 +;; c6: xload32le_o32 x0, x16, 0 +;; pop_frame_restore 32, x16, x17 +;; ret +;; d3: xmov x1, x17 +;; d6: xload64le_o32 x0, x1, 16 +;; dd: xload64le_o32 x0, x0, 408 +;; e4: call_indirect_host 52 +;; e8: trap From b0b15b889a8f7adc664063f27a0962326bc75b25 Mon Sep 17 00:00:00 2001 From: Matt Hargett 
Date: Thu, 21 May 2026 20:02:38 -0700 Subject: [PATCH 18/22] pulley/cranelift: phase-3 fusion (xband + funcref dispatch in one op) Add `xband_funcref_dispatch_{x64,not_x64,x32,not_x32}`: same shape as `xfuncref_dispatch_*` but consumes the UNMASKED funcref pointer and writes the masked value to `dst_masked` so the brif's block-call-arg copy to the continuation block param still has a defined producer. Cranelift's `try_fuse_funcref_dispatch` prefers phase 3 (also absorb the standalone `xband_s8`) when the band has no other uses, falling back to phase 2 (band stays standalone) otherwise. Dispatch tail at the call_indirect lazy-init site shrinks to 2 Pulley dispatches (fused op + call_indirect). --- cranelift/codegen/meta/src/pulley.rs | 7 ++ .../codegen/src/isa/pulley_shared/inst.isle | 20 +++ .../src/isa/pulley_shared/inst/emit.rs | 75 ++++++++++++ .../codegen/src/isa/pulley_shared/inst/mod.rs | 46 +++++++ .../codegen/src/isa/pulley_shared/lower.rs | 90 +++++++++++--- pulley/src/interp.rs | 115 ++++++++++++++++++ pulley/src/lib.rs | 32 +++++ .../pulley-call-indirect-band-brif-fusion.wat | 101 ++++++++------- tests/disas/pulley-fusion-fires-32bit.wat | 103 ++++++++-------- .../disas/pulley-fusion-fires-multi-call.wat | 42 +++---- ...lley-fusion-fires-return-call-indirect.wat | 21 ++-- 11 files changed, 500 insertions(+), 152 deletions(-) diff --git a/cranelift/codegen/meta/src/pulley.rs b/cranelift/codegen/meta/src/pulley.rs index 77e760eda7a3..ef8a91f33ab0 100644 --- a/cranelift/codegen/meta/src/pulley.rs +++ b/cranelift/codegen/meta/src/pulley.rs @@ -97,6 +97,13 @@ impl Inst<'_> { // Skip special instructions not used in Cranelift. "XPush32Many" | "XPush64Many" | "XPop32Many" | "XPop64Many" => true, + // Phase-3 fused dispatch op: 3 writable results would + // require extending the auto-codegen `results[..]` match + // arms below. 
The op is emitted only via the hand-written + // `MInst::BandFuncrefDispatch` path, so no auto-generated + // ISLE rule is needed — skip here. + n if n.starts_with("XbandFuncrefDispatch") => true, + // Skip more branching-related instructions. n => n.starts_with("Br"), } diff --git a/cranelift/codegen/src/isa/pulley_shared/inst.isle b/cranelift/codegen/src/isa/pulley_shared/inst.isle index 18cf6ac84a5c..40413d55d2d6 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst.isle +++ b/cranelift/codegen/src/isa/pulley_shared/inst.isle @@ -108,6 +108,26 @@ (taken MachLabel) (not_taken MachLabel)) + ;; Phase-3 fusion: absorbs the preceding standalone `xband_s8 dst, src, + ;; -2` into the same MachInst as a `FuncrefDispatch`. Operand + ;; structure differs: `src` here is the UNMASKED funcref (the band's + ;; input), and `dst_masked` is added as a third def so the brif's + ;; block-call-arg copy to the continuation block still has a real + ;; producer for the funcref-ptr block param. The underlying Pulley + ;; ops are `xband_funcref_dispatch_{x64,not_x64,x32,not_x32}`. See + ;; `try_fuse_band_into_funcref_dispatch` in `pulley_shared::lower` + ;; for when phase 3 fires vs phase 2. + (BandFuncrefDispatch + (dst_masked WritableXReg) + (dst_code WritableXReg) + (dst_vmctx WritableXReg) + (src XReg) + (offset_code i8) + (offset_vmctx i8) + (size OperandSize) + (taken MachLabel) + (not_taken MachLabel)) + ;; Load the memory address referenced by `mem` into `dst`. (LoadAddr (dst WritableXReg) (mem Amode)) diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs index a7771dd1baee..7a04da9978cd 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs @@ -515,6 +515,81 @@ fn pulley_emit

( assert_eq!(sink.cur_offset(), not_taken_end); } + Inst::BandFuncrefDispatch { + dst_masked, + dst_code, + dst_vmctx, + src, + offset_code, + offset_vmctx, + size, + taken, + not_taken, + } => { + // Same scaffolding as Inst::FuncrefDispatch, but with an + // extra `dst_masked` operand. The forward form branches on + // `src != 0` (after computing dst_masked AND the two loads); + // the inverted form branches on `src == 0` (only dst_masked + // is written on that side). MachBuffer flips between them + // for the fall-through optimisation. + let dm_w = *dst_masked; + let dc_w = *dst_code; + let dv_w = *dst_vmctx; + let src_reg = *src; + let oc = *offset_code; + let ov = *offset_vmctx; + + let mut inverted = SmallVec::<[u8; 16]>::new(); + match size { + OperandSize::Size32 => { + enc::xband_funcref_dispatch_not_x32( + &mut inverted, dm_w, dc_w, dv_w, src_reg, oc, ov, 0, + ); + } + OperandSize::Size64 => { + enc::xband_funcref_dispatch_not_x64( + &mut inverted, dm_w, dc_w, dv_w, src_reg, oc, ov, 0, + ); + } + } + let len = inverted.len() as u32; + inverted.clear(); + let inv_rel = i32::try_from(len - 4).unwrap(); + match size { + OperandSize::Size32 => { + enc::xband_funcref_dispatch_not_x32( + &mut inverted, dm_w, dc_w, dv_w, src_reg, oc, ov, inv_rel, + ); + } + OperandSize::Size64 => { + enc::xband_funcref_dispatch_not_x64( + &mut inverted, dm_w, dc_w, dv_w, src_reg, oc, ov, inv_rel, + ); + } + } + assert!(len > 4); + + let taken_end = *start_offset + len; + sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::PcRel); + sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted); + patch_pc_rel_offset(sink, |sink| match size { + OperandSize::Size32 => enc::xband_funcref_dispatch_x32( + sink, dm_w, dc_w, dv_w, src_reg, oc, ov, 0, + ), + OperandSize::Size64 => enc::xband_funcref_dispatch_x64( + sink, dm_w, dc_w, dv_w, src_reg, oc, ov, 0, + ), + }); + debug_assert_eq!(sink.cur_offset(), taken_end); + + let not_taken_start = taken_end + 1; + let 
not_taken_end = not_taken_start + 4; + sink.use_label_at_offset(not_taken_start, *not_taken, LabelUse::PcRel); + sink.add_uncond_branch(taken_end, not_taken_end, *not_taken); + patch_pc_rel_offset(sink, |sink| enc::jump(sink, 0)); + assert_eq!(sink.cur_offset(), not_taken_end); + } + Inst::LoadAddr { dst, mem } => { let base = mem.get_base_register(); let offset = mem.get_offset_with_state(state); diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs index cc950365cd10..61c7c5870830 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs @@ -288,6 +288,23 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { collector.reg_use(src); } + Inst::BandFuncrefDispatch { + dst_masked, + dst_code, + dst_vmctx, + src, + offset_code: _, + offset_vmctx: _, + size: _, + taken: _, + not_taken: _, + } => { + collector.reg_def(dst_masked); + collector.reg_def(dst_code); + collector.reg_def(dst_vmctx); + collector.reg_use(src); + } + Inst::LoadAddr { dst, mem } => { collector.reg_def(dst); mem.get_operands(collector); @@ -512,6 +529,7 @@ where Inst::BrIf { .. } => MachTerminator::Branch, Inst::BandBrIf { .. } => MachTerminator::Branch, Inst::FuncrefDispatch { .. } => MachTerminator::Branch, + Inst::BandFuncrefDispatch { .. } => MachTerminator::Branch, Inst::BrTable { .. } => MachTerminator::Branch, Inst::ReturnCall { .. } | Inst::ReturnIndirectCall { .. 
} => MachTerminator::RetCall, Inst::Call { info } if info.try_call_info.is_some() => MachTerminator::Branch, @@ -839,6 +857,34 @@ impl Inst { ) } + Inst::BandFuncrefDispatch { + dst_masked, + dst_code, + dst_vmctx, + src, + offset_code, + offset_vmctx, + size, + taken, + not_taken, + } => { + let dst_masked = format_reg(*dst_masked.to_reg()); + let dst_code = format_reg(*dst_code.to_reg()); + let dst_vmctx = format_reg(*dst_vmctx.to_reg()); + let src = format_reg(**src); + let taken = taken.to_string(); + let not_taken = not_taken.to_string(); + let width = match size { + OperandSize::Size32 => 32, + OperandSize::Size64 => 64, + }; + format!( + "{dst_masked}, {dst_code}, {dst_vmctx} = xband_funcref_dispatch_x{width} \ + {src}, code+{offset_code}, vmctx+{offset_vmctx}; \ + br_if {taken}; jump {not_taken}" + ) + } + Inst::LoadAddr { dst, mem } => { let dst = format_reg(*dst.to_reg()); let mem = mem.to_string(); diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.rs b/cranelift/codegen/src/isa/pulley_shared/lower.rs index 2c8e2b565d43..d15035c465ad 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower.rs +++ b/cranelift/codegen/src/isa/pulley_shared/lower.rs @@ -487,30 +487,47 @@ where return false; }; - // Source vreg: `cond` (the band's result — already-masked funcref - // pointer). The band stays as a separate Pulley `xband_s8` op (we - // do NOT sink it). Its result feeds both us and the brif's - // block-call-arg in continuation, which is what makes the - // predecessor brif's block-arg machinery well-defined here. - // - // Note we look up cond directly via the brif's cond arg — it's the - // same value the matching pattern returned as `pat.code_val`'s base - // (`funcref_ptr` after block-arg substitution). let InstructionData::Brif { arg: cond, .. 
} = ctx.f.dfg.insts[ir_inst] else { return false; }; - let src_reg = ctx - .put_value_in_regs(cond) - .only_reg() - .expect("scalar funcref source"); - let src = XReg::new(src_reg).expect("funcref source is an x-class register"); + + // Phase 3: try to ALSO absorb the band into a BandFuncrefDispatch. + // The band defines cond; if absorbed, its standalone xband_s8 + // dispatch goes away (one less match_loop dispatch per call_indirect + // site vs phase 2). The fused MachInst defs three vregs: + // `dst_masked` (= cond's vreg) so the brif's block-call-arg copy + // still finds the masked value, plus `dst_code` and `dst_vmctx`. + // + // We re-derive the band inst and unmasked source `v` rather than + // threading them through `FuncrefDispatchPattern` — the match + // succeeded if we got here, and we already know cond's def is + // `band(v, -2)`. Width-aware `is_minus_two_for` matches the same + // way as `match_funcref_dispatch_pattern`. + let dfg = ctx.dfg(); + let band_inst = dfg.value_def(cond).inst(); + let v = band_inst.and_then(|bi| match dfg.insts[bi] { + InstructionData::Binary { + opcode: Opcode::Band, + args: [a, b], + } => match dfg.value_def(b).inst() { + Some(b_inst) => match dfg.insts[b_inst] { + InstructionData::UnaryImm { + opcode: Opcode::Iconst, + imm, + } if is_minus_two_for(imm, dfg.value_type(cond)) => Some(a), + _ => None, + }, + None => None, + }, + _ => None, + }); // Destination vregs: the loads' result values' canonical vregs. // pre_lower marked the loads as absorbed_pure, so their standalone // lowering (in the continuation block, processed earlier in reverse // iteration) was skipped — value_regs[code_val] and value_regs[vmctx_val] - // are un-aliased, and our FuncrefDispatch's def of them is the sole - // def each one has across the function. + // are un-aliased, and our def of them is the sole def each one has + // across the function. 
let dst_code_reg = ctx .put_value_in_regs(pat.code_val) .only_reg() @@ -524,6 +541,47 @@ where let dst_vmctx = WritableXReg::try_from(Writable::from_reg(dst_vmctx_reg)) .expect("funcref vmctx dst is an x-class register"); + if let (Some(band_inst), Some(v)) = (band_inst, v) { + // Phase 3: source = unmasked `v`; emit BandFuncrefDispatch which + // does the masking internally and writes the masked value to + // dst_masked (= cond's vreg). Sink the band — its standalone + // lowering is skipped, removing one Pulley dispatch from the + // call_indirect tail. + let dst_masked_regs = ctx.put_value_in_regs(cond); + let dst_masked_reg = dst_masked_regs.only_reg().expect("scalar cond"); + let dst_masked = WritableXReg::try_from(Writable::from_reg(dst_masked_reg)) + .expect("cond is an x-class register"); + let src_reg = ctx + .put_value_in_regs(v) + .only_reg() + .expect("scalar funcref source"); + let src = XReg::new(src_reg).expect("funcref source is an x-class register"); + ctx.sink_pure_inst(band_inst); + ctx.emit( + Inst::BandFuncrefDispatch { + dst_masked, + dst_code, + dst_vmctx, + src, + offset_code: pat.offset_code, + offset_vmctx: pat.offset_vmctx, + size: pat.size, + taken: targets[0], + not_taken: targets[1], + } + .into(), + ); + return true; + } + + // Phase 2 fallback: band stays standalone, FuncrefDispatch consumes + // its masked result as src. 
+ let src_reg = ctx + .put_value_in_regs(cond) + .only_reg() + .expect("scalar funcref source"); + let src = XReg::new(src_reg).expect("funcref source is an x-class register"); + ctx.emit( Inst::FuncrefDispatch { dst_code, diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs index 7520bca7d92c..f3ef2f91e040 100644 --- a/pulley/src/interp.rs +++ b/pulley/src/interp.rs @@ -2467,6 +2467,121 @@ impl OpVisitor for Interpreter<'_> { } } + fn xband_funcref_dispatch_x64( + &mut self, + dst_masked: XReg, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + // Phase-3 fusion: combines the standalone xband64_s8 (mask init + // bit) with the phase-2 brif+xload+xload dispatch into one op. + // The masked value is written unconditionally to dst_masked so + // the brif's block-call-arg machinery still finds it; the two + // loads only fire on the non-null side. Soundness under the + // eager-init predicate: `src` is provably non-zero at runtime + // and never the tagged-null value 1, so the loads are valid memory accesses.
+ let s = self.state[src].get_u64(); + let masked = s & !1u64; + self.state[dst_masked].set_u64(masked); + if s != 0 { + let base = masked as *const u8; + unsafe { + let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); + let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + self.state[dst_code].set_i64(code); + self.state[dst_vmctx].set_i64(vmctx); + } + self.pc_rel_jump::(offset) + } else { + ControlFlow::Continue(()) + } + } + + fn xband_funcref_dispatch_not_x64( + &mut self, + dst_masked: XReg, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_u64(); + let masked = s & !1u64; + self.state[dst_masked].set_u64(masked); + if s == 0 { + self.pc_rel_jump::(offset) + } else { + let base = masked as *const u8; + unsafe { + let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); + let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + self.state[dst_code].set_i64(code); + self.state[dst_vmctx].set_i64(vmctx); + } + ControlFlow::Continue(()) + } + } + + fn xband_funcref_dispatch_x32( + &mut self, + dst_masked: XReg, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_u32(); + let masked = s & !1u32; + self.state[dst_masked].set_u32(masked); + if s != 0 { + let base = masked as usize as *const u8; + unsafe { + let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); + let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + self.state[dst_code].set_i32(code); + self.state[dst_vmctx].set_i32(vmctx); + } + self.pc_rel_jump::(offset) + } else { + ControlFlow::Continue(()) + } + } + + fn xband_funcref_dispatch_not_x32( + &mut self, + dst_masked: XReg, + dst_code: XReg, + dst_vmctx: XReg, + src: XReg, + offset_code: 
i8, + offset_vmctx: i8, + offset: PcRelOffset, + ) -> ControlFlow { + let s = self.state[src].get_u32(); + let masked = s & !1u32; + self.state[dst_masked].set_u32(masked); + if s == 0 { + self.pc_rel_jump::(offset) + } else { + let base = masked as usize as *const u8; + unsafe { + let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); + let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + self.state[dst_code].set_i32(code); + self.state[dst_vmctx].set_i32(vmctx); + } + ControlFlow::Continue(()) + } + } + fn xbor32(&mut self, operands: BinaryOperands) -> ControlFlow { let a = self.state[operands.src1].get_u32(); let b = self.state[operands.src2].get_u32(); diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs index 9eb074c7e3e9..2d532ed1ff59 100644 --- a/pulley/src/lib.rs +++ b/pulley/src/lib.rs @@ -625,6 +625,38 @@ macro_rules! for_each_op { /// Inverted form of `xfuncref_dispatch_x32`. xfuncref_dispatch_not_x32 = XfuncrefDispatchNotX32 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// Phase-3 fusion: combine `xband64_s8 dst_masked, src, -2` with + /// `xfuncref_dispatch_*_x64 dst_code, dst_vmctx, dst_masked, + /// offset_code, offset_vmctx, offset` into a single Pulley + /// dispatch. `src` is the UNMASKED funcref pointer; the + /// init-bit strip happens internally. + /// + /// 64-bit forward form: `dst_masked = src & -2` unconditionally; + /// if `src != 0`, load `wasm_call` from `dst_masked + offset_code` + /// into `dst_code`, load callee `vmctx` from `dst_masked + + /// offset_vmctx` into `dst_vmctx`, and branch by `offset`. The + /// null side falls through to the slow path. + /// + /// Soundness: same as `xfuncref_dispatch_*` — gated on + /// `is_eagerly_initialized_funcref_table` so `src` is provably + /// non-zero at runtime. 
Testing the unmasked `src` for null vs + /// the masked `dst_masked` differs only at `v == 1` + /// (tagged-null) which the predicate excludes. Saves one more + /// match_loop dispatch per call_indirect site vs phase 2 (the + /// preceding standalone `xband64_s8` is absorbed). + xband_funcref_dispatch_x64 = XbandFuncrefDispatchX64 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// Inverted form: branch when `src == 0`, loads-and-fall-through + /// on `src != 0`. Used by MachBuffer's branch-direction flip + /// when the fast path is the natural fall-through. The + /// `dst_masked = src & -2` write is unconditional in both + /// forms. + xband_funcref_dispatch_not_x64 = XbandFuncrefDispatchNotX64 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// 32-bit pointer-width form of `xband_funcref_dispatch_x64`. + /// Used on `pulley32` / arm64_32-apple-watchos. + xband_funcref_dispatch_x32 = XbandFuncrefDispatchX32 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// Inverted form of `xband_funcref_dispatch_x32`. + xband_funcref_dispatch_not_x32 = XbandFuncrefDispatchNotX32 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; + /// `low32(dst) = low32(src1) | low32(src2)` xbor32 = XBor32 { operands: BinaryOperands }; /// Same as `xbor64` but `src2` is a sign-extended 8-bit immediate. 
diff --git a/tests/disas/pulley-call-indirect-band-brif-fusion.wat b/tests/disas/pulley-call-indirect-band-brif-fusion.wat index 5d0ff495cf46..96137e0425b9 100644 --- a/tests/disas/pulley-call-indirect-band-brif-fusion.wat +++ b/tests/disas/pulley-call-indirect-band-brif-fusion.wat @@ -53,30 +53,29 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame_save 32, x16, x17, x26 +;; push_frame_save 32, x16, x17, x25 ;; xmov x3, x0 -;; br_if_xugteq32_u8 x2, 3, 0x58 // target = 0x71 +;; br_if_xugteq32_u8 x2, 3, 0x55 // target = 0x6e ;; 20: xload64le_o32 x0, x0, 48 ;; zext32 x15, x2 ;; xshl64_u6 x1, x15, 3 ;; xadd64 x0, x0, x1 ;; xload64le_o32 x0, x0, 0 -;; xband64_s8 x0, x0, -2 -;; xfuncref_dispatch_not_x64 x16, x17, x0, 8, 24, 0x1b // target = 0x56 +;; xband_funcref_dispatch_not_x64 x0, x16, x17, x0, 8, 24, 0x1c // target = 0x53 ;; xmov x2, x0 ;; xmov x1, x3 ;; xmov x0, x17 ;; call_indirect x16 -;; pop_frame_restore 32, x16, x17, x26 +;; pop_frame_restore 32, x16, x17, x25 ;; ret -;; 56: xzero x0 -;; 58: xmov x26, x3 -;; 5b: call3 x26, x0, x15, 0x270 // target = 0x2cb -;; 63: xmov x2, x0 -;; 66: xmov x0, x17 -;; 69: xmov x1, x26 -;; 6c: jump -0x1e // target = 0x4e -;; 71: trap +;; 53: xzero x0 +;; 55: xmov x25, x3 +;; 58: call3 x25, x0, x15, 0x270 // target = 0x2c8 +;; 60: xmov x2, x0 +;; 63: xmov x0, x17 +;; 66: xmov x1, x25 +;; 69: jump -0x1e // target = 0x4b +;; 6e: trap ;; ╰─╼ trap: TableOutOfBounds ;; ;; wasm[0]::array_to_wasm_trampoline[0]: @@ -87,19 +86,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0xc6 +;; xpcadd x15, 0x2a // target = 0xc3 ;; xstore64le_o32 x13, 80, x15 -;; call -0xab // target = 0x0 +;; call -0xa8 // target = 0x0 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc6 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc3 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; 
pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; c6: xzero x0 -;; c8: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; cd: ret +;; c3: xzero x0 +;; c5: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ca: ret ;; ;; wasm[0]::array_to_wasm_trampoline[1]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -109,19 +108,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0x120 +;; xpcadd x15, 0x2a // target = 0x11d ;; xstore64le_o32 x13, 80, x15 -;; call -0x100 // target = 0x5 +;; call -0xfd // target = 0x5 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x120 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x11d ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 120: xzero x0 -;; 122: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 127: ret +;; 11d: xzero x0 +;; 11f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 124: ret ;; ;; wasm[0]::array_to_wasm_trampoline[2]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -131,19 +130,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0x17a +;; xpcadd x15, 0x2a // target = 0x177 ;; xstore64le_o32 x13, 80, x15 -;; call -0x154 // target = 0xb +;; call -0x151 // target = 0xb ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception 
handler: default handler, no dynamic context, handler=0x17a +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x177 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 17a: xzero x0 -;; 17c: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 181: ret +;; 177: xzero x0 +;; 179: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 17e: ret ;; ;; wasm[0]::array_to_wasm_trampoline[3]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -154,19 +153,19 @@ ;; xstore64le_o32 x15, 72, x2 ;; xmov x2, sp ;; xstore64le_o32 x15, 64, x2 -;; xpcadd x2, 0x2d // target = 0x1de +;; xpcadd x2, 0x2d // target = 0x1db ;; xstore64le_o32 x15, 80, x2 -;; call3 x0, x1, x14, -0x1af // target = 0x11 +;; call3 x0, x1, x14, -0x1ac // target = 0x11 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1de +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1db ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 1de: xzero x0 -;; 1e0: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 1e5: ret +;; 1db: xzero x0 +;; 1dd: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1e2: ret ;; ;; signatures[0]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -185,15 +184,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x15, x0 -;; br_if_not32 x15, 0x13 // target = 0x23d -;; 230: xload32le_o32 x0, x16, 0 +;; br_if_not32 x15, 
0x13 // target = 0x23a +;; 22d: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 23d: xmov x1, x17 -;; 240: xload64le_o32 x0, x1, 16 -;; 247: xload64le_o32 x0, x0, 408 -;; 24e: call_indirect_host 52 -;; 252: trap +;; 23a: xmov x1, x17 +;; 23d: xload64le_o32 x0, x1, 16 +;; 244: xload64le_o32 x0, x0, 408 +;; 24b: call_indirect_host 52 +;; 24f: trap ;; ;; signatures[1]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -213,15 +212,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x0, x0 -;; br_if_not32 x0, 0x13 // target = 0x2b3 -;; 2a6: xload32le_o32 x0, x16, 0 +;; br_if_not32 x0, 0x13 // target = 0x2b0 +;; 2a3: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 2b3: xmov x1, x17 -;; 2b6: xload64le_o32 x0, x1, 16 -;; 2bd: xload64le_o32 x0, x0, 408 -;; 2c4: call_indirect_host 52 -;; 2c8: trap +;; 2b0: xmov x1, x17 +;; 2b3: xload64le_o32 x0, x1, 16 +;; 2ba: xload64le_o32 x0, x0, 408 +;; 2c1: call_indirect_host 52 +;; 2c5: trap ;; ;; wasmtime_builtin_table_get_lazy_init_func_ref: ;; push_frame diff --git a/tests/disas/pulley-fusion-fires-32bit.wat b/tests/disas/pulley-fusion-fires-32bit.wat index c48a7b4c6a06..a540637aae2e 100644 --- a/tests/disas/pulley-fusion-fires-32bit.wat +++ b/tests/disas/pulley-fusion-fires-32bit.wat @@ -49,30 +49,29 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame_save 32, x16, x17, x26 -;; br_if_xugteq32_u8 x2, 3, 0x5b // target = 0x71 +;; push_frame_save 32, x16, x17, x25 +;; br_if_xugteq32_u8 x2, 3, 0x58 // target = 0x6e ;; 1d: xload32le_o32 x15, x0, 24 ;; xmov x3, x0 ;; xshl32_u6 x0, x2, 2 ;; xadd32 x15, x15, x0 ;; xload32le_o32 x15, x15, 0 -;; xband32_s8 x0, x15, -2 -;; xfuncref_dispatch_not_x32 x16, x17, x0, 4, 12, 0x1b // target = 0x53 +;; xband_funcref_dispatch_not_x32 x0, x16, x17, x15, 4, 12, 0x1c // target = 0x50 ;; xmov x2, x0 ;; xmov x1, x3 ;; xmov x0, x17 ;; call_indirect x16 -;; pop_frame_restore 32, x16, x17, x26 +;; pop_frame_restore 32, x16, x17, x25 ;; ret -;; 53: 
xzero x0 -;; 55: zext32 x1, x2 -;; 58: xmov x26, x3 -;; 5b: call3 x26, x0, x1, 0x270 // target = 0x2cb -;; 63: xmov x2, x0 -;; 66: xmov x0, x17 -;; 69: xmov x1, x26 -;; 6c: jump -0x21 // target = 0x4b -;; 71: trap +;; 50: xzero x0 +;; 52: zext32 x1, x2 +;; 55: xmov x25, x3 +;; 58: call3 x25, x0, x1, 0x270 // target = 0x2c8 +;; 60: xmov x2, x0 +;; 63: xmov x0, x17 +;; 66: xmov x1, x25 +;; 69: jump -0x21 // target = 0x48 +;; 6e: trap ;; ╰─╼ trap: TableOutOfBounds ;; ;; wasm[0]::array_to_wasm_trampoline[0]: @@ -83,19 +82,19 @@ ;; xstore32le_o32 x13, 48, x14 ;; xmov x14, sp ;; xstore32le_o32 x13, 44, x14 -;; xpcadd x15, 0x2a // target = 0xc6 +;; xpcadd x15, 0x2a // target = 0xc3 ;; xstore32le_o32 x13, 52, x15 -;; call -0xab // target = 0x0 +;; call -0xa8 // target = 0x0 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc6 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc3 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; c6: xzero x0 -;; c8: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; cd: ret +;; c3: xzero x0 +;; c5: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ca: ret ;; ;; wasm[0]::array_to_wasm_trampoline[1]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -105,19 +104,19 @@ ;; xstore32le_o32 x13, 48, x14 ;; xmov x14, sp ;; xstore32le_o32 x13, 44, x14 -;; xpcadd x15, 0x2a // target = 0x120 +;; xpcadd x15, 0x2a // target = 0x11d ;; xstore32le_o32 x13, 52, x15 -;; call -0x100 // target = 0x5 +;; call -0xfd // target = 0x5 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x120 +;; 
╰─╼ exception handler: default handler, no dynamic context, handler=0x11d ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 120: xzero x0 -;; 122: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 127: ret +;; 11d: xzero x0 +;; 11f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 124: ret ;; ;; wasm[0]::array_to_wasm_trampoline[2]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -127,19 +126,19 @@ ;; xstore32le_o32 x13, 48, x14 ;; xmov x14, sp ;; xstore32le_o32 x13, 44, x14 -;; xpcadd x15, 0x2a // target = 0x17a +;; xpcadd x15, 0x2a // target = 0x177 ;; xstore32le_o32 x13, 52, x15 -;; call -0x154 // target = 0xb +;; call -0x151 // target = 0xb ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x17a +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x177 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 17a: xzero x0 -;; 17c: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 181: ret +;; 177: xzero x0 +;; 179: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 17e: ret ;; ;; wasm[0]::array_to_wasm_trampoline[3]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -150,19 +149,19 @@ ;; xstore32le_o32 x15, 48, x2 ;; xmov x2, sp ;; xstore32le_o32 x15, 44, x2 -;; xpcadd x2, 0x2d // target = 0x1de +;; xpcadd x2, 0x2d // target = 0x1db ;; xstore32le_o32 
x15, 52, x2 -;; call3 x0, x1, x14, -0x1af // target = 0x11 +;; call3 x0, x1, x14, -0x1ac // target = 0x11 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1de +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1db ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 1de: xzero x0 -;; 1e0: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 1e5: ret +;; 1db: xzero x0 +;; 1dd: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1e2: ret ;; ;; signatures[0]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -181,15 +180,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x15, x0 -;; br_if_not32 x15, 0x13 // target = 0x23d -;; 230: xload32le_o32 x0, x16, 0 +;; br_if_not32 x15, 0x13 // target = 0x23a +;; 22d: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 23d: xmov x1, x17 -;; 240: xload32le_o32 x0, x1, 8 -;; 247: xload32le_o32 x0, x0, 204 -;; 24e: call_indirect_host 52 -;; 252: trap +;; 23a: xmov x1, x17 +;; 23d: xload32le_o32 x0, x1, 8 +;; 244: xload32le_o32 x0, x0, 204 +;; 24b: call_indirect_host 52 +;; 24f: trap ;; ;; signatures[1]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -209,15 +208,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x0, x0 -;; br_if_not32 x0, 0x13 // target = 0x2b3 -;; 2a6: xload32le_o32 x0, x16, 0 +;; br_if_not32 x0, 0x13 // target = 0x2b0 +;; 2a3: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 2b3: xmov x1, x17 -;; 2b6: xload32le_o32 x0, x1, 8 -;; 2bd: xload32le_o32 x0, x0, 204 -;; 2c4: call_indirect_host 52 -;; 2c8: trap +;; 2b0: xmov x1, x17 +;; 2b3: xload32le_o32 x0, x1, 8 +;; 2ba: xload32le_o32 x0, x0, 204 +;; 2c1: 
call_indirect_host 52 +;; 2c5: trap ;; ;; wasmtime_builtin_table_get_lazy_init_func_ref: ;; push_frame diff --git a/tests/disas/pulley-fusion-fires-multi-call.wat b/tests/disas/pulley-fusion-fires-multi-call.wat index de9b7a2cd04a..4e3bf866747f 100644 --- a/tests/disas/pulley-fusion-fires-multi-call.wat +++ b/tests/disas/pulley-fusion-fires-multi-call.wat @@ -53,27 +53,25 @@ ;; push_frame_save 48, x16, x17, x18, x27, x28, x29 ;; xmov x18, x0 ;; xmov x29, x3 -;; br_if_xugteq32_u8 x2, 3, 0xac // target = 0xc8 +;; br_if_xugteq32_u8 x2, 3, 0xa6 // target = 0xc2 ;; 23: xload64le_o32 x28, x0, 48 ;; zext32 x1, x2 ;; xshl64_u6 x0, x1, 3 ;; xadd64 x0, x28, x0 ;; xload64le_o32 x0, x0, 0 -;; xband64_s8 x0, x0, -2 -;; xfuncref_dispatch_not_x64 x16, x17, x0, 8, 24, 0x57 // target = 0x95 +;; xband_funcref_dispatch_not_x64 x0, x16, x17, x0, 8, 24, 0x55 // target = 0x8f ;; xmov x2, x0 ;; xmov x0, x17 ;; xmov x1, x18 ;; call_indirect x16 ;; xmov x3, x29 ;; xmov x17, x0 -;; br_if_xugteq32_u8 x3, 3, 0x72 // target = 0xcb -;; 60: zext32 x1, x3 +;; br_if_xugteq32_u8 x3, 3, 0x6f // target = 0xc5 +;; 5d: zext32 x1, x3 ;; xshl64_u6 x0, x1, 3 ;; xadd64 x0, x28, x0 ;; xload64le_o32 x0, x0, 0 -;; xband64_s8 x0, x0, -2 -;; xfuncref_dispatch_not_x64 x27, x28, x0, 8, 24, 0x39 // target = 0xad +;; xband_funcref_dispatch_not_x64 x0, x27, x28, x0, 8, 24, 0x3a // target = 0xa7 ;; xmov x2, x0 ;; xmov x1, x18 ;; xmov x0, x28 @@ -82,18 +80,18 @@ ;; xadd32 x0, x1, x0 ;; pop_frame_restore 48, x16, x17, x18, x27, x28, x29 ;; ret -;; 95: xzero x0 -;; 97: xmov x2, x18 -;; 9a: call3 x2, x0, x1, 0x29e // target = 0x338 -;; a2: xmov x2, x0 -;; a5: xmov x0, x17 -;; a8: jump -0x5a // target = 0x4e -;; ad: xzero x0 -;; af: xmov x16, x18 -;; b2: call3 x16, x0, x1, 0x286 // target = 0x338 -;; ba: xmov x2, x0 -;; bd: xmov x0, x28 -;; c0: xmov x1, x16 -;; c3: jump -0x3c // target = 0x87 -;; c8: trap -;; cb: trap +;; 8f: xzero x0 +;; 91: xmov x2, x18 +;; 94: call3 x2, x0, x1, 0x29e // target = 0x332 +;; 9c: xmov x2, 
x0 +;; 9f: xmov x0, x17 +;; a2: jump -0x57 // target = 0x4b +;; a7: xzero x0 +;; a9: xmov x16, x18 +;; ac: call3 x16, x0, x1, 0x286 // target = 0x332 +;; b4: xmov x2, x0 +;; b7: xmov x0, x28 +;; ba: xmov x1, x16 +;; bd: jump -0x3c // target = 0x81 +;; c2: trap +;; c5: trap diff --git a/tests/disas/pulley-fusion-fires-return-call-indirect.wat b/tests/disas/pulley-fusion-fires-return-call-indirect.wat index a008f4d59609..db63dbc4f1a2 100644 --- a/tests/disas/pulley-fusion-fires-return-call-indirect.wat +++ b/tests/disas/pulley-fusion-fires-return-call-indirect.wat @@ -37,27 +37,26 @@ ;; ret ;; ;; wasm[0]::function[1]: -;; push_frame_save 32, x16, x17, x25 -;; br_if_xneq32_i8 x2, 0, 0x5d // target = 0x67 +;; push_frame_save 32, x16, x17, x24 +;; br_if_xneq32_i8 x2, 0, 0x5a // target = 0x64 ;; 11: xload64le_o32 x15, x0, 48 ;; xmov x1, x0 ;; zext32 x14, x2 ;; xshl64_u6 x0, x14, 3 ;; xadd64 x15, x15, x0 ;; xload64le_o32 x15, x15, 0 -;; xband64_s8 x0, x15, -2 -;; xfuncref_dispatch_not_x64 x16, x17, x0, 8, 24, 0x1a // target = 0x49 +;; xband_funcref_dispatch_not_x64 x0, x16, x17, x15, 8, 24, 0x1b // target = 0x46 ;; xmov x15, x16 ;; xmov x2, x0 ;; xmov x0, x17 -;; pop_frame_restore 32, x16, x17, x25 +;; pop_frame_restore 32, x16, x17, x24 ;; xjump x15 -;; 49: xzero x0 -;; xmov x25, x1 -;; call3 x25, x0, x14, 0x1bf // target = 0x20d +;; 46: xzero x0 +;; xmov x24, x1 +;; call3 x24, x0, x14, 0x1bf // target = 0x20a ;; xmov x2, x0 ;; xmov x0, x17 -;; xmov x1, x25 +;; xmov x1, x24 ;; xmov x15, x16 -;; jump -0x20 // target = 0x42 -;; 67: trap +;; jump -0x20 // target = 0x3f +;; 64: trap From 69201c292bb04e665ad80f2533bd423780669b0a Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 20:02:38 -0700 Subject: [PATCH 19/22] pulley: add call_indirect{1,2,3,4} fused indirect-call ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror of the direct-call `call{1,2,3,4}` family: each new op combines `xmov xN, argN` ABI fixups 
with the indirect call. Reads arg values before writing the ABI registers so the sequence is safe when an argN aliases the corresponding ABI register. `call_indirect1 dst, arg1`: x0 = state[arg1] lr = pc pc = state[dst] Saves up to N Pulley dispatches per call_indirect site (one per moved arg). In practice at least one — the callee vmctx ABI fixup. Cranelift wiring in the next commit. --- pulley/src/interp.rs | 78 ++++++++++++++++++++++++++++++++++++++++++++ pulley/src/lib.rs | 11 +++++++ 2 files changed, 89 insertions(+) diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs index f3ef2f91e040..a3cbd4449b09 100644 --- a/pulley/src/interp.rs +++ b/pulley/src/interp.rs @@ -1425,6 +1425,84 @@ impl OpVisitor for Interpreter<'_> { ControlFlow::Continue(()) } + fn call_indirect1(&mut self, dst: XReg, arg1: XReg) -> ControlFlow { + // Phase-4 fusion: combines `xmov x0, arg1` with `call_indirect dst`. + // Read arg1 BEFORE writing x0 so this is safe even when `arg1 == x0`. + let arg1_val = self.state[arg1]; + let target = self.state[dst].get_ptr(); + let return_addr = self.pc.as_ptr(); + self.state.lr = return_addr.as_ptr(); + self.state[XReg::x0] = arg1_val; + // SAFETY: same as `call_indirect`. + unsafe { + self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target)); + } + ControlFlow::Continue(()) + } + + fn call_indirect2(&mut self, dst: XReg, arg1: XReg, arg2: XReg) -> ControlFlow { + let (a1, a2) = (self.state[arg1], self.state[arg2]); + let target = self.state[dst].get_ptr(); + let return_addr = self.pc.as_ptr(); + self.state.lr = return_addr.as_ptr(); + self.state[XReg::x0] = a1; + self.state[XReg::x1] = a2; + // SAFETY: same as `call_indirect`. 
+ unsafe { + self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target)); + } + ControlFlow::Continue(()) + } + + fn call_indirect3( + &mut self, + dst: XReg, + arg1: XReg, + arg2: XReg, + arg3: XReg, + ) -> ControlFlow { + let (a1, a2, a3) = (self.state[arg1], self.state[arg2], self.state[arg3]); + let target = self.state[dst].get_ptr(); + let return_addr = self.pc.as_ptr(); + self.state.lr = return_addr.as_ptr(); + self.state[XReg::x0] = a1; + self.state[XReg::x1] = a2; + self.state[XReg::x2] = a3; + // SAFETY: same as `call_indirect`. + unsafe { + self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target)); + } + ControlFlow::Continue(()) + } + + fn call_indirect4( + &mut self, + dst: XReg, + arg1: XReg, + arg2: XReg, + arg3: XReg, + arg4: XReg, + ) -> ControlFlow { + let (a1, a2, a3, a4) = ( + self.state[arg1], + self.state[arg2], + self.state[arg3], + self.state[arg4], + ); + let target = self.state[dst].get_ptr(); + let return_addr = self.pc.as_ptr(); + self.state.lr = return_addr.as_ptr(); + self.state[XReg::x0] = a1; + self.state[XReg::x1] = a2; + self.state[XReg::x2] = a3; + self.state[XReg::x3] = a4; + // SAFETY: same as `call_indirect`. + unsafe { + self.pc = UnsafeBytecodeStream::new(NonNull::new_unchecked(target)); + } + ControlFlow::Continue(()) + } + fn jump(&mut self, offset: PcRelOffset) -> ControlFlow { self.pc_rel_jump::(offset) } diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs index 2d532ed1ff59..1e7fc851bc39 100644 --- a/pulley/src/lib.rs +++ b/pulley/src/lib.rs @@ -115,6 +115,17 @@ macro_rules! for_each_op { /// Transfer control to the PC in `reg` and set `lr` to the PC just /// after this instruction. call_indirect = CallIndirect { reg: XReg }; + /// Like `call_indirect`, but also `x0 = arg1`. Saves one Pulley + /// dispatch vs `xmov x0, arg1; call_indirect reg` for the common + /// call_indirect pattern where one ABI register (usually `vmctx`) + /// is set up immediately before the indirect call. 
+ call_indirect1 = CallIndirect1 { reg: XReg, arg1: XReg }; + /// Like `call_indirect`, but also `x0, x1 = arg1, arg2`. + call_indirect2 = CallIndirect2 { reg: XReg, arg1: XReg, arg2: XReg }; + /// Like `call_indirect`, but also `x0, x1, x2 = arg1, arg2, arg3`. + call_indirect3 = CallIndirect3 { reg: XReg, arg1: XReg, arg2: XReg, arg3: XReg }; + /// Like `call_indirect`, but also `x0, x1, x2, x3 = arg1, arg2, arg3, arg4`. + call_indirect4 = CallIndirect4 { reg: XReg, arg1: XReg, arg2: XReg, arg3: XReg, arg4: XReg }; /// Unconditionally transfer control to the PC at the given offset. jump = Jump { offset: PcRelOffset }; From c08ada977f587b0a50c3dfcf053edef2286d7093 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 20:02:38 -0700 Subject: [PATCH 20/22] cranelift/pulley: pass first 4 indirect-call args via call_indirectN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend `Inst::IndirectCall`'s `info.dest` from `XReg` to `PulleyCallIndirect { target, args: SmallVec<[XReg; 4]> }`, parallel to `PulleyCall`. `gen_call_ind_info` pulls the first 0–4 integer args from `uses` (where they were going through regalloc's `reg_fixed_use`, synthesising an `xmov` each) into `args`, where they flow as free reg uses and the emitted `call_indirect{1,2,3,4}` opcode moves them at call time. The emit side picks the narrowest op after the same "drop args already in their ABI register" loop used by direct calls. Phase-3's `xband_funcref_dispatch_*` writing `dst_vmctx` into a free register + `call_indirect1 dst_code, dst_vmctx` is the headline shrink (one fewer Pulley dispatch per call_indirect on the eager-table fast path). Filetest snapshots updated for the new `dest` shape. 
--- .../src/isa/pulley_shared/inst/args.rs | 20 ++++++++++++ .../src/isa/pulley_shared/inst/emit.rs | 18 ++++++++++- .../codegen/src/isa/pulley_shared/inst/mod.rs | 15 +++++++-- .../src/isa/pulley_shared/lower/isle.rs | 32 ++++++++++++++++--- .../filetests/isa/pulley32/call.clif | 2 +- .../filetests/isa/pulley32/exceptions.clif | 2 +- .../filetests/isa/pulley32/preserve-all.clif | 4 +-- .../filetests/isa/pulley64/call.clif | 2 +- .../filetests/isa/pulley64/exceptions.clif | 2 +- .../filetests/isa/pulley64/preserve-all.clif | 4 +-- 10 files changed, 85 insertions(+), 16 deletions(-) diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/args.rs b/cranelift/codegen/src/isa/pulley_shared/inst/args.rs index e97e3303ef99..79bafbe2fa39 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/args.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/args.rs @@ -577,6 +577,26 @@ pub struct PulleyCall { pub args: SmallVec<[XReg; 4]>, } +/// Payload of `CallInfo` for indirect-call instructions. +/// +/// Mirror of `PulleyCall` for `Inst::IndirectCall`: the call target is a +/// runtime register (the loaded `wasm_call` pointer at the call_indirect +/// dispatch tail), and the first 0–4 integer ABI args are passed as free +/// registers so the `call_indirect1/2/3/4` opcodes can move them into +/// `x0..x3` as part of the call (saving one `xmov` per arg on the hot +/// dispatch path). Remaining args live in `CallInfo::uses` with fixed +/// pregs, just as for `PulleyCall`. +#[derive(Clone, Debug)] +pub struct PulleyCallIndirect { + /// The register holding the call target (e.g. the `wasm_call` pointer + /// loaded out of a `VMFuncRef`). + pub target: XReg, + /// Up to 4 integer args destined for `x0..x3`. Tracked separately so + /// regalloc doesn't insert moves and the `call_indirectN` opcode moves + /// them itself. 
+ pub args: SmallVec<[XReg; 4]>, +} + pub use super::super::lower::isle::generated_code::AddrO32; impl Copy for AddrO32 {} diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs index 7a04da9978cd..832744924b82 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs @@ -233,7 +233,23 @@ fn pulley_emit

( } Inst::IndirectCall { info } => { - enc::call_indirect(sink, info.dest); + // If x0..xN args are already in their correct ABI register + // (because regalloc allocated the producer's vreg there), drop + // them off the end so we can use a narrower `call_indirectN` + // op — mirror of the direct-call shrink loop above. + let target = info.dest.target; + let mut args = &info.dest.args[..]; + while !args.is_empty() && args.last().copied() == XReg::new(x_reg(args.len() - 1)) { + args = &args[..args.len() - 1]; + } + match args { + [] => enc::call_indirect(sink, target), + [x0] => enc::call_indirect1(sink, target, *x0), + [x0, x1] => enc::call_indirect2(sink, target, *x0, *x1), + [x0, x1, x2] => enc::call_indirect3(sink, target, *x0, *x1, *x2), + [x0, x1, x2, x3] => enc::call_indirect4(sink, target, *x0, *x1, *x2, *x3), + _ => unreachable!(), + } if let Some(s) = state.take_stack_map() { let offset = sink.cur_offset(); diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs index 61c7c5870830..c0268297a361 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs @@ -206,14 +206,25 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { } } Inst::IndirectCall { info } => { - collector.reg_use(&mut info.dest); let CallInfo { uses, defs, + dest, try_call_info, clobbers, .. } = &mut **info; + + // Phase-4: the target and the first up-to-4 integer args live + // in `dest` and are passed as free reg uses; the emitted + // `call_indirect{1,2,3,4}` op moves the args into x0..x3 at + // call time. Remaining args still flow through `uses` with + // fixed pregs as before. 
+ let PulleyCallIndirect { target, args } = dest; + collector.reg_use(target); + for arg in args { + collector.reg_use(arg); + } for CallArgPair { vreg, preg } in uses { collector.reg_fixed_use(vreg, *preg); } @@ -770,7 +781,7 @@ impl Inst { } Inst::IndirectCall { info } => { - let callee = format_reg(*info.dest); + let callee = format_reg(*info.dest.target); let try_call = info .try_call_info .as_ref() diff --git a/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs b/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs index 3068fb1137ff..c065c45a2c6a 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs +++ b/cranelift/codegen/src/isa/pulley_shared/lower/isle.rs @@ -10,8 +10,8 @@ use crate::ir::{condcodes::*, immediates::*, types::*, *}; use crate::isa::CallConv; use crate::isa::pulley_shared::{ inst::{ - FReg, OperandSize, PulleyCall, ReturnCallInfo, VReg, WritableFReg, WritableVReg, - WritableXReg, XReg, + FReg, OperandSize, PulleyCall, PulleyCallIndirect, ReturnCallInfo, VReg, WritableFReg, + WritableVReg, WritableXReg, XReg, }, lower::{Cond, regs}, *, @@ -30,7 +30,7 @@ type Unit = (); type VecArgPair = Vec; type VecRetPair = Vec; type BoxCallInfo = Box>; -type BoxCallIndInfo = Box>; +type BoxCallIndInfo = Box>; type BoxCallIndirectHostInfo = Box>; type BoxReturnCallInfo = Box>; type BoxReturnCallIndInfo = Box>; @@ -124,7 +124,7 @@ where &mut self, sig: Sig, dest: Reg, - uses: CallArgList, + mut uses: CallArgList, defs: CallRetList, try_call_info: Option, ) -> BoxCallIndInfo { @@ -133,8 +133,30 @@ where self.lower_ctx .abi_mut() .accumulate_outgoing_args_size(stack_ret_space + stack_arg_space); + let call_conv = self.lower_ctx.sigs()[sig].call_conv(); - let dest = XReg::new(dest).unwrap(); + // Mirror of `gen_call_info`: take out the first four integer + // arguments (x0..x3) and pass them through the `args` list so the + // emitted `call_indirect{1,2,3,4}` op can move them at call time. 
+ // Saves one Pulley dispatch per moved arg vs the previous "regalloc + // emits xmov; then `call_indirect`" sequence. + let mut args = SmallVec::new(); + uses.sort_by_key(|arg| arg.preg); + if call_conv != CallConv::PreserveAll { + uses.retain(|arg| { + if arg.preg != regs::x0() + && arg.preg != regs::x1() + && arg.preg != regs::x2() + && arg.preg != regs::x3() + { + return true; + } + args.push(XReg::new(arg.vreg).unwrap()); + false + }); + } + let target = XReg::new(dest).unwrap(); + let dest = PulleyCallIndirect { target, args }; Box::new( self.lower_ctx .gen_call_info(sig, dest, uses, defs, try_call_info, false), diff --git a/cranelift/filetests/filetests/isa/pulley32/call.clif b/cranelift/filetests/filetests/isa/pulley32/call.clif index c2dc9a09f6c9..aece47fc9a19 100644 --- a/cranelift/filetests/filetests/isa/pulley32/call.clif +++ b/cranelift/filetests/filetests/isa/pulley32/call.clif @@ -291,7 +291,7 @@ block0(v0: i32): ; VCode: ; push_frame ; block0: -; indirect_call x0, CallInfo { dest: XReg(p0i), uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x0, CallInfo { dest: PulleyCallIndirect { target: XReg(p0i), args: [] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false } ; pop_frame ; ret ; diff --git a/cranelift/filetests/filetests/isa/pulley32/exceptions.clif b/cranelift/filetests/filetests/isa/pulley32/exceptions.clif index 2d3dfef3e853..eeed198535d2 100644 --- a/cranelift/filetests/filetests/isa/pulley32/exceptions.clif +++ b/cranelift/filetests/filetests/isa/pulley32/exceptions.clif @@ -77,7 +77,7 @@ function %f2(i32, i32) -> i32, 
f32, f64 { ; block0: ; fconst64 f1, 4607182418800017408 ; fstore64 Slot(0), f1 // flags = notrap aligned -; indirect_call x1, CallInfo { dest: XReg(p1i), uses: [CallArgPair { vreg: p0i, preg: p0i }], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I32) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I32) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)] +; indirect_call x1, CallInfo { dest: PulleyCallIndirect { target: XReg(p1i), args: [XReg(p0i)] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I32) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I32) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)] ; block1: ; xone x0 ; f1 = fload64 Slot(0) // flags = notrap aligned diff --git a/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif b/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif index c7d523d4f6a6..c698bb1f71ea 100644 --- a/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif +++ b/cranelift/filetests/filetests/isa/pulley32/preserve-all.clif @@ -15,8 +15,8 @@ block0(v0: i64): ; xmov x3, x0 ; xmov x1, x3 ; xmov x2, x3 -; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair 
{ vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } -; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } ; pop_frame ; ret ; diff --git a/cranelift/filetests/filetests/isa/pulley64/call.clif b/cranelift/filetests/filetests/isa/pulley64/call.clif index 16b271835620..bd6f9bba825f 100644 --- a/cranelift/filetests/filetests/isa/pulley64/call.clif +++ b/cranelift/filetests/filetests/isa/pulley64/call.clif @@ -291,7 +291,7 @@ block0(v0: i64): ; VCode: ; push_frame ; block0: -; indirect_call x0, CallInfo { dest: XReg(p0i), uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, 
callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x0, CallInfo { dest: PulleyCallIndirect { target: XReg(p0i), args: [] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }], clobbers: PRegSet { bits: [65534, 4294967295, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: None, patchable: false } ; pop_frame ; ret ; diff --git a/cranelift/filetests/filetests/isa/pulley64/exceptions.clif b/cranelift/filetests/filetests/isa/pulley64/exceptions.clif index 88c5528c1935..6a0b7b1577a1 100644 --- a/cranelift/filetests/filetests/isa/pulley64/exceptions.clif +++ b/cranelift/filetests/filetests/isa/pulley64/exceptions.clif @@ -79,7 +79,7 @@ function %f2(i32, i64) -> i32, f32, f64 { ; block0: ; fconst64 f1, 4607182418800017408 ; fstore64 Slot(0), f1 // flags = notrap aligned -; indirect_call x1, CallInfo { dest: XReg(p1i), uses: [CallArgPair { vreg: p0i, preg: p0i }], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I64) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 0] }, callee_conv: Tail, caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)] +; indirect_call x1, CallInfo { dest: PulleyCallIndirect { target: XReg(p1i), args: [XReg(p0i)] }, uses: [], defs: [CallRetPair { vreg: Writable { reg: p0f }, location: Reg(p0f, types::F32) }, CallRetPair { vreg: Writable { reg: p0i }, location: Reg(p0i, types::I64) }, CallRetPair { vreg: Writable { reg: p1i }, location: Reg(p1i, types::I64) }], clobbers: PRegSet { bits: [4294967292, 4294967294, 4294967295, 0] }, callee_conv: Tail, 
caller_conv: Fast, callee_pop_size: 0, try_call_info: Some(TryCallInfo { continuation: MachLabel(1), exception_handlers: [Default(MachLabel(2))] }), patchable: false }; jump MachLabel(1); catch [default: MachLabel(2)] ; block1: ; xone x0 ; f1 = fload64 Slot(0) // flags = notrap aligned diff --git a/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif b/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif index 2b6a28ce9ece..44bc72fcaf25 100644 --- a/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif +++ b/cranelift/filetests/filetests/isa/pulley64/preserve-all.clif @@ -15,8 +15,8 @@ block0(v0: i64): ; xmov x3, x0 ; xmov x1, x3 ; xmov x2, x3 -; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } -; indirect_call x3, CallInfo { dest: XReg(p3i), uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } +; indirect_call x3, CallInfo { dest: PulleyCallIndirect { target: XReg(p3i), args: [] }, uses: [CallArgPair { vreg: p0i, preg: p0i }, CallArgPair { vreg: p1i, preg: p1i }, 
CallArgPair { vreg: p2i, preg: p2i }, CallArgPair { vreg: p3i, preg: p3i }], defs: [], clobbers: PRegSet { bits: [0, 0, 0, 0] }, callee_conv: PreserveAll, caller_conv: SystemV, callee_pop_size: 0, try_call_info: None, patchable: false } ; pop_frame ; ret ; From c80089172ea72be8b217c0830023807fda8c21f2 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Thu, 21 May 2026 20:02:38 -0700 Subject: [PATCH 21/22] pulley: trap on null in 8 fused funcref-dispatch handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review on the rebeckerspecialties wasmtime fork PR pointed out that phase-2/3's continuation-block load absorption breaks the lazy-init slow path's correctness: the slow path's libcall rejoins `continuation_block` via a block param, and after absorption the loads are gone — `call_indirect` would see uninitialized `dst_code`/`dst_vmctx` if the slow path is ever reached. Fusion is gated on `is_eagerly_initialized_funcref_table` so the slow path is unreachable at runtime, but the previous handler's `ControlFlow::Continue(())` on null was advertised as defence-in-depth and was itself broken. Replace it with `done_trap` in the 8 affected handlers (4 forward + 4 `_not` variants across x64/x32 × xfuncref_dispatch/xband_funcref_dispatch). `offset` on the `_not` variants becomes vestigial; kept for encoding-shape parity.
--- .../codegen/src/isa/pulley_shared/inst.isle | 45 ++-- .../src/isa/pulley_shared/inst/args.rs | 21 +- .../src/isa/pulley_shared/inst/emit.rs | 123 ++++++++--- .../codegen/src/isa/pulley_shared/inst/mod.rs | 8 +- .../codegen/src/isa/pulley_shared/lower.rs | 194 ++++-------------- crates/cranelift/src/func_environ.rs | 16 +- pulley/src/interp.rs | 135 ++++++++---- pulley/src/lib.rs | 106 +++------- tests/all/pulley.rs | 123 ++++------- .../pulley-call-indirect-band-brif-fusion.wat | 112 +++++----- tests/disas/pulley-fusion-fires-32bit.wat | 111 +++++----- .../disas/pulley-fusion-fires-multi-call.wat | 52 ++--- .../pulley-fusion-no-fire-mutable-table.wat | 160 +++++++-------- ...ulley-fusion-no-fire-sig-runtime-check.wat | 44 ++-- .../pulley-fusion-no-fire-table-copy.wat | 147 +++++++++---- .../pulley-fusion-no-fire-table-fill.wat | 76 ++++--- .../pulley-fusion-no-fire-table-grow.wat | 75 ++++--- .../disas/pulley-fusion-no-fire-user-mask.wat | 4 +- tests/disas/pulley/call.wat | 6 +- 19 files changed, 747 insertions(+), 811 deletions(-) diff --git a/cranelift/codegen/src/isa/pulley_shared/inst.isle b/cranelift/codegen/src/isa/pulley_shared/inst.isle index 40413d55d2d6..bab0fa9a25de 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst.isle +++ b/cranelift/codegen/src/isa/pulley_shared/inst.isle @@ -67,16 +67,10 @@ ;; Jump to `then` if `c` is true, otherwise to `else`. (BrIf (cond Cond) (taken MachLabel) (not_taken MachLabel)) - ;; Fused band-immediate + brif: compute `dst = src & sign_extend(mask)`, - ;; then conditionally branch to `taken` if `src` is non-zero (low-32 or - ;; full 64-bit comparison per `size`), otherwise fall through to - ;; `not_taken`. The mask + dst write happen unconditionally. - ;; - ;; Emitted by the Cranelift Pulley backend at the call_indirect lazy-init - ;; brif site when the same funcref-pointer value feeds both the init-bit - ;; mask (`band v, -2`) and the null-check branch (`brif v`). 
The fusion - ;; saves one match_loop dispatch per call_indirect site. See pulley/ - ;; `xband*_s8_br_if_*` ops for the underlying bytecode. + ;; Fused `band src, mask` + `brif src` emitted at the call_indirect + ;; lazy-init brif site. `dst = src & sign_extend(mask)` is + ;; unconditional; the branch test is on `src`'s low-32 or full-64 bits + ;; per `size`. Pulley-side: `xband*_s8_br_if_*`. (BandBrIf (dst WritableXReg) (src XReg) @@ -85,19 +79,11 @@ (taken MachLabel) (not_taken MachLabel)) - ;; Funcref-dispatch fusion: branch-and-load fused dispatch op emitted - ;; at the call_indirect lazy-init brif site under the eager-init - ;; predicate + statically-elided sig check. Fuses - ;; `band v, -2 ; brif ; xload (v_masked + offset_code) ; xload (v_masked + offset_vmctx)` - ;; from across the brif's predecessor and continuation blocks into one - ;; Pulley dispatch. The pulley-side ops are `xfuncref_dispatch_{x64, - ;; not_x64, x32, not_x32}`; the runtime null check is defence-in-depth - ;; (the predicate guarantees the funcref is non-null at runtime, but - ;; the handler must match the original brif's null branch as a - ;; correctness fallback). See `try_fuse_funcref_dispatch` in - ;; `pulley_shared::lower` for the recogniser + Pulley's - ;; `pre_lower_analysis` for the cross-block sink that lets the - ;; continuation-block loads be absorbed into one MachInst here. + ;; Funcref-dispatch fusion: `brif (band v -2) + load code + load vmctx` + ;; across the brif and its continuation block. Emitted at the + ;; call_indirect lazy-init site under + ;; `is_eagerly_initialized_funcref_table`. Pulley-side: + ;; `xfuncref_dispatch_{x64,not_x64,x32,not_x32}`. (FuncrefDispatch (dst_code WritableXReg) (dst_vmctx WritableXReg) @@ -108,15 +94,10 @@ (taken MachLabel) (not_taken MachLabel)) - ;; Phase-3 fusion: absorbs the preceding standalone `xband_s8 dst, src, - ;; -2` into the same MachInst as a `FuncrefDispatch`. 
Operand - ;; structure differs: `src` here is the UNMASKED funcref (the band's - ;; input), and `dst_masked` is added as a third def so the brif's - ;; block-call-arg copy to the continuation block still has a real - ;; producer for the funcref-ptr block param. The underlying Pulley - ;; ops are `xband_funcref_dispatch_{x64,not_x64,x32,not_x32}`. See - ;; `try_fuse_band_into_funcref_dispatch` in `pulley_shared::lower` - ;; for when phase 3 fires vs phase 2. + ;; FuncrefDispatch + the preceding `xband_s8 -2` absorbed. `src` is + ;; the unmasked funcref; the fused op writes `dst_masked = src & -2` + ;; so the brif's block-call-arg copy still has a producer. + ;; Pulley-side: `xband_funcref_dispatch_{x64,not_x64,x32,not_x32}`. (BandFuncrefDispatch (dst_masked WritableXReg) (dst_code WritableXReg) diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/args.rs b/cranelift/codegen/src/isa/pulley_shared/inst/args.rs index 79bafbe2fa39..8e385df62a2e 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/args.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/args.rs @@ -577,23 +577,16 @@ pub struct PulleyCall { pub args: SmallVec<[XReg; 4]>, } -/// Payload of `CallInfo` for indirect-call instructions. -/// -/// Mirror of `PulleyCall` for `Inst::IndirectCall`: the call target is a -/// runtime register (the loaded `wasm_call` pointer at the call_indirect -/// dispatch tail), and the first 0–4 integer ABI args are passed as free -/// registers so the `call_indirect1/2/3/4` opcodes can move them into -/// `x0..x3` as part of the call (saving one `xmov` per arg on the hot -/// dispatch path). Remaining args live in `CallInfo::uses` with fixed -/// pregs, just as for `PulleyCall`. +/// Payload of `CallInfo` for `Inst::IndirectCall`. Mirror of `PulleyCall`: +/// the first 0–4 integer ABI args are tracked here so the emitted +/// `call_indirect{1,2,3,4}` opcode moves them into `x0..x3` itself +/// instead of regalloc synthesising `xmov`s. 
Remaining args use the +/// fixed-preg path in `CallInfo::uses`. #[derive(Clone, Debug)] pub struct PulleyCallIndirect { - /// The register holding the call target (e.g. the `wasm_call` pointer - /// loaded out of a `VMFuncRef`). + /// The register holding the call target. pub target: XReg, - /// Up to 4 integer args destined for `x0..x3`. Tracked separately so - /// regalloc doesn't insert moves and the `call_indirectN` opcode moves - /// them itself. + /// Up to 4 integer args destined for `x0..x3`. pub args: SmallVec<[XReg; 4]>, } diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs index 832744924b82..b06535c56312 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/emit.rs @@ -233,10 +233,9 @@ fn pulley_emit

( } Inst::IndirectCall { info } => { - // If x0..xN args are already in their correct ABI register - // (because regalloc allocated the producer's vreg there), drop - // them off the end so we can use a narrower `call_indirectN` - // op — mirror of the direct-call shrink loop above. + // Drop args already in their ABI register so we can pick a + // narrower `call_indirectN` — mirrors the direct-call shrink + // above. let target = info.dest.target; let mut args = &info.dest.args[..]; while !args.is_empty() && args.last().copied() == XReg::new(x_reg(args.len() - 1)) { @@ -407,12 +406,20 @@ fn pulley_emit

( match size { OperandSize::Size32 => { enc::xband32_s8_br_if_not_x32( - &mut inverted, dst_writable, src_reg, mask_imm, 0, + &mut inverted, + dst_writable, + src_reg, + mask_imm, + 0, ); } OperandSize::Size64 => { enc::xband64_s8_br_if_not_x64( - &mut inverted, dst_writable, src_reg, mask_imm, 0, + &mut inverted, + dst_writable, + src_reg, + mask_imm, + 0, ); } } @@ -422,12 +429,20 @@ fn pulley_emit

( match size { OperandSize::Size32 => { enc::xband32_s8_br_if_not_x32( - &mut inverted, dst_writable, src_reg, mask_imm, inv_rel, + &mut inverted, + dst_writable, + src_reg, + mask_imm, + inv_rel, ); } OperandSize::Size64 => { enc::xband64_s8_br_if_not_x64( - &mut inverted, dst_writable, src_reg, mask_imm, inv_rel, + &mut inverted, + dst_writable, + src_reg, + mask_imm, + inv_rel, ); } } @@ -482,12 +497,24 @@ fn pulley_emit

( match size { OperandSize::Size32 => { enc::xfuncref_dispatch_not_x32( - &mut inverted, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0, + &mut inverted, + dst_code_w, + dst_vmctx_w, + src_reg, + oc, + ov, + 0, ); } OperandSize::Size64 => { enc::xfuncref_dispatch_not_x64( - &mut inverted, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0, + &mut inverted, + dst_code_w, + dst_vmctx_w, + src_reg, + oc, + ov, + 0, ); } } @@ -497,12 +524,24 @@ fn pulley_emit

( match size { OperandSize::Size32 => { enc::xfuncref_dispatch_not_x32( - &mut inverted, dst_code_w, dst_vmctx_w, src_reg, oc, ov, inv_rel, + &mut inverted, + dst_code_w, + dst_vmctx_w, + src_reg, + oc, + ov, + inv_rel, ); } OperandSize::Size64 => { enc::xfuncref_dispatch_not_x64( - &mut inverted, dst_code_w, dst_vmctx_w, src_reg, oc, ov, inv_rel, + &mut inverted, + dst_code_w, + dst_vmctx_w, + src_reg, + oc, + ov, + inv_rel, ); } } @@ -513,12 +552,12 @@ fn pulley_emit

( sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::PcRel); sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted); patch_pc_rel_offset(sink, |sink| match size { - OperandSize::Size32 => enc::xfuncref_dispatch_x32( - sink, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0, - ), - OperandSize::Size64 => enc::xfuncref_dispatch_x64( - sink, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0, - ), + OperandSize::Size32 => { + enc::xfuncref_dispatch_x32(sink, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0) + } + OperandSize::Size64 => { + enc::xfuncref_dispatch_x64(sink, dst_code_w, dst_vmctx_w, src_reg, oc, ov, 0) + } }); debug_assert_eq!(sink.cur_offset(), taken_end); @@ -559,12 +598,26 @@ fn pulley_emit

( match size { OperandSize::Size32 => { enc::xband_funcref_dispatch_not_x32( - &mut inverted, dm_w, dc_w, dv_w, src_reg, oc, ov, 0, + &mut inverted, + dm_w, + dc_w, + dv_w, + src_reg, + oc, + ov, + 0, ); } OperandSize::Size64 => { enc::xband_funcref_dispatch_not_x64( - &mut inverted, dm_w, dc_w, dv_w, src_reg, oc, ov, 0, + &mut inverted, + dm_w, + dc_w, + dv_w, + src_reg, + oc, + ov, + 0, ); } } @@ -574,12 +627,26 @@ fn pulley_emit

( match size { OperandSize::Size32 => { enc::xband_funcref_dispatch_not_x32( - &mut inverted, dm_w, dc_w, dv_w, src_reg, oc, ov, inv_rel, + &mut inverted, + dm_w, + dc_w, + dv_w, + src_reg, + oc, + ov, + inv_rel, ); } OperandSize::Size64 => { enc::xband_funcref_dispatch_not_x64( - &mut inverted, dm_w, dc_w, dv_w, src_reg, oc, ov, inv_rel, + &mut inverted, + dm_w, + dc_w, + dv_w, + src_reg, + oc, + ov, + inv_rel, ); } } @@ -589,12 +656,12 @@ fn pulley_emit

( sink.use_label_at_offset(taken_end - 4, *taken, LabelUse::PcRel); sink.add_cond_branch(*start_offset, taken_end, *taken, &inverted); patch_pc_rel_offset(sink, |sink| match size { - OperandSize::Size32 => enc::xband_funcref_dispatch_x32( - sink, dm_w, dc_w, dv_w, src_reg, oc, ov, 0, - ), - OperandSize::Size64 => enc::xband_funcref_dispatch_x64( - sink, dm_w, dc_w, dv_w, src_reg, oc, ov, 0, - ), + OperandSize::Size32 => { + enc::xband_funcref_dispatch_x32(sink, dm_w, dc_w, dv_w, src_reg, oc, ov, 0) + } + OperandSize::Size64 => { + enc::xband_funcref_dispatch_x64(sink, dm_w, dc_w, dv_w, src_reg, oc, ov, 0) + } }); debug_assert_eq!(sink.cur_offset(), taken_end); diff --git a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs index c0268297a361..f9b1a518ae32 100644 --- a/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs +++ b/cranelift/codegen/src/isa/pulley_shared/inst/mod.rs @@ -215,11 +215,9 @@ fn pulley_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { .. } = &mut **info; - // Phase-4: the target and the first up-to-4 integer args live - // in `dest` and are passed as free reg uses; the emitted - // `call_indirect{1,2,3,4}` op moves the args into x0..x3 at - // call time. Remaining args still flow through `uses` with - // fixed pregs as before. + // First 0–4 integer args are passed as free reg uses; the + // emitted `call_indirect{1,2,3,4}` op moves them into x0..x3. + // Remaining args use the fixed-preg path in `uses`. 
let PulleyCallIndirect { target, args } = dest; collector.reg_use(target); for arg in args { diff --git a/cranelift/codegen/src/isa/pulley_shared/lower.rs b/cranelift/codegen/src/isa/pulley_shared/lower.rs index d15035c465ad..23f9c00865f4 100644 --- a/cranelift/codegen/src/isa/pulley_shared/lower.rs +++ b/cranelift/codegen/src/isa/pulley_shared/lower.rs @@ -25,18 +25,12 @@ where ir_inst: ir::Inst, targets: &[MachLabel], ) -> Option<()> { - // Phase-2 first: try fusing band+brif+xload+xload across the brif's - // predecessor block and its taken (continuation) target. The matching - // continuation-block loads were marked `absorbed_pure` by the - // `pre_lower` analysis hook below, so they have already been skipped - // in `lower_clif_block` and the FuncrefDispatch MachInst here defs - // their result vregs directly. + // Phase-2/3 fuse band+brif+xload+xload across the brif and its + // continuation block; phase-1 just band+brif. Both gated on the + // eager-init predicate. if try_fuse_funcref_dispatch::

(ctx, ir_inst, targets) { return Some(()); } - // Phase-1 fallback: fuse just band+brif (no continuation loads). - // Emits MInst::BandBrIf. See the doc-comment on the variant in - // `pulley_shared::inst::Inst`. if try_fuse_band_brif(ctx, ir_inst, targets) { return Some(()); } @@ -49,39 +43,22 @@ where } fn pre_lower(&self, ctx: &mut Lower) { - // Cross-block fusion analysis for phase-2 funcref dispatch. - // - // The main block-lowering loop runs in reverse layout order, so by - // the time `lower_branch` fires for the predecessor's brif, its - // taken target (the continuation block) has already had its - // instructions emitted to VCode. Marking the continuation's loads - // as `inst_absorbed_pure` AFTER that point is too late — the loads - // have already been lowered into MachInsts that write to the - // result vregs, and the FuncrefDispatch we'd emit at brif time - // would double-write to those same vregs (SSA violation). - // - // This analysis runs once before any block is lowered. For each - // brif whose cond is `band(v, -2)` AND whose taken target is a - // block that starts with two loads from the brif's first - // block-call-arg at the canonical VMFuncRef wasm_call / vmctx - // offsets, mark band + the two loads as absorbed_pure. The brif - // lowering then sees a clean slate (no double-writes) and emits - // one FuncrefDispatch MachInst. + // Block lowering runs in reverse layout order, so by the time + // `lower_branch` sees the brif, the continuation block has already + // been lowered. Marking the continuation's loads `absorbed_pure` + // after the fact would create double-writes to their result vregs. + // Run the recogniser once up front instead. pre_lower_pulley(ctx, P::pointer_width().bytes()); } } -/// Recognise the `brif (band v c) block(...) cold` shape emitted by -/// `func_environ::get_or_init_func_ref_table_elem` under the -/// `is_eagerly_initialized_funcref_table` predicate, and fuse it into a -/// single `MInst::BandBrIf`. 
Returns `true` if the fusion fired; the caller -/// then skips the generic ISLE rule. +/// Recognise `brif (band v -2) ...` at the call_indirect lazy-init site +/// and fuse it into `MInst::BandBrIf`. Returns true if fusion fired. /// -/// Soundness: testing `v_masked != 0` instead of `v != 0` is identical on -/// every funcref-slot value REACHABLE in eagerly-initialized tables. The -/// only differing case is `v == 1` (the explicit tagged-null slot value), -/// which can only appear via runtime `table.fill(null)` and is therefore -/// excluded by the `tables_mutated == false` half of the predicate. +/// Soundness: testing `v_masked != 0` instead of `v != 0` is identical for +/// every reachable funcref-slot value under +/// `is_eagerly_initialized_funcref_table` — they differ only at the +/// tagged-null value `1`, which the predicate excludes. fn try_fuse_band_brif

( ctx: &mut Lower>, ir_inst: ir::Inst, @@ -104,50 +81,15 @@ where return false; }; - // The brif's cond must be defined by a `band v -2`. The mask = -2 gate - // is load-bearing for two distinct reasons: - // - // 1. Soundness. The fused op tests the UNMASKED `src` for non-zero, not - // the masked `dst`. That equivalence holds iff `(v & mask != 0) <=> - // (v != 0)`. For mask = -2 the equivalence fails only at `v == 1` - // (tagged-null), which the eager-init predicate excludes at - // runtime. For other masks the equivalence fails on a much wider - // range of `v` and the fused branch direction would silently flip. - // - // 2. Regalloc safety + scope. Before this gate was added, the recogniser - // accepted any `i8::try_from(imm.bits())`-fitting mask, which - // matched user-code `band(v, 127)` / `band(v, 60)` etc. in real - // workloads (e.g. xmrsplayer). Absorbing those user-code bands via - // `sink_pure_inst` violated SSA assumptions when the band's result - // had multiple uses, crashing regalloc with `EntryLivein`. The - // mask = -2 gate confines the fusion to the call_indirect IR-rewrite - // site (where `func_environ::get_or_init_func_ref_table_elem` emits - // `band_imm(value, Imm64::from(-2))` — i.e. `Imm64(-2)` literally). - // - // Note that wasm's own `br_if` cond is always i32, and the wat parser - // encodes `(i32.const -2)` as `Imm64(0xFFFFFFFE)` (= 4294967294), - // NOT `Imm64(-2)`. So even though the surface check looks like it - // would match user wasm with `(i32.const -2)`, that branch of the - // imm-encoding decision tree is unreachable from wasm input. The - // only producer of `Imm64(-2)` reaching here is func_environ's own - // call to `Imm64::from(-2_i64)`. This is the de facto narrowing that - // makes the gate strong against wasm-side abuse. + // The brif's cond must be `band(v, -2)` with a bit-exact `Imm64(-2)`. 
+ // The bit-exact match is load-bearing: it confines the fusion to + // func_environ's `Imm64::from(-2_i64)` IR-rewrite site. The wat parser + // encodes `(i32.const -2)` as `Imm64(0xFFFFFFFE)`, so user wasm can't + // produce `Imm64(-2)` and slip into this code path. let band_inst = match dfg.value_def(cond).inst() { Some(inst) => inst, None => return false, }; - // Phase-1's mask check is intentionally bit-exact (`imm.bits() == -2`). - // The wider, width-aware check `is_minus_two_for` is reserved for - // phase 2 — phase 2's stronger pattern (continuation-block 2-load - // shape) makes it unreachable from wasm user code, so it can safely - // accept the i32-canonicalised `Imm64(0xFFFFFFFE)` encoding. Phase 1 - // has only the mask gate to keep it away from wasm user code, so - // the bit-exact gate is load-bearing — `Imm64(-2)` is only produced - // by func_environ's `Imm64::from(-2_i64)`, never by the wat parser - // for `(i32.const -2)`. The cost of this strictness: phase 1 does - // NOT fire on pulley32 funcref dispatch (the i32 band's imm is - // egraph-canonicalised to `Imm64(0xFFFFFFFE)` and bails). Phase 2 - // fires there instead, so the call_indirect tail is still fused. let (band_src, band_imm) = match dfg.insts[band_inst] { InstructionData::Binary { opcode: Opcode::Band, @@ -185,11 +127,8 @@ where let src = XReg::new(ctx.put_value_in_regs(band_src).only_reg().expect("scalar")) .expect("band source is an x-class register"); - // `put_value_in_regs(cond)` bumped value_lowered_uses[cond] above zero, - // which would normally force the band's standalone lowering. Sink the - // band as a pure absorption: the BandBrIf MInst we emit below produces - // exactly the same dst vreg, so any future use of `cond` (e.g. the - // brif's block-call argument) finds the right value already populated. + // Sink the band: the BandBrIf we emit below defines the same dst vreg, + // so downstream uses of `cond` still find the value populated. 
ctx.sink_pure_inst(band_inst); ctx.emit( @@ -207,20 +146,9 @@ where true } -/// Does this Imm64 encode `-2` when interpreted in `ty`'s width? -/// -/// Cranelift stores all immediates as `Imm64` regardless of the CLIF type, -/// and the egraph canonicalises i32 immediates to their unsigned u32-in-i64 -/// encoding (so `i32(-2)` is stored as `Imm64(0xFFFFFFFE)` = 4294967294, -/// NOT `Imm64(-2)` = 0xFFFFFFFFFFFFFFFE). To check "this is the -2 mask the -/// IR rewrite from `get_or_init_func_ref_table_elem` produces", we have to -/// width-aware-compare against -2 in the type the band operates on. -/// -/// This affects pulley32 specifically: the funcref pointer is i32, the -/// band is `band_imm(i32_value, Imm64::from(-2))`, and after egraph the -/// imm shows up as `Imm64(0xFFFFFFFE)`. Without width-aware comparison, -/// phase 1 / phase 2 fusion would silently fail to fire on -/// arm64_32-apple-watchos. +/// True iff `imm` encodes `-2` in `ty`'s width. The egraph canonicalises +/// `i32(-2)` as `Imm64(0xFFFFFFFE)`, not `Imm64(-2)`, so a width-aware +/// compare is needed for pulley32. fn is_minus_two_for(imm: ir::immediates::Imm64, ty: ir::Type) -> bool { match ty { ir::types::I32 => (imm.bits() as u32) == (-2_i32 as u32), @@ -229,38 +157,25 @@ fn is_minus_two_for(imm: ir::immediates::Imm64, ty: ir::Type) -> bool { } } -/// VMFuncRef field offsets, parameterised on the Pulley pointer width. -/// -/// Mirrors `crates/environ/src/vmoffsets.rs`'s `vm_func_ref_wasm_call` (= -/// 1 * size) and `vm_func_ref_vmctx` (= 3 * size). Both fit in i8 for both -/// pointer widths (8 + 24 on 64-bit, 4 + 12 on 32-bit), which is the -/// constraint imposed by the pulley `xfuncref_dispatch_*` ops (i8 -/// sign-extended offsets). +/// `(wasm_call, vmctx)` byte offsets in `VMFuncRef`. Both fit in i8 (8/24 +/// on 64-bit, 4/12 on 32-bit), matching the `xfuncref_dispatch_*` ops' +/// sign-extended-i8 offset operand. 
fn vm_func_ref_offsets(pointer_bytes: u8) -> (i8, i8) { let size = pointer_bytes as i8; (size, size.checked_mul(3).expect("VMFuncRef offsets fit i8")) } -/// Recognise the canonical funcref-dispatch shape produced by -/// `func_environ::get_or_init_func_ref_table_elem` followed by -/// `load_code_and_vmctx` under the eager-init predicate + statically- -/// elided sig check: +/// Recognise the canonical funcref-dispatch shape: /// /// ```text /// predecessor: /// value = load .ptr (table_entry + 0) /// value_masked = band value, -2 /// brif value_masked, continuation([value_masked]), null_block([]) -/// /// continuation(funcref_ptr): /// code = load .ptr (funcref_ptr + offset_code) /// vmctx = load .ptr (funcref_ptr + offset_vmctx) -/// ... <- other uses of code, vmctx /// ``` -/// -/// If found, returns the brif inst, the band inst, the two load insts (in -/// continuation), the funcref source value `v` (band's first arg), the -/// CLIF result values `code` and `vmctx`, and the offsets. Otherwise None. fn match_funcref_dispatch_pattern( f: &ir::Function, brif_inst: ir::Inst, @@ -334,9 +249,8 @@ fn match_funcref_dispatch_pattern( } let funcref_ptr = cont_params[0]; - // First two instructions in the continuation block must be the two - // canonical loads. We tolerate the block-param ordering: load1 is - // at offset_code, load2 at offset_vmctx (in either positional order). + // The first two instructions in the continuation block must be the + // two field loads in either order. let (offset_code_expected, offset_vmctx_expected) = vm_func_ref_offsets(pointer_bytes); let mut iter = f.layout.block_insts(continuation); let load1 = iter.next()?; @@ -433,18 +347,8 @@ fn pre_lower_pulley

(ctx: &mut Lower>, pointer_bytes: u8) where P: PulleyTargetKind, { - // Collect candidates first so we don't hold &ctx.f while calling - // sink_pure_inst (which takes &mut ctx). - // - // We only absorb the two field loads, NOT the band. The band stays - // as a separate Pulley `xband_s8` op because `cond` (the band's - // result) is the SOURCE vreg consumed by FuncrefDispatch — that - // already-masked value gives us the branch test (`src != 0`) with - // the same predictor-anchor semantics as the original brif. If we - // also absorbed the band, FuncrefDispatch would have nothing - // defining `cond`'s vreg, and the predecessor brif's block-call-arg - // copy (which passes `cond` to the continuation block param) would - // see an undefined vreg. + // Collect candidates first so `&ctx.f` isn't held across the + // `sink_pure_inst` calls below. let mut to_sink: smallvec::SmallVec<[(ir::Inst, ir::Inst); 8]> = smallvec::SmallVec::new(); { let f = ctx.f; @@ -491,18 +395,9 @@ where return false; }; - // Phase 3: try to ALSO absorb the band into a BandFuncrefDispatch. - // The band defines cond; if absorbed, its standalone xband_s8 - // dispatch goes away (one less match_loop dispatch per call_indirect - // site vs phase 2). The fused MachInst defs three vregs: - // `dst_masked` (= cond's vreg) so the brif's block-call-arg copy - // still finds the masked value, plus `dst_code` and `dst_vmctx`. - // - // We re-derive the band inst and unmasked source `v` rather than - // threading them through `FuncrefDispatchPattern` — the match - // succeeded if we got here, and we already know cond's def is - // `band(v, -2)`. Width-aware `is_minus_two_for` matches the same - // way as `match_funcref_dispatch_pattern`. + // Try phase-3 (absorb the band into BandFuncrefDispatch). The fused + // op defines `dst_masked` (= cond's vreg) so the brif's block-call + // copy still has a producer, plus `dst_code` and `dst_vmctx`. 
let dfg = ctx.dfg(); let band_inst = dfg.value_def(cond).inst(); let v = band_inst.and_then(|bi| match dfg.insts[bi] { @@ -522,12 +417,8 @@ where _ => None, }); - // Destination vregs: the loads' result values' canonical vregs. - // pre_lower marked the loads as absorbed_pure, so their standalone - // lowering (in the continuation block, processed earlier in reverse - // iteration) was skipped — value_regs[code_val] and value_regs[vmctx_val] - // are un-aliased, and our def of them is the sole def each one has - // across the function. + // The loads' result vregs become the fused op's defs. Their original + // lowering was skipped via `sink_pure_inst` in `pre_lower_pulley`. let dst_code_reg = ctx .put_value_in_regs(pat.code_val) .only_reg() @@ -542,11 +433,8 @@ where .expect("funcref vmctx dst is an x-class register"); if let (Some(band_inst), Some(v)) = (band_inst, v) { - // Phase 3: source = unmasked `v`; emit BandFuncrefDispatch which - // does the masking internally and writes the masked value to - // dst_masked (= cond's vreg). Sink the band — its standalone - // lowering is skipped, removing one Pulley dispatch from the - // call_indirect tail. + // Phase 3 fires: source is the unmasked `v`; the fused op masks + // internally and writes `dst_masked = cond`. let dst_masked_regs = ctx.put_value_in_regs(cond); let dst_masked_reg = dst_masked_regs.only_reg().expect("scalar cond"); let dst_masked = WritableXReg::try_from(Writable::from_reg(dst_masked_reg)) @@ -574,8 +462,8 @@ where return true; } - // Phase 2 fallback: band stays standalone, FuncrefDispatch consumes - // its masked result as src. + // Phase-2 fallback: band stays as a standalone op; FuncrefDispatch + // consumes its masked result. 
let src_reg = ctx .put_value_in_regs(cond) .only_reg() diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index 59366715c5ef..744d13ad89f1 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -1074,18 +1074,10 @@ impl<'module_environment> FuncEnvironment<'module_environment> { let result_param = builder.append_block_param(continuation_block, pointer_type); builder.set_cold_block(null_block); - // When the table is eagerly-initialized (immutable + precomputed + - // fully-covered + no-null), every funcref slot at runtime is either - // `0` (uninitialized, never observable here) or `addr | 1` (a real - // tagged pointer). The tagged-null value `1` — produced only by - // `table.fill(null)` on a tagged table — is excluded by the - // immutability half of the predicate. Under those conditions, - // `value != 0` and `value_masked != 0` agree on every reachable - // slot value, so we can test the masked result and unlock the - // Pulley backend's `band + brif` fusion at lowering time (see - // `MInst::BandBrIf` and `pulley_shared::lower::try_fuse_band_brif`). - // The fusion saves one match_loop dispatch per call_indirect - // site — the main lever once c1-7 pinned the predictor anchor. + // Under `is_eagerly_initialized_funcref_table`, `value != 0` and + // `value_masked != 0` agree on every reachable slot, so we can + // test the masked result. The Pulley backend then fuses the + // `band + brif` pair. let brif_cond = if self .module .is_eagerly_initialized_funcref_table(table_index) diff --git a/pulley/src/interp.rs b/pulley/src/interp.rs index a3cbd4449b09..07c73584afd9 100644 --- a/pulley/src/interp.rs +++ b/pulley/src/interp.rs @@ -1426,8 +1426,7 @@ impl OpVisitor for Interpreter<'_> { } fn call_indirect1(&mut self, dst: XReg, arg1: XReg) -> ControlFlow { - // Phase-4 fusion: combines `xmov x0, arg1` with `call_indirect dst`. 
- // Read arg1 BEFORE writing x0 so this is safe even when `arg1 == x0`. + // Read arg1 before writing x0 so this is safe when `arg1 == x0`. let arg1_val = self.state[arg1]; let target = self.state[dst].get_ptr(); let return_addr = self.pc.as_ptr(); @@ -2447,25 +2446,26 @@ impl OpVisitor for Interpreter<'_> { offset_vmctx: i8, offset: PcRelOffset, ) -> ControlFlow { - // `src` is the ALREADY-MASKED funcref (`band v, -2` upstream — the - // band stays as a separate Pulley op; this fusion absorbs only the - // brif + the two field loads). The branch fires when src != 0, - // matching the brif's original semantics in the eager-init - // predicate's IR rewrite (`brif value_masked, taken, null`). Under - // the predicate `src` is never zero at runtime, but the handler - // still has to match the brif's null fall-through as - // defence-in-depth. + // `src` is the already-masked funcref. The null side traps: the + // fusion absorbed the continuation-block loads, so the lazy-init + // slow path's rejoin would see uninitialized dst_code/dst_vmctx. + // Gated on `is_eagerly_initialized_funcref_table`, so trapping + // here is unreachable in correct code. let s = self.state[src].get_u64(); if s == 0 { - ControlFlow::Continue(()) + self.done_trap::() } else { - // SAFETY: under the eager-init predicate, the wasmtime runtime - // enforces that the funcref slot contains a real VMFuncRef - // pointer, so the field loads are valid memory accesses. + // SAFETY: predicate guarantees `src` points to a real VMFuncRef. 
let base = s as *const u8; unsafe { - let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); - let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); self.state[dst_code].set_i64(code); self.state[dst_vmctx].set_i64(vmctx); } @@ -2482,14 +2482,23 @@ impl OpVisitor for Interpreter<'_> { offset_vmctx: i8, offset: PcRelOffset, ) -> ControlFlow { + // Inverted form: fast path falls through; null side traps. + // `offset` is unused (kept for encoding shape parity). + let _ = offset; let s = self.state[src].get_u64(); if s == 0 { - self.pc_rel_jump::(offset) + self.done_trap::() } else { let base = s as *const u8; unsafe { - let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); - let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); self.state[dst_code].set_i64(code); self.state[dst_vmctx].set_i64(vmctx); } @@ -2508,12 +2517,18 @@ impl OpVisitor for Interpreter<'_> { ) -> ControlFlow { let s = self.state[src].get_u32(); if s == 0 { - ControlFlow::Continue(()) + self.done_trap::() } else { let base = s as usize as *const u8; unsafe { - let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); - let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); self.state[dst_code].set_i32(code); self.state[dst_vmctx].set_i32(vmctx); } @@ -2530,14 +2545,21 @@ impl OpVisitor for Interpreter<'_> { offset_vmctx: i8, offset: 
PcRelOffset, ) -> ControlFlow { + let _ = offset; let s = self.state[src].get_u32(); if s == 0 { - self.pc_rel_jump::(offset) + self.done_trap::() } else { let base = s as usize as *const u8; unsafe { - let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); - let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); self.state[dst_code].set_i32(code); self.state[dst_vmctx].set_i32(vmctx); } @@ -2555,27 +2577,31 @@ impl OpVisitor for Interpreter<'_> { offset_vmctx: i8, offset: PcRelOffset, ) -> ControlFlow { - // Phase-3 fusion: combines the standalone xband64_s8 (mask init - // bit) with the phase-2 brif+xload+xload dispatch into one op. - // The masked value is written unconditionally to dst_masked so - // the brif's block-call-arg machinery still finds it; the two - // loads only fire on the non-null side. Soundness under the - // eager-init predicate: `src` is provably non-zero at runtime, - // so the loads are valid memory accesses. + // Combines the standalone xband64_s8 with the xfuncref dispatch. + // `src` is unmasked. `dst_masked = src & -2` is written + // unconditionally so the brif's block-call-arg copy still finds a + // producer; the loads + branch fire on `src != 0`. Null traps + // (same rationale as `xfuncref_dispatch_x64`). 
let s = self.state[src].get_u64(); let masked = s & !1u64; self.state[dst_masked].set_u64(masked); if s != 0 { let base = masked as *const u8; unsafe { - let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); - let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); self.state[dst_code].set_i64(code); self.state[dst_vmctx].set_i64(vmctx); } self.pc_rel_jump::(offset) } else { - ControlFlow::Continue(()) + self.done_trap::() } } @@ -2589,16 +2615,24 @@ impl OpVisitor for Interpreter<'_> { offset_vmctx: i8, offset: PcRelOffset, ) -> ControlFlow { + // Inverted form; `offset` is vestigial after the trap-on-null fix. + let _ = offset; let s = self.state[src].get_u64(); let masked = s & !1u64; self.state[dst_masked].set_u64(masked); if s == 0 { - self.pc_rel_jump::(offset) + self.done_trap::() } else { let base = masked as *const u8; unsafe { - let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); - let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); self.state[dst_code].set_i64(code); self.state[dst_vmctx].set_i64(vmctx); } @@ -2622,14 +2656,20 @@ impl OpVisitor for Interpreter<'_> { if s != 0 { let base = masked as usize as *const u8; unsafe { - let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); - let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); self.state[dst_code].set_i32(code); 
self.state[dst_vmctx].set_i32(vmctx); } self.pc_rel_jump::(offset) } else { - ControlFlow::Continue(()) + self.done_trap::() } } @@ -2643,16 +2683,23 @@ impl OpVisitor for Interpreter<'_> { offset_vmctx: i8, offset: PcRelOffset, ) -> ControlFlow { + let _ = offset; let s = self.state[src].get_u32(); let masked = s & !1u32; self.state[dst_masked].set_u32(masked); if s == 0 { - self.pc_rel_jump::(offset) + self.done_trap::() } else { let base = masked as usize as *const u8; unsafe { - let code = base.byte_offset(offset_code as isize).cast::().read_unaligned(); - let vmctx = base.byte_offset(offset_vmctx as isize).cast::().read_unaligned(); + let code = base + .byte_offset(offset_code as isize) + .cast::() + .read_unaligned(); + let vmctx = base + .byte_offset(offset_vmctx as isize) + .cast::() + .read_unaligned(); self.state[dst_code].set_i32(code); self.state[dst_vmctx].set_i32(vmctx); } diff --git a/pulley/src/lib.rs b/pulley/src/lib.rs index 1e7fc851bc39..de2210d7e4b0 100644 --- a/pulley/src/lib.rs +++ b/pulley/src/lib.rs @@ -115,10 +115,7 @@ macro_rules! for_each_op { /// Transfer control to the PC in `reg` and set `lr` to the PC just /// after this instruction. call_indirect = CallIndirect { reg: XReg }; - /// Like `call_indirect`, but also `x0 = arg1`. Saves one Pulley - /// dispatch vs `xmov x0, arg1; call_indirect reg` for the common - /// call_indirect pattern where one ABI register (usually `vmctx`) - /// is set up immediately before the indirect call. + /// Like `call_indirect`, but also `x0 = arg1`. call_indirect1 = CallIndirect1 { reg: XReg, arg1: XReg }; /// Like `call_indirect`, but also `x0, x1 = arg1, arg2`. call_indirect2 = CallIndirect2 { reg: XReg, arg1: XReg, arg2: XReg }; @@ -574,96 +571,53 @@ macro_rules! for_each_op { /// Same as `xband64` but `src2` is a sign-extended 32-bit immediate. 
xband64_s32 = Xband64S32 { dst: XReg, src1: XReg, src2: i32 }; - /// `low32(dst) = low32(src) & sign_extend(mask)`, then conditionally - /// branch by `offset` if `low32(src)` is non-zero. - /// - /// Fused form of `xband32_s8 dst, src, mask` + `br_if32 src, offset`, - /// emitted by the Cranelift Pulley backend at call_indirect lazy-init - /// brif sites where the same loaded funcref value feeds both the - /// init-bit mask (`band v, -2`) and the null-check branch - /// (`brif v`). Shaves one match_loop dispatch per call_indirect - /// site. See pulley/PR for the full design discussion. + /// `low32(dst) = low32(src) & sign_extend(mask)`, then branch by + /// `offset` if `low32(src)` is non-zero. Fused `xband32_s8 + + /// br_if32` for the call_indirect lazy-init brif site. xband32_s8_br_if_x32 = Xband32S8BrIfX32 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; /// Inverted form of `xband32_s8_br_if_x32`: branch if `low32(src)` - /// is zero. The mask + dst write happen unconditionally. Used by - /// MachBuffer's branch-direction-flip fallthrough optimization. + /// is zero. Mask + dst write are unconditional. xband32_s8_br_if_not_x32 = Xband32S8BrIfNotX32 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; - /// 64-bit form: `dst = src & sign_extend(mask)`, then conditionally - /// branch by `offset` if `src` is non-zero. Same fusion as - /// `xband32_s8_br_if_x32` but for 64-bit pointer-width Pulley. + /// 64-bit form of `xband32_s8_br_if_x32`. xband64_s8_br_if_x64 = Xband64S8BrIfX64 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; /// Inverted form of `xband64_s8_br_if_x64`: branch if `src` is zero. 
xband64_s8_br_if_not_x64 = Xband64S8BrIfNotX64 { dst: XReg, src: XReg, mask: i8, offset: PcRelOffset }; - /// Funcref-dispatch fusion (64-bit pointer width): if `src` is - /// non-zero, load the `wasm_call` code pointer from - /// `src + offset_code` into `dst_code`, load the callee vmctx - /// pointer from `src + offset_vmctx` into `dst_vmctx`, then - /// conditionally branch by `offset`. `src` is the - /// already-masked funcref pointer (`band v, -2` upstream). - /// - /// Forward form: loads-and-branch fire on the non-null side - /// (`src != 0`); the null side falls through. Used at the - /// call_indirect lazy-init brif site under - /// `is_eagerly_initialized_funcref_table` AND when the signature - /// check is statically elided — under those conditions the only - /// uses of the masked funcref pointer are the two `VMFuncRef` - /// field loads, and the brif's null branch is provably - /// unreachable at runtime. The handler's runtime null check is - /// defence-in-depth (matching the original brif's role); it MUST - /// fall through on null so the slow path's lazy-init builtin - /// stays callable in the (provably-unreachable) error case. + /// Funcref-dispatch fusion (64-bit). If `src != 0`, load + /// `dst_code = [src + offset_code]`, `dst_vmctx = [src + + /// offset_vmctx]`, and branch by `offset`. `src` is the + /// already-masked funcref pointer. /// - /// Fused form of `br_if + xload64 + xload64` (the preceding - /// `xband64_s8` stays as a separate op since `src` is consumed - /// here as the band's result). Saves 2 match_loop dispatches - /// per call_indirect site vs the unfused sequence. At the same - /// call site, phase-1's `xband64_s8_br_if_*` ops are not - /// emitted (the recogniser prefers this larger fusion when its - /// pattern matches), so the per-new-opcode predictor cost - /// stays at one new op family rather than two. + /// The null side traps. 
The fusion absorbs the two field loads + /// from the brif's continuation block; if execution reached the + /// original lazy-init slow path, it would rejoin that + /// continuation with `dst_code`/`dst_vmctx` uninitialized, so + /// the null path can no longer fall through safely. Gated on + /// `is_eagerly_initialized_funcref_table`, which guarantees the + /// null path is unreachable at runtime. xfuncref_dispatch_x64 = XfuncrefDispatchX64 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; - /// Inverted form of `xfuncref_dispatch_x64`: the null side - /// branches and the loads-and-fall-through fire on `src != 0`. - /// Used by MachBuffer's branch-direction-flip fallthrough - /// optimization when the fast path is the natural fall-through. + /// Inverted form of `xfuncref_dispatch_x64`: fast path falls + /// through; null path traps. `offset` is vestigial (kept for + /// shape parity with the forward variant). xfuncref_dispatch_not_x64 = XfuncrefDispatchNotX64 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; - /// 32-bit pointer-width form of `xfuncref_dispatch_x64`. Same - /// semantics; `src`, `dst_code`, `dst_vmctx` are i32 loaded into - /// the low halves of their XReg slots. Used on `pulley32` / - /// arm64_32-apple-watchos. + /// 32-bit pointer-width form of `xfuncref_dispatch_x64`. xfuncref_dispatch_x32 = XfuncrefDispatchX32 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; /// Inverted form of `xfuncref_dispatch_x32`. xfuncref_dispatch_not_x32 = XfuncrefDispatchNotX32 { dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; - /// Phase-3 fusion: combine `xband64_s8 dst_masked, src, -2` with - /// `xfuncref_dispatch_*_x64 dst_code, dst_vmctx, dst_masked, - /// offset_code, offset_vmctx, offset` into a single Pulley - /// dispatch. 
`src` is the UNMASKED funcref pointer; the - /// init-bit strip happens internally. - /// - /// 64-bit forward form: `dst_masked = src & -2` unconditionally; - /// if `src != 0`, load `wasm_call` from `dst_masked + offset_code` - /// into `dst_code`, load callee `vmctx` from `dst_masked + - /// offset_vmctx` into `dst_vmctx`, and branch by `offset`. The - /// null side falls through to the slow path. + /// Combines `xband64_s8 dst_masked, src, -2` with + /// `xfuncref_dispatch_*_x64` into one op. `src` is the unmasked + /// funcref; the init-bit strip is internal. /// - /// Soundness: same as `xfuncref_dispatch_*` — gated on - /// `is_eagerly_initialized_funcref_table` so `src` is provably - /// non-zero at runtime. Testing the unmasked `src` for null vs - /// the masked `dst_masked` differs only at `v == 1` - /// (tagged-null) which the predicate excludes. Saves one more - /// match_loop dispatch per call_indirect site vs phase 2 (the - /// preceding standalone `xband64_s8` is absorbed). + /// `dst_masked = src & -2` unconditionally. If `src != 0`, do + /// the two loads and branch by `offset`. Null side traps (same + /// rationale as `xfuncref_dispatch_*`). xband_funcref_dispatch_x64 = XbandFuncrefDispatchX64 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; - /// Inverted form: branch when `src == 0`, loads-and-fall-through - /// on `src != 0`. Used by MachBuffer's branch-direction flip - /// when the fast path is the natural fall-through. The - /// `dst_masked = src & -2` write is unconditional in both - /// forms. + /// Inverted form of `xband_funcref_dispatch_x64`: fast path + /// falls through; null path traps. `dst_masked` is still + /// written unconditionally. `offset` is vestigial. 
xband_funcref_dispatch_not_x64 = XbandFuncrefDispatchNotX64 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; /// 32-bit pointer-width form of `xband_funcref_dispatch_x64`. - /// Used on `pulley32` / arm64_32-apple-watchos. xband_funcref_dispatch_x32 = XbandFuncrefDispatchX32 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; /// Inverted form of `xband_funcref_dispatch_x32`. xband_funcref_dispatch_not_x32 = XbandFuncrefDispatchNotX32 { dst_masked: XReg, dst_code: XReg, dst_vmctx: XReg, src: XReg, offset_code: i8, offset_vmctx: i8, offset: PcRelOffset }; diff --git a/tests/all/pulley.rs b/tests/all/pulley.rs index dedf59c68070..7af28655567f 100644 --- a/tests/all/pulley.rs +++ b/tests/all/pulley.rs @@ -516,23 +516,13 @@ fn decode_unaligned() -> Result<()> { Ok(()) } -// --- Pulley opcode-fusion (band+brif and funcref-dispatch) integration --- -// -// These tests pin runtime semantics for the Pulley call_indirect lazy-init -// fusion stack (`tests/disas/pulley-fusion-*.wat` pins the static disasm). -// They exercise edges identified by upstream-engine bug-classes the -// fusion shape echoes (V8 issue 5913 cross-module table sharing; WAMR -// #4041 call_indirect index > 0; WasmEdge #4757 null-from-typed-table; -// ChakraCore #5915 multi-call-site IC poisoning; Luau release/717 -// store-cache invalidation on mutation). Each test runs identically -// against Pulley AND wasmtime's native Cranelift backend, asserting the -// results agree — the fusion is only present on the Pulley side, so any -// divergence indicates a phase-1 or phase-2 lowering bug. - -/// Pulley config that's safe for tests that exercise traps -/// (call_indirect to null, OOB indices, etc.) — `signals_based_traps(false)` -/// is required because Pulley's interpreter cannot catch signals; it must -/// see explicit trapz / bounds-check emissions. 
+// Runtime-semantics tests for the call_indirect fusion stack +// (`tests/disas/pulley-fusion-*.wat` covers the static disasm side). +// Each test runs the same wasm under Pulley and native Cranelift and +// asserts the results agree. + +/// Pulley config for tests that exercise traps. The interpreter can't +/// catch signals, so trap emission must be explicit. fn pulley_trap_safe_config() -> Config { let mut config = pulley_config(); config.signals_based_traps(false); @@ -572,12 +562,8 @@ where Ok(pulley) } -/// Phase 2 firing returns the right callee result for every in-bounds -/// table index, AND traps at the right index for OOB. -/// -/// Reference: WAMR #4041 ("call_indirect index > 0 in AOT silently -/// broken — only `table[0]` callable"); wasm3 #547 ("op_CallIndirect -/// SEGV — missing bounds check"). +/// Fusion returns the right callee for every in-bounds index and traps +/// on OOB. #[test] fn fusion_call_indirect_every_index() -> Result<()> { let wat = r#" @@ -595,8 +581,8 @@ fn fusion_call_indirect_every_index() -> Result<()> { let got: i32 = pulley_and_native_agree(wat, "call", idx)?; assert_eq!(got, expected, "idx {idx}"); } - // Index 3 is OOB; check Pulley only (native trap-via-signal - // interacts badly with `cargo test`'s debug-mode signal handlers). + // Pulley only — native signal-based traps interact badly with + // `cargo test`'s debug-mode signal handlers. let bytes = wat::parse_str(wat)?; let engine = Engine::new(&pulley_trap_safe_config())?; let module = Module::new(&engine, &bytes)?; @@ -609,13 +595,8 @@ fn fusion_call_indirect_every_index() -> Result<()> { Ok(()) } -/// Two call_indirect sites in the same function. Phase 2 must fire -/// per-site (each fused MachInst defs its own dst vregs; the pre-pass -/// `to_sink` list doesn't dedup or drop one). -/// -/// Reference: ChakraCore #5915 ("setPrototypeOf does not invalidate -/// cached instanceof IC inside currently-executing frame") — per-site -/// IC state must be independent. 
+/// Two call_indirect sites in the same function; each must fuse +/// independently. #[test] fn fusion_call_indirect_multi_site() -> Result<()> { let wat = r#" @@ -637,10 +618,7 @@ fn fusion_call_indirect_multi_site() -> Result<()> { Ok(()) } -/// `return_call_indirect` (tail call). Phase 2 fires here too — see -/// `tests/disas/pulley-fusion-fires-return-call-indirect.wat`. This -/// test pins the runtime correctness: the tail call uses the -/// fused-op-loaded code+vmctx and returns the right value. +/// `return_call_indirect` correctness with fusion applied. #[test] fn fusion_return_call_indirect() -> Result<()> { let wat = r#" @@ -661,15 +639,8 @@ fn fusion_return_call_indirect() -> Result<()> { Ok(()) } -/// Host mutates the table via `Table::set` to `ref.null func`. Both -/// Pulley and native must trap `IndirectCallToNull` at the now-null -/// slot. The phase-2 fused op's runtime null check has to catch the -/// host-injected null. -/// -/// Reference: GHSA-q49f-xg75-m9xw (Winch `table.fill` host panic); -/// V8 CVE-2024-2887 ("JS-to-wasm boundary funcref injection bypasses -/// immutability"). The predicate is compile-time; host mutation at -/// runtime is OK only because the fused op still does the null check. +/// Host mutates a slot to `ref.null func`; call_indirect must trap +/// `IndirectCallToNull`. #[test] fn fusion_call_indirect_with_host_null_set() -> Result<()> { let wat = r#" @@ -698,20 +669,12 @@ fn fusion_call_indirect_with_host_null_set() -> Result<()> { assert_eq!(call.call(&mut store, 0)?, 100); let err = call.call(&mut store, 1).unwrap_err(); let trap = err.downcast_ref::().expect("Trap"); - assert_eq!( - *trap, - Trap::IndirectCallToNull, - "phase-2 fused op missed runtime null check" - ); + assert_eq!(*trap, Trap::IndirectCallToNull); Ok(()) } -/// Host `Table::set` to a different (non-null) funcref between calls. -/// The fused op must re-load `wasm_call` and `vmctx` on every dispatch -/// — no caching of code/vmctx across calls. 
-/// -/// Reference: Luau release/717 ("writes to userdata did not invalidate -/// the store cache" — fused-op cached state survived a mutation). +/// Host `Table::set` swaps to a different funcref between calls; the +/// second call must observe the new target. #[test] fn fusion_call_indirect_with_host_swap() -> Result<()> { let wat = r#" @@ -729,7 +692,11 @@ fn fusion_call_indirect_with_host_swap() -> Result<()> { let bytes = wat::parse_str(wat)?; for use_pulley in [true, false] { - let cfg = if use_pulley { pulley_trap_safe_config() } else { Config::new() }; + let cfg = if use_pulley { + pulley_trap_safe_config() + } else { + Config::new() + }; let engine = Engine::new(&cfg)?; let module = Module::new(&engine, &bytes)?; let mut store = Store::new(&engine, ()); @@ -744,24 +711,15 @@ fn fusion_call_indirect_with_host_swap() -> Result<()> { let table = inst.get_table(&mut store, "t").expect("table export"); table.set(&mut store, 0, wasmtime::Ref::Func(Some(f1_ref)))?; - assert_eq!( - call.call(&mut store, 0)?, - 200, - "use_pulley={use_pulley}: fused op cached stale code/vmctx?" - ); + assert_eq!(call.call(&mut store, 0)?, 200, "use_pulley={use_pulley}"); } Ok(()) } -/// Module B imports module A's table and calls into it via -/// call_indirect. Module B's fusion (if any) must use module A's -/// actual VMFuncRef layout, not B's local assumptions about the -/// table. -/// -/// Reference: V8 issue 5913 ("call_indirect signature mismatch with -/// table-sharing"); the predicate scope is module-local; an imported -/// table breaks the "tables_mutated == false" assumption from the -/// importer's perspective. +/// Module B imports module A's table and calls into it. Tables are +/// imported, so the importer's `tables_mutated` is `true` and no +/// fusion fires on B's side; the call must still produce the right +/// result. 
#[test] fn fusion_call_indirect_imported_table() -> Result<()> { let wat_a = r#" @@ -782,7 +740,11 @@ fn fusion_call_indirect_imported_table() -> Result<()> { let bytes_b = wat::parse_str(wat_b)?; for use_pulley in [true, false] { - let cfg = if use_pulley { pulley_trap_safe_config() } else { Config::new() }; + let cfg = if use_pulley { + pulley_trap_safe_config() + } else { + Config::new() + }; let engine = Engine::new(&cfg)?; let module_a = Module::new(&engine, &bytes_a)?; let module_b = Module::new(&engine, &bytes_b)?; @@ -810,14 +772,7 @@ fn fusion_call_indirect_imported_table() -> Result<()> { /// op's runtime null check must trap cleanly with the right trap kind, /// not crash on the field deref. /// -/// For an uninitialised slot the trap kind is `UninitializedElement` -/// (the slot's contents are the lazy-init sentinel `0`, which is -/// distinct from an explicit `Ref::Func(None)` — see -/// `fusion_call_indirect_with_host_null_set` for the latter). -/// -/// Reference: WasmEdge #4757 ("GC null ref from concrete-typed table -/// SEGV — null check sequenced after deref"). Our handler does the -/// null check BEFORE the field deref, so this should trap cleanly. +/// Call into an uninitialised table slot must trap. #[test] fn fusion_call_indirect_null_slot() -> Result<()> { let wat = r#" @@ -828,13 +783,7 @@ fn fusion_call_indirect_null_slot() -> Result<()> { call_indirect (result i32))) "#; let bytes = wat::parse_str(wat)?; - // Pulley only: native-backend trap-via-signal interacts with cargo - // test's signal-handler setup in debug mode and shows up as a - // SIGSEGV instead of a Trap. Running the same code via `cargo run - // --release` or directly outside the test harness traps cleanly, - // so this is a test-harness limitation rather than a real native - // backend bug. Pulley uses explicit trapz (no signals) so it works - // in both modes. + // Pulley only — see note on `fusion_call_indirect_every_index`. 
let engine = Engine::new(&pulley_trap_safe_config())?; let module = Module::new(&engine, &bytes)?; let mut store = Store::new(&engine, ()); diff --git a/tests/disas/pulley-call-indirect-band-brif-fusion.wat b/tests/disas/pulley-call-indirect-band-brif-fusion.wat index 96137e0425b9..5cd6509e88db 100644 --- a/tests/disas/pulley-call-indirect-band-brif-fusion.wat +++ b/tests/disas/pulley-call-indirect-band-brif-fusion.wat @@ -53,30 +53,26 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame_save 32, x16, x17, x25 +;; push_frame_save 32, x16, x17, x24 ;; xmov x3, x0 -;; br_if_xugteq32_u8 x2, 3, 0x55 // target = 0x6e -;; 20: xload64le_o32 x0, x0, 48 +;; br_if_xugteq32_u8 x2, 3, 0x4b // target = 0x64 +;; 20: xmov x1, x3 +;; xload64le_o32 x0, x1, 48 ;; zext32 x15, x2 ;; xshl64_u6 x1, x15, 3 ;; xadd64 x0, x0, x1 ;; xload64le_o32 x0, x0, 0 -;; xband_funcref_dispatch_not_x64 x0, x16, x17, x0, 8, 24, 0x1c // target = 0x53 -;; xmov x2, x0 -;; xmov x1, x3 -;; xmov x0, x17 -;; call_indirect x16 -;; pop_frame_restore 32, x16, x17, x25 +;; xband_funcref_dispatch_not_x64 x0, x17, x16, x0, 8, 24, 0x18 // target = 0x52 +;; xmov x24, x3 +;; call_indirect2 x17, x16, x24 +;; pop_frame_restore 32, x16, x17, x24 ;; ret -;; 53: xzero x0 -;; 55: xmov x25, x3 -;; 58: call3 x25, x0, x15, 0x270 // target = 0x2c8 -;; 60: xmov x2, x0 -;; 63: xmov x0, x17 -;; 66: xmov x1, x25 -;; 69: jump -0x1e // target = 0x4b -;; 6e: trap -;; ╰─╼ trap: TableOutOfBounds +;; 52: xzero x0 +;; 54: xmov x24, x3 +;; 57: call3 x24, x0, x15, 0x267 // target = 0x2be +;; 5f: jump -0x17 // target = 0x48 +;; 64: trap +;; ╰─╼ trap: Normal(TableOutOfBounds) ;; ;; wasm[0]::array_to_wasm_trampoline[0]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -86,19 +82,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0xc3 +;; xpcadd x15, 0x2a // target = 0xb9 ;; xstore64le_o32 x13, 80, x15 -;; call -0xa8 // 
target = 0x0 +;; call -0x9e // target = 0x0 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc3 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xb9 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; c3: xzero x0 -;; c5: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; ca: ret +;; b9: xzero x0 +;; bb: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; c0: ret ;; ;; wasm[0]::array_to_wasm_trampoline[1]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -108,19 +104,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0x11d +;; xpcadd x15, 0x2a // target = 0x113 ;; xstore64le_o32 x13, 80, x15 -;; call -0xfd // target = 0x5 +;; call -0xf3 // target = 0x5 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x11d +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x113 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 11d: xzero x0 -;; 11f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 124: ret +;; 113: xzero x0 +;; 115: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 11a: ret ;; ;; wasm[0]::array_to_wasm_trampoline[2]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -130,19 +126,19 @@ ;; 
xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0x177 +;; xpcadd x15, 0x2a // target = 0x16d ;; xstore64le_o32 x13, 80, x15 -;; call -0x151 // target = 0xb +;; call -0x147 // target = 0xb ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x177 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x16d ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 177: xzero x0 -;; 179: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 17e: ret +;; 16d: xzero x0 +;; 16f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 174: ret ;; ;; wasm[0]::array_to_wasm_trampoline[3]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -153,19 +149,19 @@ ;; xstore64le_o32 x15, 72, x2 ;; xmov x2, sp ;; xstore64le_o32 x15, 64, x2 -;; xpcadd x2, 0x2d // target = 0x1db +;; xpcadd x2, 0x2d // target = 0x1d1 ;; xstore64le_o32 x15, 80, x2 -;; call3 x0, x1, x14, -0x1ac // target = 0x11 +;; call3 x0, x1, x14, -0x1a2 // target = 0x11 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1db +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1d1 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 1db: xzero x0 -;; 1dd: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 1e2: ret +;; 1d1: xzero x0 +;; 1d3: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, 
x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1d8: ret ;; ;; signatures[0]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -184,15 +180,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x15, x0 -;; br_if_not32 x15, 0x13 // target = 0x23a -;; 22d: xload32le_o32 x0, x16, 0 +;; br_if_not32 x15, 0x13 // target = 0x230 +;; 223: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 23a: xmov x1, x17 -;; 23d: xload64le_o32 x0, x1, 16 -;; 244: xload64le_o32 x0, x0, 408 -;; 24b: call_indirect_host 52 -;; 24f: trap +;; 230: xmov x1, x17 +;; 233: xload64le_o32 x0, x1, 16 +;; 23a: xload64le_o32 x0, x0, 328 +;; 241: call_indirect_host 42 +;; 245: trap ;; ;; signatures[1]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -212,15 +208,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x0, x0 -;; br_if_not32 x0, 0x13 // target = 0x2b0 -;; 2a3: xload32le_o32 x0, x16, 0 +;; br_if_not32 x0, 0x13 // target = 0x2a6 +;; 299: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 2b0: xmov x1, x17 -;; 2b3: xload64le_o32 x0, x1, 16 -;; 2ba: xload64le_o32 x0, x0, 408 -;; 2c1: call_indirect_host 52 -;; 2c5: trap +;; 2a6: xmov x1, x17 +;; 2a9: xload64le_o32 x0, x1, 16 +;; 2b0: xload64le_o32 x0, x0, 328 +;; 2b7: call_indirect_host 42 +;; 2bb: trap ;; ;; wasmtime_builtin_table_get_lazy_init_func_ref: ;; push_frame @@ -231,10 +227,10 @@ ;; xstore64le_o32 x9, 56, x10 ;; xload64le_o32 x11, x0, 16 ;; xmov x13, x0 -;; xload64le_o32 x0, x11, 72 +;; xload64le_o32 x0, x11, 56 ;; xmov x3, x2 ;; xmov x2, x1 ;; xmov x1, x13 -;; call_indirect_host 10 +;; call_indirect_host 8 ;; pop_frame ;; ret diff --git a/tests/disas/pulley-fusion-fires-32bit.wat b/tests/disas/pulley-fusion-fires-32bit.wat index a540637aae2e..481ed46c6056 100644 --- a/tests/disas/pulley-fusion-fires-32bit.wat +++ b/tests/disas/pulley-fusion-fires-32bit.wat @@ -49,30 +49,23 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame_save 32, x16, x17, x25 -;; br_if_xugteq32_u8 
x2, 3, 0x58 // target = 0x6e +;; push_frame_save 32, x16, x17, x24 +;; br_if_xugteq32_u8 x2, 3, 0x45 // target = 0x5b ;; 1d: xload32le_o32 x15, x0, 24 -;; xmov x3, x0 +;; xmov x24, x0 ;; xshl32_u6 x0, x2, 2 ;; xadd32 x15, x15, x0 ;; xload32le_o32 x15, x15, 0 -;; xband_funcref_dispatch_not_x32 x0, x16, x17, x15, 4, 12, 0x1c // target = 0x50 -;; xmov x2, x0 -;; xmov x1, x3 -;; xmov x0, x17 -;; call_indirect x16 -;; pop_frame_restore 32, x16, x17, x25 +;; xband_funcref_dispatch_not_x32 x0, x17, x16, x15, 4, 12, 0x15 // target = 0x49 +;; call_indirect2 x17, x16, x24 +;; pop_frame_restore 32, x16, x17, x24 ;; ret -;; 50: xzero x0 -;; 52: zext32 x1, x2 -;; 55: xmov x25, x3 -;; 58: call3 x25, x0, x1, 0x270 // target = 0x2c8 -;; 60: xmov x2, x0 -;; 63: xmov x0, x17 -;; 66: xmov x1, x25 -;; 69: jump -0x21 // target = 0x48 -;; 6e: trap -;; ╰─╼ trap: TableOutOfBounds +;; 49: xzero x0 +;; 4b: zext32 x1, x2 +;; 4e: call3 x24, x0, x1, 0x267 // target = 0x2b5 +;; 56: jump -0x17 // target = 0x3f +;; 5b: trap +;; ╰─╼ trap: Normal(TableOutOfBounds) ;; ;; wasm[0]::array_to_wasm_trampoline[0]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -82,19 +75,19 @@ ;; xstore32le_o32 x13, 48, x14 ;; xmov x14, sp ;; xstore32le_o32 x13, 44, x14 -;; xpcadd x15, 0x2a // target = 0xc3 +;; xpcadd x15, 0x2a // target = 0xb0 ;; xstore32le_o32 x13, 52, x15 -;; call -0xa8 // target = 0x0 +;; call -0x95 // target = 0x0 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc3 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xb0 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; c3: xzero x0 -;; c5: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; ca: ret +;; 
b0: xzero x0 +;; b2: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; b7: ret ;; ;; wasm[0]::array_to_wasm_trampoline[1]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -104,19 +97,19 @@ ;; xstore32le_o32 x13, 48, x14 ;; xmov x14, sp ;; xstore32le_o32 x13, 44, x14 -;; xpcadd x15, 0x2a // target = 0x11d +;; xpcadd x15, 0x2a // target = 0x10a ;; xstore32le_o32 x13, 52, x15 -;; call -0xfd // target = 0x5 +;; call -0xea // target = 0x5 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x11d +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x10a ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 11d: xzero x0 -;; 11f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 124: ret +;; 10a: xzero x0 +;; 10c: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 111: ret ;; ;; wasm[0]::array_to_wasm_trampoline[2]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -126,19 +119,19 @@ ;; xstore32le_o32 x13, 48, x14 ;; xmov x14, sp ;; xstore32le_o32 x13, 44, x14 -;; xpcadd x15, 0x2a // target = 0x177 +;; xpcadd x15, 0x2a // target = 0x164 ;; xstore32le_o32 x13, 52, x15 -;; call -0x151 // target = 0xb +;; call -0x13e // target = 0xb ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x177 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x164 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, 
x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 177: xzero x0 -;; 179: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 17e: ret +;; 164: xzero x0 +;; 166: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 16b: ret ;; ;; wasm[0]::array_to_wasm_trampoline[3]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -149,19 +142,19 @@ ;; xstore32le_o32 x15, 48, x2 ;; xmov x2, sp ;; xstore32le_o32 x15, 44, x2 -;; xpcadd x2, 0x2d // target = 0x1db +;; xpcadd x2, 0x2d // target = 0x1c8 ;; xstore32le_o32 x15, 52, x2 -;; call3 x0, x1, x14, -0x1ac // target = 0x11 +;; call3 x0, x1, x14, -0x199 // target = 0x11 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1db +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1c8 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 1db: xzero x0 -;; 1dd: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 1e2: ret +;; 1c8: xzero x0 +;; 1ca: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1cf: ret ;; ;; signatures[0]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -180,15 +173,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x15, x0 -;; br_if_not32 x15, 0x13 // target = 0x23a -;; 22d: xload32le_o32 x0, x16, 0 +;; br_if_not32 x15, 0x13 // target = 0x227 +;; 21a: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 23a: xmov x1, x17 -;; 23d: xload32le_o32 x0, x1, 8 -;; 244: xload32le_o32 x0, x0, 204 -;; 24b: call_indirect_host 52 -;; 24f: trap +;; 227: xmov x1, x17 +;; 22a: 
xload32le_o32 x0, x1, 8 +;; 231: xload32le_o32 x0, x0, 164 +;; 238: call_indirect_host 42 +;; 23c: trap ;; ;; signatures[1]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -208,15 +201,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x0, x0 -;; br_if_not32 x0, 0x13 // target = 0x2b0 -;; 2a3: xload32le_o32 x0, x16, 0 +;; br_if_not32 x0, 0x13 // target = 0x29d +;; 290: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 2b0: xmov x1, x17 -;; 2b3: xload32le_o32 x0, x1, 8 -;; 2ba: xload32le_o32 x0, x0, 204 -;; 2c1: call_indirect_host 52 -;; 2c5: trap +;; 29d: xmov x1, x17 +;; 2a0: xload32le_o32 x0, x1, 8 +;; 2a7: xload32le_o32 x0, x0, 164 +;; 2ae: call_indirect_host 42 +;; 2b2: trap ;; ;; wasmtime_builtin_table_get_lazy_init_func_ref: ;; push_frame @@ -227,10 +220,10 @@ ;; xstore32le_o32 x9, 40, x10 ;; xload32le_o32 x11, x0, 8 ;; xmov x13, x0 -;; xload32le_o32 x0, x11, 36 +;; xload32le_o32 x0, x11, 28 ;; xmov x3, x2 ;; xmov x2, x1 ;; xmov x1, x13 -;; call_indirect_host 10 +;; call_indirect_host 8 ;; pop_frame ;; ret diff --git a/tests/disas/pulley-fusion-fires-multi-call.wat b/tests/disas/pulley-fusion-fires-multi-call.wat index 4e3bf866747f..cfccdb5a0b27 100644 --- a/tests/disas/pulley-fusion-fires-multi-call.wat +++ b/tests/disas/pulley-fusion-fires-multi-call.wat @@ -51,47 +51,39 @@ ;; ;; wasm[0]::function[3]: ;; push_frame_save 48, x16, x17, x18, x27, x28, x29 -;; xmov x18, x0 ;; xmov x29, x3 -;; br_if_xugteq32_u8 x2, 3, 0xa6 // target = 0xc2 -;; 23: xload64le_o32 x28, x0, 48 +;; br_if_xugteq32_u8 x2, 3, 0x95 // target = 0xae +;; 20: xload64le_o32 x28, x0, 48 +;; xmov x4, x0 ;; zext32 x1, x2 ;; xshl64_u6 x0, x1, 3 ;; xadd64 x0, x28, x0 ;; xload64le_o32 x0, x0, 0 -;; xband_funcref_dispatch_not_x64 x0, x16, x17, x0, 8, 24, 0x55 // target = 0x8f -;; xmov x2, x0 -;; xmov x0, x17 -;; xmov x1, x18 -;; call_indirect x16 +;; xband_funcref_dispatch_not_x64 x0, x17, x16, x0, 8, 24, 0x50 // target = 0x8a +;; xmov x18, x4 +;; 
call_indirect2 x17, x16, x18 ;; xmov x3, x29 +;; xmov x4, x18 ;; xmov x17, x0 -;; br_if_xugteq32_u8 x3, 3, 0x6f // target = 0xc5 -;; 5d: zext32 x1, x3 +;; br_if_xugteq32_u8 x3, 3, 0x5c // target = 0xb1 +;; 5c: zext32 x1, x3 ;; xshl64_u6 x0, x1, 3 ;; xadd64 x0, x28, x0 ;; xload64le_o32 x0, x0, 0 -;; xband_funcref_dispatch_not_x64 x0, x27, x28, x0, 8, 24, 0x3a // target = 0xa7 -;; xmov x2, x0 -;; xmov x1, x18 -;; xmov x0, x28 -;; call_indirect x27 +;; xband_funcref_dispatch_not_x64 x0, x27, x28, x0, 8, 24, 0x30 // target = 0x9c +;; xmov x16, x4 +;; call_indirect2 x27, x28, x16 ;; xmov x1, x17 ;; xadd32 x0, x1, x0 ;; pop_frame_restore 48, x16, x17, x18, x27, x28, x29 ;; ret -;; 8f: xzero x0 -;; 91: xmov x2, x18 -;; 94: call3 x2, x0, x1, 0x29e // target = 0x332 -;; 9c: xmov x2, x0 -;; 9f: xmov x0, x17 -;; a2: jump -0x57 // target = 0x4b -;; a7: xzero x0 -;; a9: xmov x16, x18 -;; ac: call3 x16, x0, x1, 0x286 // target = 0x332 -;; b4: xmov x2, x0 -;; b7: xmov x0, x28 -;; ba: xmov x1, x16 -;; bd: jump -0x3c // target = 0x81 -;; c2: trap -;; c5: trap +;; 8a: xzero x0 +;; 8c: xmov x18, x4 +;; 8f: call3 x18, x0, x1, 0x28f // target = 0x31e +;; 97: jump -0x4f // target = 0x48 +;; 9c: xzero x0 +;; 9e: xmov x16, x4 +;; a1: call3 x16, x0, x1, 0x27d // target = 0x31e +;; a9: jump -0x2f // target = 0x7a +;; ae: trap +;; b1: trap diff --git a/tests/disas/pulley-fusion-no-fire-mutable-table.wat b/tests/disas/pulley-fusion-no-fire-mutable-table.wat index f582f8a28526..f1c57adc892a 100644 --- a/tests/disas/pulley-fusion-no-fire-mutable-table.wat +++ b/tests/disas/pulley-fusion-no-fire-mutable-table.wat @@ -67,7 +67,7 @@ ;; xmov x17, x2 ;; xzero x9 ;; xmov x16, x12 -;; call2 x16, x9, 0x3d8 // target = 0x3f9 +;; call2 x16, x9, 0x3da // target = 0x3fb ;; xmov x2, x17 ;; xmov x12, x16 ;; br_if_xugteq32_u8 x2, 3, 0x2b // target = 0x59 @@ -81,41 +81,41 @@ ;; pop_frame_restore 16, x16, x17 ;; ret ;; 59: trap -;; ╰─╼ trap: TableOutOfBounds +;; ╰─╼ trap: Normal(TableOutOfBounds) ;; ;; 
wasm[0]::function[4]: -;; push_frame_save 16, x29 +;; push_frame_save 16, x28 ;; xmov x3, x0 -;; br_if_xugteq32_u8 x2, 3, 0x7a // target = 0xde -;; 6b: xload64le_o32 x0, x0, 48 +;; br_if_xugteq32_u8 x2, 3, 0x7c // target = 0xe0 +;; 6b: xmov x1, x3 +;; xload64le_o32 x0, x1, 48 ;; zext32 x1, x2 ;; xshl64_u6 x2, x1, 3 ;; xadd64 x0, x0, x2 ;; xload64le_o32 x2, x0, 0 ;; xband64_s8 x0, x2, -2 -;; br_if_xeq64_i8 x2, 0, 0x46 // target = 0xcc -;; 8d: xmov x29, x3 -;; br_if_xeq64_i8 x0, 0, 0x51 // target = 0xe1 -;; 97: xload32le_o32 x1, x0, 16 -;; xload64le_o32 x2, x29, 40 +;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0xce +;; 90: xmov x28, x3 +;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0xe3 +;; 9a: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x28, 40 ;; xload32le_o32 x2, x2, 0 -;; br_if_xneq32 x1, x2, 0x38 // target = 0xe4 -;; b3: xload64le_o32 x2, x0, 8 +;; br_if_xneq32 x1, x2, 0x37 // target = 0xe6 +;; b6: xload64le_o32 x1, x0, 8 ;; xload64le_o32 x0, x0, 24 -;; xmov x1, x29 -;; call_indirect x2 -;; pop_frame_restore 16, x29 +;; call_indirect2 x1, x0, x28 +;; pop_frame_restore 16, x28 ;; ret -;; cc: xzero x0 -;; ce: xmov x29, x3 -;; d1: call3 x29, x0, x1, 0x363 // target = 0x434 -;; d9: jump -0x49 // target = 0x90 -;; de: trap -;; ╰─╼ trap: TableOutOfBounds -;; e1: trap -;; ╰─╼ trap: IndirectCallToNull -;; e4: trap -;; ╰─╼ trap: BadSignature +;; ce: xzero x0 +;; d0: xmov x28, x3 +;; d3: call3 x28, x0, x1, 0x363 // target = 0x436 +;; db: jump -0x48 // target = 0x93 +;; e0: trap +;; ╰─╼ trap: Normal(TableOutOfBounds) +;; e3: trap +;; ╰─╼ trap: Normal(IndirectCallToNull) +;; e6: trap +;; ╰─╼ trap: Normal(BadSignature) ;; ;; wasm[0]::array_to_wasm_trampoline[0]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -125,19 +125,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0x139 +;; xpcadd x15, 0x2a // target = 0x13b ;; xstore64le_o32 x13, 80, x15 -;; 
call -0x11e // target = 0x0 +;; call -0x120 // target = 0x0 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x139 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x13b ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 139: xzero x0 -;; 13b: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 140: ret +;; 13b: xzero x0 +;; 13d: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 142: ret ;; ;; wasm[0]::array_to_wasm_trampoline[1]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -147,19 +147,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0x193 +;; xpcadd x15, 0x2a // target = 0x195 ;; xstore64le_o32 x13, 80, x15 -;; call -0x173 // target = 0x5 +;; call -0x175 // target = 0x5 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x193 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x195 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 193: xzero x0 -;; 195: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 19a: ret +;; 195: xzero x0 +;; 197: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 19c: ret ;; ;; wasm[0]::array_to_wasm_trampoline[2]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ 
-169,19 +169,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0x1ed +;; xpcadd x15, 0x2a // target = 0x1ef ;; xstore64le_o32 x13, 80, x15 -;; call -0x1c7 // target = 0xb +;; call -0x1c9 // target = 0xb ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1ed +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1ef ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 1ed: xzero x0 -;; 1ef: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 1f4: ret +;; 1ef: xzero x0 +;; 1f1: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1f6: ret ;; ;; wasm[0]::array_to_wasm_trampoline[3]: ;; push_frame_save 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -191,17 +191,17 @@ ;; xstore64le_o32 x14, 72, x15 ;; xmov x15, sp ;; xstore64le_o32 x14, 64, x15 -;; xpcadd x15, 0x1f // target = 0x23c +;; xpcadd x15, 0x1f // target = 0x23e ;; xstore64le_o32 x14, 80, x15 -;; call3 x0, x1, x13, -0x21b // target = 0x11 +;; call3 x0, x1, x13, -0x21d // target = 0x11 ;; ├─╼ exception frame offset: SP = FP - 0x80 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x23c +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x23e ;; xone x0 ;; pop_frame_restore 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 23c: xzero x0 -;; 23e: pop_frame_restore 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 243: ret +;; 23e: xzero x0 +;; 240: pop_frame_restore 128, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, 
x29, sp, spilltmp0 +;; 245: ret ;; ;; wasm[0]::array_to_wasm_trampoline[4]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -212,19 +212,19 @@ ;; xstore64le_o32 x15, 72, x2 ;; xmov x2, sp ;; xstore64le_o32 x15, 64, x2 -;; xpcadd x2, 0x2d // target = 0x2a0 +;; xpcadd x2, 0x2d // target = 0x2a2 ;; xstore64le_o32 x15, 80, x2 -;; call3 x0, x1, x14, -0x226 // target = 0x5c +;; call3 x0, x1, x14, -0x228 // target = 0x5c ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x2a0 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x2a2 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 2a0: xzero x0 -;; 2a2: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 2a7: ret +;; 2a2: xzero x0 +;; 2a4: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 2a9: ret ;; ;; signatures[0]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -243,15 +243,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x15, x0 -;; br_if_not32 x15, 0x13 // target = 0x2ff -;; 2f2: xload32le_o32 x0, x16, 0 +;; br_if_not32 x15, 0x13 // target = 0x301 +;; 2f4: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 2ff: xmov x1, x17 -;; 302: xload64le_o32 x0, x1, 16 -;; 309: xload64le_o32 x0, x0, 408 -;; 310: call_indirect_host 52 -;; 314: trap +;; 301: xmov x1, x17 +;; 304: xload64le_o32 x0, x1, 16 +;; 30b: xload64le_o32 x0, x0, 328 +;; 312: call_indirect_host 42 +;; 316: trap ;; ;; signatures[1]::wasm_to_array_trampoline: ;; push_frame_save 32, x16 @@ -270,14 +270,14 @@ ;; xmov x2, x16 ;; call_indirect_host 0 ;; zext8 x0, x0 -;; br_if_not32 x0, 0xc // target = 0x36b -;; 365: 
pop_frame_restore 32, x16 +;; br_if_not32 x0, 0xc // target = 0x36d +;; 367: pop_frame_restore 32, x16 ;; ret -;; 36b: xmov x1, x16 -;; 36e: xload64le_o32 x0, x1, 16 -;; 375: xload64le_o32 x0, x0, 408 -;; 37c: call_indirect_host 52 -;; 380: trap +;; 36d: xmov x1, x16 +;; 370: xload64le_o32 x0, x1, 16 +;; 377: xload64le_o32 x0, x0, 328 +;; 37e: call_indirect_host 42 +;; 382: trap ;; ;; signatures[2]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -297,15 +297,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x0, x0 -;; br_if_not32 x0, 0x13 // target = 0x3e1 -;; 3d4: xload32le_o32 x0, x16, 0 +;; br_if_not32 x0, 0x13 // target = 0x3e3 +;; 3d6: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 3e1: xmov x1, x17 -;; 3e4: xload64le_o32 x0, x1, 16 -;; 3eb: xload64le_o32 x0, x0, 408 -;; 3f2: call_indirect_host 52 -;; 3f6: trap +;; 3e3: xmov x1, x17 +;; 3e6: xload64le_o32 x0, x1, 16 +;; 3ed: xload64le_o32 x0, x0, 328 +;; 3f4: call_indirect_host 42 +;; 3f8: trap ;; ;; wasmtime_builtin_ref_func: ;; push_frame @@ -316,10 +316,10 @@ ;; xstore64le_o32 x8, 56, x9 ;; xload64le_o32 x10, x0, 16 ;; xmov x11, x0 -;; xload64le_o32 x0, x10, 56 +;; xload64le_o32 x0, x10, 48 ;; xmov x2, x1 ;; xmov x1, x11 -;; call_indirect_host 8 +;; call_indirect_host 7 ;; pop_frame ;; ret ;; @@ -332,10 +332,10 @@ ;; xstore64le_o32 x9, 56, x10 ;; xload64le_o32 x11, x0, 16 ;; xmov x13, x0 -;; xload64le_o32 x0, x11, 72 +;; xload64le_o32 x0, x11, 56 ;; xmov x3, x2 ;; xmov x2, x1 ;; xmov x1, x13 -;; call_indirect_host 10 +;; call_indirect_host 8 ;; pop_frame ;; ret diff --git a/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat b/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat index f84f81d57b10..1c3bade1689f 100644 --- a/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat +++ b/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat @@ -55,33 +55,33 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame_save 16, x16, x19 +;; push_frame_save 16, x18, x29 
;; xmov x3, x0 -;; br_if_xugteq32_u8 x2, 3, 0x7d // target = 0x96 -;; 20: xload64le_o32 x0, x0, 48 +;; br_if_xugteq32_u8 x2, 3, 0x7f // target = 0x98 +;; 20: xmov x1, x3 +;; xload64le_o32 x0, x1, 48 ;; zext32 x1, x2 -;; xmov x19, x2 +;; xmov x18, x2 ;; xshl64_u6 x2, x1, 3 ;; xadd64 x0, x0, x2 ;; xload64le_o32 x0, x0, 0 -;; xband64_s8_br_if_not_x64 x0, x0, -2, 0x4a // target = 0x84 -;; 42: xmov x16, x3 -;; br_if_xeq64_i8 x0, 0, 0x54 // target = 0x99 -;; 4c: xload32le_o32 x1, x0, 16 -;; xload64le_o32 x2, x16, 40 +;; xband64_s8_br_if_not_x64 x0, x0, -2, 0x49 // target = 0x86 +;; 45: xmov x29, x3 +;; br_if_xeq64_i8 x0, 0, 0x53 // target = 0x9b +;; 4f: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x29, 40 ;; xload32le_o32 x2, x2, 0 -;; br_if_xneq32 x1, x2, 0x3b // target = 0x9c -;; 68: xload64le_o32 x3, x0, 8 +;; br_if_xneq32 x1, x2, 0x3a // target = 0x9e +;; 6b: xload64le_o32 x1, x0, 8 ;; xload64le_o32 x0, x0, 24 -;; xmov x1, x16 -;; xmov x2, x19 -;; call_indirect x3 -;; pop_frame_restore 16, x16, x19 +;; xmov x2, x18 +;; call_indirect2 x1, x0, x29 +;; pop_frame_restore 16, x18, x29 ;; ret -;; 84: xzero x0 -;; 86: xmov x16, x3 -;; 89: call3 x16, x0, x1, 0x281 // target = 0x30a -;; 91: jump -0x4c // target = 0x45 -;; 96: trap -;; 99: trap -;; 9c: trap +;; 86: xzero x0 +;; 88: xmov x29, x3 +;; 8b: call3 x29, x0, x1, 0x281 // target = 0x30c +;; 93: jump -0x4b // target = 0x48 +;; 98: trap +;; 9b: trap +;; 9e: trap diff --git a/tests/disas/pulley-fusion-no-fire-table-copy.wat b/tests/disas/pulley-fusion-no-fire-table-copy.wat index eb7ad6d2e326..9b89bccf1ec9 100644 --- a/tests/disas/pulley-fusion-no-fire-table-copy.wat +++ b/tests/disas/pulley-fusion-no-fire-table-copy.wat @@ -61,70 +61,127 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame -;; xmov x15, x4 -;; xzero x10 -;; xone x11 -;; zext32 x12, x2 -;; zext32 x4, x3 -;; zext32 x5, x15 -;; call4 x0, x10, x11, x12, 0x49f // target = 0x4c1 -;; pop_frame +;; push_frame_save 64, x16, x17, x20, x21, x24, x26, x28 +;; 
zext32 x2, x2 +;; zext32 x1, x4 +;; xadd64 x5, x2, x1 +;; br_if_xugt64_u8 x5, 5, 0x109 // target = 0x128 +;; 26: zext32 x5, x3 +;; xadd64 x6, x5, x1 +;; br_if_xugt64_u8 x6, 5, 0xff // target = 0x12b +;; br_if_not32 x4, 0xcf // target = 0x102 +;; 39: xload64le_o32 x6, x0, 48 +;; xshl64_u6 x2, x2, 3 +;; xadd64 x17, x6, x2 +;; xload64le_o32 x16, x0, 64 +;; xmov x6, x0 +;; xshl64_u6 x0, x5, 3 +;; xadd64 x20, x16, x0 +;; xshl64_u6 x0, x1, 3 +;; xadd64 x24, x17, x0 +;; xadd64 x26, x20, x0 +;; xadd32 x28, x3, x4 +;; xmov x0, x3 +;; br_if_xulteq64 x20, x17, 0x12 // target = 0x77 +;; 6c: xmov x21, x6 +;; xmov x28, x0 +;; jump 0x50 // target = 0xc2 +;; 77: xsub32_u8 x28, x28, 1 +;; br_if_xugteq32_u8 x28, 5, 0xb3 // target = 0x12e +;; 82: zext32 x1, x28 +;; xshl64_u6 x0, x1, 3 +;; xadd64 x0, x16, x0 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x72 // target = 0x108 +;; 9d: xmov x21, x6 +;; xbor64_s8 x0, x0, 1 +;; xsub64_u8 x24, x24, 8 +;; xstore64le_o32 x24, 0, x0 +;; xsub64_u8 x26, x26, 8 +;; br_if_xeq64 x26, x20, 0x4f // target = 0x102 +;; ba: xmov x6, x21 +;; jump -0x46 // target = 0x77 +;; br_if_xugteq32_u8 x28, 5, 0x6f // target = 0x131 +;; c9: zext32 x2, x28 +;; xshl64_u6 x3, x2, 3 +;; xadd64 x3, x16, x3 +;; xload64le_o32 x3, x3, 0 +;; xband64_s8 x0, x3, -2 +;; br_if_xeq64_i8 x3, 0, 0x3d // target = 0x11a +;; e4: xbor64_s8 x5, x0, 1 +;; xstore64le_o32 x17, 0, x5 +;; xadd64_u8 x20, x20, 8 +;; xadd64_u8 x17, x17, 8 +;; xadd32_u8 x28, x28, 1 +;; br_if_xneq64 x20, x26, -0x39 // target = 0xc2 +;; 102: pop_frame_restore 64, x16, x17, x20, x21, x24, x26, x28 ;; ret +;; 108: xone x0 +;; 10a: xmov x21, x6 +;; 10d: call3 x21, x0, x1, 0x4bf // target = 0x5cc +;; 115: jump -0x75 // target = 0xa0 +;; 11a: xone x4 +;; 11c: call2 x21, x4, 0x4b0 // target = 0x5cc +;; 123: jump -0x3f // target = 0xe4 +;; 128: trap +;; 12b: trap +;; 12e: trap +;; 131: trap ;; ;; wasm[0]::function[4]: -;; push_frame_save 16, x29 +;; push_frame_save 16, x28 ;; xmov x3, 
x0 -;; br_if_xugteq32_u8 x2, 5, 0x7a // target = 0xaf -;; 3c: xload64le_o32 x0, x0, 48 +;; br_if_xugteq32_u8 x2, 5, 0x7c // target = 0x1b8 +;; 143: xmov x1, x3 +;; xload64le_o32 x0, x1, 48 ;; zext32 x1, x2 ;; xshl64_u6 x2, x1, 3 ;; xadd64 x0, x0, x2 ;; xload64le_o32 x2, x0, 0 ;; xband64_s8 x0, x2, -2 -;; br_if_xeq64_i8 x2, 0, 0x46 // target = 0x9d -;; 5e: xmov x29, x3 -;; br_if_xeq64_i8 x0, 0, 0x51 // target = 0xb2 -;; 68: xload32le_o32 x1, x0, 16 -;; xload64le_o32 x2, x29, 40 +;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0x1a6 +;; 168: xmov x28, x3 +;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0x1bb +;; 172: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x28, 40 ;; xload32le_o32 x2, x2, 0 -;; br_if_xneq32 x1, x2, 0x38 // target = 0xb5 -;; 84: xload64le_o32 x2, x0, 8 +;; br_if_xneq32 x1, x2, 0x37 // target = 0x1be +;; 18e: xload64le_o32 x1, x0, 8 ;; xload64le_o32 x0, x0, 24 -;; xmov x1, x29 -;; call_indirect x2 -;; pop_frame_restore 16, x29 +;; call_indirect2 x1, x0, x28 +;; pop_frame_restore 16, x28 ;; ret -;; 9d: xzero x0 -;; 9f: xmov x29, x3 -;; a2: call3 x29, x0, x1, 0x495 // target = 0x537 -;; aa: jump -0x49 // target = 0x61 -;; af: trap -;; b2: trap -;; b5: trap +;; 1a6: xzero x0 +;; 1a8: xmov x28, x3 +;; 1ab: call3 x28, x0, x1, 0x421 // target = 0x5cc +;; 1b3: jump -0x48 // target = 0x16b +;; 1b8: trap +;; 1bb: trap +;; 1be: trap ;; ;; wasm[0]::function[5]: -;; push_frame_save 16, x26 +;; push_frame_save 16, x25 ;; xmov x3, x0 -;; br_if_xugteq32_u8 x2, 5, 0x5e // target = 0x11e -;; c7: xload64le_o32 x0, x0, 64 +;; br_if_xugteq32_u8 x2, 5, 0x60 // target = 0x229 +;; 1d0: xmov x1, x3 +;; xload64le_o32 x0, x1, 64 ;; zext32 x15, x2 ;; xshl64_u6 x1, x15, 3 ;; xadd64 x0, x0, x1 ;; xload64le_o32 x1, x0, 0 ;; xband64_s8 x0, x1, -2 -;; br_if_xeq64_i8 x1, 0, 0x2a // target = 0x10c -;; e9: xmov x26, x3 -;; br_if_xeq64_i8 x0, 0, 0x35 // target = 0x121 -;; f3: xload64le_o32 x2, x0, 8 +;; br_if_xeq64_i8 x1, 0, 0x29 // target = 0x217 +;; 1f5: xmov x25, x3 +;; br_if_xeq64_i8 x0, 
0, 0x34 // target = 0x22c +;; 1ff: xload64le_o32 x1, x0, 8 ;; xload64le_o32 x0, x0, 24 -;; xmov x1, x26 -;; call_indirect x2 -;; pop_frame_restore 16, x26 +;; call_indirect2 x1, x0, x25 +;; pop_frame_restore 16, x25 ;; ret -;; 10c: xone x0 -;; 10e: xmov x26, x3 -;; 111: call3 x26, x0, x15, 0x426 // target = 0x537 -;; 119: jump -0x2d // target = 0xec -;; 11e: trap -;; 121: trap +;; 217: xone x0 +;; 219: xmov x25, x3 +;; 21c: call3 x25, x0, x15, 0x3b0 // target = 0x5cc +;; 224: jump -0x2c // target = 0x1f8 +;; 229: trap +;; 22c: trap diff --git a/tests/disas/pulley-fusion-no-fire-table-fill.wat b/tests/disas/pulley-fusion-no-fire-table-fill.wat index 63946adfa5e0..9bb480d7ff20 100644 --- a/tests/disas/pulley-fusion-no-fire-table-fill.wat +++ b/tests/disas/pulley-fusion-no-fire-table-fill.wat @@ -52,47 +52,59 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame_save 32, x16, x17, x18 -;; xmov x12, x0 -;; xmov x18, x2 -;; xzero x16 -;; xmov x17, x12 -;; call2 x17, x16, 0x3be // target = 0x3df -;; xmov x2, x18 -;; xmov x12, x17 -;; zext32 x7, x2 -;; xone x4 -;; call4 x12, x16, x7, x0, 0x425 // target = 0x458 -;; pop_frame_restore 32, x16, x17, x18 +;; push_frame_save 16, x16, x20 +;; xmov x16, x2 +;; xzero x12 +;; xmov x20, x0 +;; call2 x20, x12, 0x3f7 // target = 0x415 +;; xmov x15, x0 +;; xmov x2, x16 +;; xmov x0, x20 +;; zext32 x12, x2 +;; xadd64_u8 x13, x12, 1 +;; br_if_xugt64_u8 x13, 3, 0x3e // target = 0x73 +;; 3c: xload64le_o32 x13, x0, 48 +;; xshl64_u6 x14, x12, 3 +;; xadd64 x13, x13, x14 +;; xmov x0, x15 +;; xmov x12, x13 +;; xbor64_s8 x14, x0, 1 +;; xstore64le_o32 x12, 0, x14 +;; xadd64_u8 x15, x12, 8 +;; br_if_xeq64 x12, x13, 0xf // target = 0x6d +;; 65: xmov x12, x15 +;; jump -0x19 // target = 0x4f +;; 6d: pop_frame_restore 16, x16, x20 ;; ret +;; 73: trap ;; ;; wasm[0]::function[4]: -;; push_frame_save 16, x29 +;; push_frame_save 16, x28 ;; xmov x3, x0 -;; br_if_xugteq32_u8 x2, 3, 0x7a // target = 0xc4 -;; 51: xload64le_o32 x0, x0, 48 +;; br_if_xugteq32_u8 
x2, 3, 0x7c // target = 0xfa +;; 85: xmov x1, x3 +;; xload64le_o32 x0, x1, 48 ;; zext32 x1, x2 ;; xshl64_u6 x2, x1, 3 ;; xadd64 x0, x0, x2 ;; xload64le_o32 x2, x0, 0 ;; xband64_s8 x0, x2, -2 -;; br_if_xeq64_i8 x2, 0, 0x46 // target = 0xb2 -;; 73: xmov x29, x3 -;; br_if_xeq64_i8 x0, 0, 0x51 // target = 0xc7 -;; 7d: xload32le_o32 x1, x0, 16 -;; xload64le_o32 x2, x29, 40 +;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0xe8 +;; aa: xmov x28, x3 +;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0xfd +;; b4: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x28, 40 ;; xload32le_o32 x2, x2, 0 -;; br_if_xneq32 x1, x2, 0x38 // target = 0xca -;; 99: xload64le_o32 x2, x0, 8 +;; br_if_xneq32 x1, x2, 0x37 // target = 0x100 +;; d0: xload64le_o32 x1, x0, 8 ;; xload64le_o32 x0, x0, 24 -;; xmov x1, x29 -;; call_indirect x2 -;; pop_frame_restore 16, x29 +;; call_indirect2 x1, x0, x28 +;; pop_frame_restore 16, x28 ;; ret -;; b2: xzero x0 -;; b4: xmov x29, x3 -;; b7: call3 x29, x0, x1, 0x363 // target = 0x41a -;; bf: jump -0x49 // target = 0x76 -;; c4: trap -;; c7: trap -;; ca: trap +;; e8: xzero x0 +;; ea: xmov x28, x3 +;; ed: call3 x28, x0, x1, 0x363 // target = 0x450 +;; f5: jump -0x48 // target = 0xad +;; fa: trap +;; fd: trap +;; 100: trap diff --git a/tests/disas/pulley-fusion-no-fire-table-grow.wat b/tests/disas/pulley-fusion-no-fire-table-grow.wat index 3a76f16a29f2..5dcac37c501e 100644 --- a/tests/disas/pulley-fusion-no-fire-table-grow.wat +++ b/tests/disas/pulley-fusion-no-fire-table-grow.wat @@ -38,47 +38,66 @@ ;; ret ;; ;; wasm[0]::function[1]: -;; push_frame_save 32, x16, x17, x18 -;; xmov x12, x0 -;; xmov x18, x2 -;; xzero x16 -;; xmov x17, x12 -;; call2 x17, x16, 0x2b8 // target = 0x2cd -;; xmov x2, x18 -;; xmov x12, x17 -;; zext32 x6, x2 -;; call4 x12, x16, x6, x0, 0x321 // target = 0x346 -;; pop_frame_restore 32, x16, x17, x18 +;; push_frame_save 48, x18, x19, x20, x23, x28 +;; xmov x23, x2 +;; xzero x19 +;; xmov x28, x0 +;; call2 x28, x19, 0x313 // target = 0x325 +;; xmov x20, x0 
+;; xmov x2, x23 +;; xmov x0, x28 +;; zext32 x18, x2 +;; call3 x28, x19, x18, 0x379 // target = 0x39e +;; xmov x1, x0 +;; br_if_xeq32_i8 x1, -1, 0x51 // target = 0x81 +;; 37: xload64le_o32 x3, x28, 56 +;; zext32 x2, x1 +;; xadd64 x4, x2, x18 +;; zext32 x0, x3 +;; br_if_xult64 x0, x4, 0x43 // target = 0x8a +;; 4e: xload64le_o32 x0, x28, 48 +;; xshl64_u6 x2, x2, 3 +;; xadd64 x0, x0, x2 +;; xshl64_u6 x2, x18, 3 +;; xadd64 x2, x0, x2 +;; br_if_xeq64_i8 x18, 0, 0x20 // target = 0x81 +;; 68: xmov x3, x20 +;; xbor64_s8 x4, x3, 1 +;; xstore64le_o32 x0, 0, x4 +;; xadd64_u8 x0, x0, 8 +;; br_if_xneq64 x0, x2, -0xf // target = 0x6b +;; 81: xmov x0, x1 +;; pop_frame_restore 48, x18, x19, x20, x23, x28 ;; ret +;; 8a: trap ;; ;; wasm[0]::function[2]: ;; push_frame_save 16, x16 ;; xload64le_o32 x1, x0, 56 -;; br_if_xulteq32 x1, x2, 0x7d // target = 0xbd -;; 47: xload64le_o32 x3, x0, 48 +;; br_if_xulteq32 x1, x2, 0x7c // target = 0x115 +;; a0: xload64le_o32 x3, x0, 48 ;; xmov x4, x0 ;; zext32 x1, x2 ;; xshl64_u6 x0, x1, 3 ;; xadd64 x0, x3, x0 ;; xload64le_o32 x2, x0, 0 ;; xband64_s8 x0, x2, -2 -;; br_if_xeq64_i8 x2, 0, 0x46 // target = 0xab -;; 6c: xmov x16, x4 -;; br_if_xeq64_i8 x0, 0, 0x51 // target = 0xc0 -;; 76: xload32le_o32 x1, x0, 16 +;; br_if_xeq64_i8 x2, 0, 0x45 // target = 0x103 +;; c5: xmov x16, x4 +;; br_if_xeq64_i8 x0, 0, 0x50 // target = 0x118 +;; cf: xload32le_o32 x1, x0, 16 ;; xload64le_o32 x2, x16, 40 ;; xload32le_o32 x2, x2, 0 -;; br_if_xneq32 x1, x2, 0x38 // target = 0xc3 -;; 92: xload64le_o32 x2, x0, 8 +;; br_if_xneq32 x1, x2, 0x37 // target = 0x11b +;; eb: xload64le_o32 x1, x0, 8 ;; xload64le_o32 x0, x0, 24 -;; xmov x1, x16 -;; call_indirect x2 +;; call_indirect2 x1, x0, x16 ;; pop_frame_restore 16, x16 ;; ret -;; ab: xzero x0 -;; ad: xmov x16, x4 -;; b0: call3 x16, x0, x1, 0x258 // target = 0x308 -;; b8: jump -0x49 // target = 0x6f -;; bd: trap -;; c0: trap -;; c3: trap +;; 103: xzero x0 +;; 105: xmov x16, x4 +;; 108: call3 x16, x0, x1, 0x258 // target = 0x360 
+;; 110: jump -0x48 // target = 0xc8 +;; 115: trap +;; 118: trap +;; 11b: trap diff --git a/tests/disas/pulley-fusion-no-fire-user-mask.wat b/tests/disas/pulley-fusion-no-fire-user-mask.wat index c1bb950f61f8..2ce412df8cb9 100644 --- a/tests/disas/pulley-fusion-no-fire-user-mask.wat +++ b/tests/disas/pulley-fusion-no-fire-user-mask.wat @@ -87,6 +87,6 @@ ;; ret ;; d3: xmov x1, x17 ;; d6: xload64le_o32 x0, x1, 16 -;; dd: xload64le_o32 x0, x0, 408 -;; e4: call_indirect_host 52 +;; dd: xload64le_o32 x0, x0, 328 +;; e4: call_indirect_host 42 ;; e8: trap diff --git a/tests/disas/pulley/call.wat b/tests/disas/pulley/call.wat index 233ca7be3c35..d9bc3142fd99 100644 --- a/tests/disas/pulley/call.wat +++ b/tests/disas/pulley/call.wat @@ -8,9 +8,7 @@ ;; wasm[0]::function[1]: ;; push_frame ;; xload32le_o32 x3, x0, 28 -;; xmov x6, x0 -;; xload32le_o32 x0, x6, 36 -;; xmov x1, x6 -;; call_indirect x3 +;; xload32le_o32 x4, x0, 36 +;; call_indirect2 x3, x4, x0 ;; pop_frame ;; ret From 45c7f7e44e6dc3519cda38fa25d6d9b0b881ad86 Mon Sep 17 00:00:00 2001 From: Matt Hargett Date: Wed, 10 Jun 2026 19:18:24 -0700 Subject: [PATCH 22/22] adapt to the split table-mutability base The eager-table-initialization commits moved out of the base PR to land separately, and upstream moved MemFlags behind an interner. Two adaptations: - sink_pure_inst: resolve the MemFlags handle through dfg.mem_flags before checking readonly/notrap. - call_indirect lazy-init site: test the raw funcref value again (upstream behavior). Testing the masked value, which lets the Pulley backend absorb the band into the dispatch op, is only sound when every reachable slot is eagerly initialized; that variant returns with the eager-init PR. Goldens re-blessed accordingly: the xband*_s8_brif and band_funcref_dispatch families are dormant until then. The dispatch tail at the lazy-init site is xband ; xfuncref_dispatch ; call_indirectN -- three interpreter dispatches, down from five unfused. 
--- cranelift/codegen/src/machinst/lower.rs | 12 +- crates/cranelift/src/func_environ.rs | 18 +-- .../call-indirect-immutable-elide-null.wat | 51 +++++---- .../pulley-call-indirect-band-brif-fusion.wat | 103 +++++++++--------- tests/disas/pulley-fusion-fires-32bit.wat | 103 +++++++++--------- .../disas/pulley-fusion-fires-multi-call.wat | 54 +++++---- ...lley-fusion-fires-return-call-indirect.wat | 32 +++--- ...ulley-fusion-no-fire-sig-runtime-check.wat | 35 +++--- 8 files changed, 210 insertions(+), 198 deletions(-) diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index bdb0ac5edf50..5301c30ee454 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -1724,16 +1724,18 @@ impl<'func, I: VCodeInst> Lower<'func, I> { /// unnecessary because we're not moving a side-effecting op — we're /// telling the lowerer it has been handled elsewhere. pub fn sink_pure_inst(&mut self, ir_inst: Inst) { - let dfg_inst = &self.f.dfg.insts[ir_inst]; let is_pure = !has_lowering_side_effect(self.f, ir_inst); - let is_safe_load = matches!( - dfg_inst, + let is_safe_load = match &self.f.dfg.insts[ir_inst] { InstructionData::Load { opcode: crate::ir::Opcode::Load, flags, .. 
- } if flags.readonly() && flags.notrap() - ); + } => { + let flags = self.f.dfg.mem_flags[*flags]; + flags.readonly() && flags.notrap() + } + _ => false, + }; assert!(is_pure || is_safe_load); self.inst_absorbed_pure.insert(ir_inst); } diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index 744d13ad89f1..975a0a04377d 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -1074,20 +1074,12 @@ impl<'module_environment> FuncEnvironment<'module_environment> { let result_param = builder.append_block_param(continuation_block, pointer_type); builder.set_cold_block(null_block); - // Under `is_eagerly_initialized_funcref_table`, `value != 0` and - // `value_masked != 0` agree on every reachable slot, so we can - // test the masked result. The Pulley backend then fuses the - // `band + brif` pair. - let brif_cond = if self - .module - .is_eagerly_initialized_funcref_table(table_index) - { - value_masked - } else { - value - }; + // Branching on `value_masked` instead (letting the Pulley backend + // fuse the `band + brif` pair) requires a table whose slots are + // all eagerly initialized; that variant comes with eager + // initialization support. builder.ins().brif( - brif_cond, + value, continuation_block, &[value_masked.into()], null_block, diff --git a/tests/disas/call-indirect-immutable-elide-null.wat b/tests/disas/call-indirect-immutable-elide-null.wat index 78ae8359edcf..35e2e0c7f0db 100644 --- a/tests/disas/call-indirect-immutable-elide-null.wat +++ b/tests/disas/call-indirect-immutable-elide-null.wat @@ -27,8 +27,9 @@ ;; Fully cover the table — no null slot anywhere. 
(elem (i32.const 0) func $f1 $f2 $f3)) ;; function u0:0(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -41,8 +42,9 @@ ;; } ;; ;; function u0:1(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -55,8 +57,9 @@ ;; } ;; ;; function u0:2(i64 vmctx, i64) -> i32 tail { +;; region0 = 8 "VMContext+0x8" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; stack_limit = gv2 ;; @@ -69,14 +72,16 @@ ;; } ;; ;; function u0:3(i64 vmctx, i64, i32) -> i32 tail { +;; region0 = 8 "VMContext+0x8" +;; region1 = 1342177280 "DefinedTable(StaticModuleIndex(0), DefinedTableIndex(0))" ;; gv0 = vmctx -;; gv1 = load.i64 notrap aligned readonly gv0+8 +;; gv1 = load.i64 notrap aligned readonly region0 gv0+8 ;; gv2 = load.i64 notrap aligned gv1+24 ;; gv3 = vmctx ;; gv4 = load.i64 notrap aligned readonly can_move gv3+48 ;; sig0 = (i64 vmctx, i64) -> i32 tail ;; sig1 = (i64 vmctx, i32, i64) -> i64 tail -;; fn0 = colocated u805306368:9 sig1 +;; fn0 = colocated u805306368:7 sig1 ;; stack_limit = gv2 ;; ;; block0(v0: i64, v1: i64, v2: i32): @@ -84,28 +89,28 @@ ;; @0050 v5 = icmp uge v2, v4 ; v4 = 3 ;; @0050 v6 = uextend.i64 v2 ;; @0050 v7 = load.i64 notrap aligned readonly can_move v0+48 -;; v23 = iconst.i64 3 -;; @0050 v8 = ishl v6, v23 ; v23 = 3 -;; @0050 v9 = iadd v7, v8 -;; @0050 v10 = iconst.i64 0 -;; @0050 v11 = select_spectre_guard v5, v10, v9 ; v10 = 0 -;; @0050 v12 = load.i64 user5 aligned table v11 -;; v22 = iconst.i64 -2 -;; @0050 v13 = band v12, v22 ; v22 = -2 -;; @0050 brif v13, 
block3(v13), block2 +;; @0050 v8 = iconst.i64 3 +;; @0050 v9 = ishl v6, v8 ; v8 = 3 +;; @0050 v10 = iadd v7, v9 +;; @0050 v11 = iconst.i64 0 +;; @0050 v12 = select_spectre_guard v5, v11, v10 ; v11 = 0 +;; @0050 v13 = load.i64 user6 aligned region1 v12 +;; @0050 v14 = iconst.i64 -2 +;; @0050 v15 = band v13, v14 ; v14 = -2 +;; @0050 brif v13, block3(v15), block2 ;; ;; block2 cold: -;; @0050 v15 = iconst.i32 0 -;; @0050 v17 = uextend.i64 v2 -;; @0050 v18 = call fn0(v0, v15, v17) ; v15 = 0 -;; @0050 jump block3(v18) +;; @0050 v17 = iconst.i32 0 +;; @0050 v18 = uextend.i64 v2 +;; @0050 v19 = call fn0(v0, v17, v18) ; v17 = 0 +;; @0050 jump block3(v19) ;; -;; block3(v14: i64): -;; @0050 v19 = load.i64 notrap aligned readonly v14+8 -;; @0050 v20 = load.i64 notrap aligned readonly v14+24 -;; @0050 v21 = call_indirect sig0, v19(v20, v0) +;; block3(v16: i64): +;; @0050 v20 = load.i64 notrap aligned readonly v16+8 +;; @0050 v21 = load.i64 notrap aligned readonly v16+24 +;; @0050 v22 = call_indirect sig0, v20(v21, v0) ;; @0053 jump block1 ;; ;; block1: -;; @0053 return v21 +;; @0053 return v22 ;; } diff --git a/tests/disas/pulley-call-indirect-band-brif-fusion.wat b/tests/disas/pulley-call-indirect-band-brif-fusion.wat index 5cd6509e88db..178f86f72259 100644 --- a/tests/disas/pulley-call-indirect-band-brif-fusion.wat +++ b/tests/disas/pulley-call-indirect-band-brif-fusion.wat @@ -53,25 +53,28 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame_save 32, x16, x17, x24 +;; push_frame_save 16, x25 ;; xmov x3, x0 -;; br_if_xugteq32_u8 x2, 3, 0x4b // target = 0x64 +;; br_if_xugteq32_u8 x2, 3, 0x59 // target = 0x72 ;; 20: xmov x1, x3 ;; xload64le_o32 x0, x1, 48 ;; zext32 x15, x2 ;; xshl64_u6 x1, x15, 3 ;; xadd64 x0, x0, x1 -;; xload64le_o32 x0, x0, 0 -;; xband_funcref_dispatch_not_x64 x0, x17, x16, x0, 8, 24, 0x18 // target = 0x52 -;; xmov x24, x3 -;; call_indirect2 x17, x16, x24 -;; pop_frame_restore 32, x16, x17, x24 +;; xload64le_o32 x1, x0, 0 +;; xband64_s8 x0, x1, -2 +;; 
br_if_xeq64_i8 x1, 0, 0x22 // target = 0x60 +;; 45: xmov x25, x3 +;; xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; call_indirect2 x1, x0, x25 +;; pop_frame_restore 16, x25 ;; ret -;; 52: xzero x0 -;; 54: xmov x24, x3 -;; 57: call3 x24, x0, x15, 0x267 // target = 0x2be -;; 5f: jump -0x17 // target = 0x48 -;; 64: trap +;; 60: xzero x0 +;; 62: xmov x25, x3 +;; 65: call3 x25, x0, x15, 0x267 // target = 0x2cc +;; 6d: jump -0x25 // target = 0x48 +;; 72: trap ;; ╰─╼ trap: Normal(TableOutOfBounds) ;; ;; wasm[0]::array_to_wasm_trampoline[0]: @@ -82,19 +85,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0xb9 +;; xpcadd x15, 0x2a // target = 0xc7 ;; xstore64le_o32 x13, 80, x15 -;; call -0x9e // target = 0x0 +;; call -0xac // target = 0x0 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xb9 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc7 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; b9: xzero x0 -;; bb: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; c0: ret +;; c7: xzero x0 +;; c9: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ce: ret ;; ;; wasm[0]::array_to_wasm_trampoline[1]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -104,19 +107,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0x113 +;; xpcadd x15, 0x2a // target = 0x121 ;; xstore64le_o32 x13, 80, x15 -;; call -0xf3 // target = 0x5 +;; call -0x101 // target = 0x5 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default 
handler, no dynamic context, handler=0x113 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x121 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 113: xzero x0 -;; 115: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 11a: ret +;; 121: xzero x0 +;; 123: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 128: ret ;; ;; wasm[0]::array_to_wasm_trampoline[2]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -126,19 +129,19 @@ ;; xstore64le_o32 x13, 72, x14 ;; xmov x14, sp ;; xstore64le_o32 x13, 64, x14 -;; xpcadd x15, 0x2a // target = 0x16d +;; xpcadd x15, 0x2a // target = 0x17b ;; xstore64le_o32 x13, 80, x15 -;; call -0x147 // target = 0xb +;; call -0x155 // target = 0xb ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x16d +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x17b ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 16d: xzero x0 -;; 16f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 174: ret +;; 17b: xzero x0 +;; 17d: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 182: ret ;; ;; wasm[0]::array_to_wasm_trampoline[3]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -149,19 +152,19 @@ ;; xstore64le_o32 x15, 72, x2 ;; xmov x2, sp ;; xstore64le_o32 x15, 64, x2 -;; xpcadd x2, 0x2d // target = 0x1d1 +;; xpcadd 
x2, 0x2d // target = 0x1df ;; xstore64le_o32 x15, 80, x2 -;; call3 x0, x1, x14, -0x1a2 // target = 0x11 +;; call3 x0, x1, x14, -0x1b0 // target = 0x11 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1d1 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1df ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 1d1: xzero x0 -;; 1d3: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 1d8: ret +;; 1df: xzero x0 +;; 1e1: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1e6: ret ;; ;; signatures[0]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -180,15 +183,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x15, x0 -;; br_if_not32 x15, 0x13 // target = 0x230 -;; 223: xload32le_o32 x0, x16, 0 +;; br_if_not32 x15, 0x13 // target = 0x23e +;; 231: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 230: xmov x1, x17 -;; 233: xload64le_o32 x0, x1, 16 -;; 23a: xload64le_o32 x0, x0, 328 -;; 241: call_indirect_host 42 -;; 245: trap +;; 23e: xmov x1, x17 +;; 241: xload64le_o32 x0, x1, 16 +;; 248: xload64le_o32 x0, x0, 328 +;; 24f: call_indirect_host 42 +;; 253: trap ;; ;; signatures[1]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -208,15 +211,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x0, x0 -;; br_if_not32 x0, 0x13 // target = 0x2a6 -;; 299: xload32le_o32 x0, x16, 0 +;; br_if_not32 x0, 0x13 // target = 0x2b4 +;; 2a7: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 2a6: xmov x1, x17 -;; 2a9: xload64le_o32 x0, x1, 16 -;; 2b0: xload64le_o32 x0, x0, 328 -;; 2b7: call_indirect_host 42 -;; 2bb: trap +;; 2b4: xmov x1, x17 +;; 2b7: xload64le_o32 x0, x1, 
16 +;; 2be: xload64le_o32 x0, x0, 328 +;; 2c5: call_indirect_host 42 +;; 2c9: trap ;; ;; wasmtime_builtin_table_get_lazy_init_func_ref: ;; push_frame diff --git a/tests/disas/pulley-fusion-fires-32bit.wat b/tests/disas/pulley-fusion-fires-32bit.wat index 481ed46c6056..90ccab4100a9 100644 --- a/tests/disas/pulley-fusion-fires-32bit.wat +++ b/tests/disas/pulley-fusion-fires-32bit.wat @@ -49,22 +49,27 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame_save 32, x16, x17, x24 -;; br_if_xugteq32_u8 x2, 3, 0x45 // target = 0x5b +;; push_frame_save 16, x25 +;; br_if_xugteq32_u8 x2, 3, 0x58 // target = 0x6e ;; 1d: xload32le_o32 x15, x0, 24 -;; xmov x24, x0 +;; xmov x3, x0 ;; xshl32_u6 x0, x2, 2 ;; xadd32 x15, x15, x0 ;; xload32le_o32 x15, x15, 0 -;; xband_funcref_dispatch_not_x32 x0, x17, x16, x15, 4, 12, 0x15 // target = 0x49 -;; call_indirect2 x17, x16, x24 -;; pop_frame_restore 32, x16, x17, x24 +;; xband32_s8 x0, x15, -2 +;; br_if_not32 x15, 0x21 // target = 0x59 +;; 3e: xmov x25, x3 +;; xload32le_o32 x1, x0, 4 +;; xload32le_o32 x0, x0, 12 +;; call_indirect2 x1, x0, x25 +;; pop_frame_restore 16, x25 ;; ret -;; 49: xzero x0 -;; 4b: zext32 x1, x2 -;; 4e: call3 x24, x0, x1, 0x267 // target = 0x2b5 -;; 56: jump -0x17 // target = 0x3f -;; 5b: trap +;; 59: xzero x0 +;; 5b: zext32 x1, x2 +;; 5e: xmov x25, x3 +;; 61: call3 x25, x0, x1, 0x267 // target = 0x2c8 +;; 69: jump -0x28 // target = 0x41 +;; 6e: trap ;; ╰─╼ trap: Normal(TableOutOfBounds) ;; ;; wasm[0]::array_to_wasm_trampoline[0]: @@ -75,19 +80,19 @@ ;; xstore32le_o32 x13, 48, x14 ;; xmov x14, sp ;; xstore32le_o32 x13, 44, x14 -;; xpcadd x15, 0x2a // target = 0xb0 +;; xpcadd x15, 0x2a // target = 0xc3 ;; xstore32le_o32 x13, 52, x15 -;; call -0x95 // target = 0x0 +;; call -0xa8 // target = 0x0 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xb0 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0xc3 ;; xload64le_o32 x2, sp, 0 ;; 
xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; b0: xzero x0 -;; b2: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; b7: ret +;; c3: xzero x0 +;; c5: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; ca: ret ;; ;; wasm[0]::array_to_wasm_trampoline[1]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -97,19 +102,19 @@ ;; xstore32le_o32 x13, 48, x14 ;; xmov x14, sp ;; xstore32le_o32 x13, 44, x14 -;; xpcadd x15, 0x2a // target = 0x10a +;; xpcadd x15, 0x2a // target = 0x11d ;; xstore32le_o32 x13, 52, x15 -;; call -0xea // target = 0x5 +;; call -0xfd // target = 0x5 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x10a +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x11d ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 10a: xzero x0 -;; 10c: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 111: ret +;; 11d: xzero x0 +;; 11f: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 124: ret ;; ;; wasm[0]::array_to_wasm_trampoline[2]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -119,19 +124,19 @@ ;; xstore32le_o32 x13, 48, x14 ;; xmov x14, sp ;; xstore32le_o32 x13, 44, x14 -;; xpcadd x15, 0x2a // target = 0x164 +;; xpcadd x15, 0x2a // target = 0x177 ;; xstore32le_o32 x13, 52, x15 -;; call -0x13e // target = 0xb +;; call -0x151 // target = 0xb ;; ├─╼ exception frame offset: 
SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x164 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x177 ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 164: xzero x0 -;; 166: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 16b: ret +;; 177: xzero x0 +;; 179: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 17e: ret ;; ;; wasm[0]::array_to_wasm_trampoline[3]: ;; push_frame_save 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 @@ -142,19 +147,19 @@ ;; xstore32le_o32 x15, 48, x2 ;; xmov x2, sp ;; xstore32le_o32 x15, 44, x2 -;; xpcadd x2, 0x2d // target = 0x1c8 +;; xpcadd x2, 0x2d // target = 0x1db ;; xstore32le_o32 x15, 52, x2 -;; call3 x0, x1, x14, -0x199 // target = 0x11 +;; call3 x0, x1, x14, -0x1ac // target = 0x11 ;; ├─╼ exception frame offset: SP = FP - 0x90 -;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1c8 +;; ╰─╼ exception handler: default handler, no dynamic context, handler=0x1db ;; xload64le_o32 x2, sp, 0 ;; xstore32le_o32 x2, 0, x0 ;; xone x0 ;; pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 ;; ret -;; 1c8: xzero x0 -;; 1ca: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 -;; 1cf: ret +;; 1db: xzero x0 +;; 1dd: pop_frame_restore 144, x16, x17, x18, x19, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29, sp, spilltmp0 +;; 1e2: ret ;; ;; signatures[0]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -173,15 +178,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x15, x0 -;; br_if_not32 x15, 0x13 // target = 0x227 -;; 21a: 
xload32le_o32 x0, x16, 0 +;; br_if_not32 x15, 0x13 // target = 0x23a +;; 22d: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 227: xmov x1, x17 -;; 22a: xload32le_o32 x0, x1, 8 -;; 231: xload32le_o32 x0, x0, 164 -;; 238: call_indirect_host 42 -;; 23c: trap +;; 23a: xmov x1, x17 +;; 23d: xload32le_o32 x0, x1, 8 +;; 244: xload32le_o32 x0, x0, 164 +;; 24b: call_indirect_host 42 +;; 24f: trap ;; ;; signatures[1]::wasm_to_array_trampoline: ;; push_frame_save 32, x16, x17 @@ -201,15 +206,15 @@ ;; xmov x3, x16 ;; call_indirect_host 0 ;; zext8 x0, x0 -;; br_if_not32 x0, 0x13 // target = 0x29d -;; 290: xload32le_o32 x0, x16, 0 +;; br_if_not32 x0, 0x13 // target = 0x2b0 +;; 2a3: xload32le_o32 x0, x16, 0 ;; pop_frame_restore 32, x16, x17 ;; ret -;; 29d: xmov x1, x17 -;; 2a0: xload32le_o32 x0, x1, 8 -;; 2a7: xload32le_o32 x0, x0, 164 -;; 2ae: call_indirect_host 42 -;; 2b2: trap +;; 2b0: xmov x1, x17 +;; 2b3: xload32le_o32 x0, x1, 8 +;; 2ba: xload32le_o32 x0, x0, 164 +;; 2c1: call_indirect_host 42 +;; 2c5: trap ;; ;; wasmtime_builtin_table_get_lazy_init_func_ref: ;; push_frame diff --git a/tests/disas/pulley-fusion-fires-multi-call.wat b/tests/disas/pulley-fusion-fires-multi-call.wat index cfccdb5a0b27..abd94de07148 100644 --- a/tests/disas/pulley-fusion-fires-multi-call.wat +++ b/tests/disas/pulley-fusion-fires-multi-call.wat @@ -50,40 +50,46 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame_save 48, x16, x17, x18, x27, x28, x29 +;; push_frame_save 32, x16, x17, x28, x29 ;; xmov x29, x3 -;; br_if_xugteq32_u8 x2, 3, 0x95 // target = 0xae +;; br_if_xugteq32_u8 x2, 3, 0xb1 // target = 0xca ;; 20: xload64le_o32 x28, x0, 48 ;; xmov x4, x0 ;; zext32 x1, x2 ;; xshl64_u6 x0, x1, 3 ;; xadd64 x0, x28, x0 -;; xload64le_o32 x0, x0, 0 -;; xband_funcref_dispatch_not_x64 x0, x17, x16, x0, 8, 24, 0x50 // target = 0x8a -;; xmov x18, x4 -;; call_indirect2 x17, x16, x18 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x68 // target = 0xa6 
+;; 45: xmov x16, x4 +;; xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; call_indirect2 x1, x0, x16 ;; xmov x3, x29 -;; xmov x4, x18 +;; xmov x4, x16 ;; xmov x17, x0 -;; br_if_xugteq32_u8 x3, 3, 0x5c // target = 0xb1 -;; 5c: zext32 x1, x3 +;; br_if_xugteq32_u8 x3, 3, 0x6a // target = 0xcd +;; 6a: zext32 x1, x3 ;; xshl64_u6 x0, x1, 3 ;; xadd64 x0, x28, x0 -;; xload64le_o32 x0, x0, 0 -;; xband_funcref_dispatch_not_x64 x0, x27, x28, x0, 8, 24, 0x30 // target = 0x9c -;; xmov x16, x4 -;; call_indirect2 x27, x28, x16 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x3a // target = 0xb8 +;; 85: xmov x16, x4 +;; xload64le_o32 x1, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; call_indirect2 x1, x0, x16 ;; xmov x1, x17 ;; xadd32 x0, x1, x0 -;; pop_frame_restore 48, x16, x17, x18, x27, x28, x29 +;; pop_frame_restore 32, x16, x17, x28, x29 ;; ret -;; 8a: xzero x0 -;; 8c: xmov x18, x4 -;; 8f: call3 x18, x0, x1, 0x28f // target = 0x31e -;; 97: jump -0x4f // target = 0x48 -;; 9c: xzero x0 -;; 9e: xmov x16, x4 -;; a1: call3 x16, x0, x1, 0x27d // target = 0x31e -;; a9: jump -0x2f // target = 0x7a -;; ae: trap -;; b1: trap +;; a6: xzero x0 +;; a8: xmov x16, x4 +;; ab: call3 x16, x0, x1, 0x28f // target = 0x33a +;; b3: jump -0x6b // target = 0x48 +;; b8: xzero x0 +;; ba: xmov x16, x4 +;; bd: call3 x16, x0, x1, 0x27d // target = 0x33a +;; c5: jump -0x3d // target = 0x88 +;; ca: trap +;; cd: trap diff --git a/tests/disas/pulley-fusion-fires-return-call-indirect.wat b/tests/disas/pulley-fusion-fires-return-call-indirect.wat index db63dbc4f1a2..ae5faaba802c 100644 --- a/tests/disas/pulley-fusion-fires-return-call-indirect.wat +++ b/tests/disas/pulley-fusion-fires-return-call-indirect.wat @@ -37,26 +37,24 @@ ;; ret ;; ;; wasm[0]::function[1]: -;; push_frame_save 32, x16, x17, x24 -;; br_if_xneq32_i8 x2, 0, 0x5a // target = 0x64 -;; 11: xload64le_o32 x15, x0, 48 +;; push_frame_save 16, x25 +;; br_if32 x2, 0x58 // target = 0x62 +;; 10: xload64le_o32 x15, 
x0, 48 ;; xmov x1, x0 ;; zext32 x14, x2 ;; xshl64_u6 x0, x14, 3 ;; xadd64 x15, x15, x0 ;; xload64le_o32 x15, x15, 0 -;; xband_funcref_dispatch_not_x64 x0, x16, x17, x15, 8, 24, 0x1b // target = 0x46 -;; xmov x15, x16 -;; xmov x2, x0 -;; xmov x0, x17 -;; pop_frame_restore 32, x16, x17, x24 +;; xband64_s8 x0, x15, -2 +;; br_if_xeq64_i8 x15, 0, 0x22 // target = 0x50 +;; 35: xmov x25, x1 +;; xload64le_o32 x15, x0, 8 +;; xload64le_o32 x0, x0, 24 +;; xmov x1, x25 +;; pop_frame_restore 16, x25 ;; xjump x15 -;; 46: xzero x0 -;; xmov x24, x1 -;; call3 x24, x0, x14, 0x1bf // target = 0x20a -;; xmov x2, x0 -;; xmov x0, x17 -;; xmov x1, x24 -;; xmov x15, x16 -;; jump -0x20 // target = 0x3f -;; 64: trap +;; 50: xzero x0 +;; xmov x25, x1 +;; call3 x25, x0, x14, 0x1b3 // target = 0x208 +;; jump -0x25 // target = 0x38 +;; 62: trap diff --git a/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat b/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat index 1c3bade1689f..398c5fee2cb3 100644 --- a/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat +++ b/tests/disas/pulley-fusion-no-fire-sig-runtime-check.wat @@ -55,33 +55,34 @@ ;; ret ;; ;; wasm[0]::function[3]: -;; push_frame_save 16, x18, x29 +;; push_frame_save 16, x16, x18 ;; xmov x3, x0 -;; br_if_xugteq32_u8 x2, 3, 0x7f // target = 0x98 +;; br_if_xugteq32_u8 x2, 3, 0x82 // target = 0x9b ;; 20: xmov x1, x3 ;; xload64le_o32 x0, x1, 48 ;; zext32 x1, x2 ;; xmov x18, x2 ;; xshl64_u6 x2, x1, 3 ;; xadd64 x0, x0, x2 -;; xload64le_o32 x0, x0, 0 -;; xband64_s8_br_if_not_x64 x0, x0, -2, 0x49 // target = 0x86 -;; 45: xmov x29, x3 -;; br_if_xeq64_i8 x0, 0, 0x53 // target = 0x9b -;; 4f: xload32le_o32 x1, x0, 16 -;; xload64le_o32 x2, x29, 40 +;; xload64le_o32 x2, x0, 0 +;; xband64_s8 x0, x2, -2 +;; br_if_xeq64_i8 x2, 0, 0x48 // target = 0x89 +;; 48: xmov x16, x3 +;; br_if_xeq64_i8 x0, 0, 0x53 // target = 0x9e +;; 52: xload32le_o32 x1, x0, 16 +;; xload64le_o32 x2, x16, 40 ;; xload32le_o32 x2, x2, 0 -;; br_if_xneq32 x1, x2, 0x3a // 
target = 0x9e -;; 6b: xload64le_o32 x1, x0, 8 +;; br_if_xneq32 x1, x2, 0x3a // target = 0xa1 +;; 6e: xload64le_o32 x1, x0, 8 ;; xload64le_o32 x0, x0, 24 ;; xmov x2, x18 -;; call_indirect2 x1, x0, x29 -;; pop_frame_restore 16, x18, x29 +;; call_indirect2 x1, x0, x16 +;; pop_frame_restore 16, x16, x18 ;; ret -;; 86: xzero x0 -;; 88: xmov x29, x3 -;; 8b: call3 x29, x0, x1, 0x281 // target = 0x30c -;; 93: jump -0x4b // target = 0x48 -;; 98: trap +;; 89: xzero x0 +;; 8b: xmov x16, x3 +;; 8e: call3 x16, x0, x1, 0x281 // target = 0x30f +;; 96: jump -0x4b // target = 0x4b ;; 9b: trap ;; 9e: trap +;; a1: trap