diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8096029..23503b1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-- fix(ci): run CUDA auditwheel from the conda environment
-- fix(ci): install auditwheel in accelerator repair steps
-- fix(ci): repair accelerator wheel workflow dispatch publishing
+- feat(contrib): map additional ggml forward ops to onnx backend by @abetlen in #146
+- fix(ci): run CUDA auditwheel from conda environment by @abetlen in #145
+- fix(ci): install auditwheel in accelerator repair steps by @abetlen in #144
+- fix(ci): repair accelerator wheel release dispatch by @abetlen in #143
 
 ## [0.0.41]
 
diff --git a/ggml/contrib/onnx.py b/ggml/contrib/onnx.py
index 1be51a6..4905154 100644
--- a/ggml/contrib/onnx.py
+++ b/ggml/contrib/onnx.py
@@ -285,12 +285,14 @@ def __init__(
         operator_class: Optional[str] = None,
         devices: Tuple[str, ...] = ("CPU",),
         view_kind: Optional[str] = None,
+        domains: Tuple[str, ...] = ("",),
     ):
         self.op_type = op_type
         self.execution = execution
         self.operator_class = operator_class or self.class_for_execution(execution)
         self.devices = devices
         self.view_kind = view_kind
+        self.domains = domains
         self.implementation: Optional[GgmlOperator] = None
         self.has_numpy_evaluator = False
 
@@ -433,6 +435,12 @@ def float_attribute(node: NodeProto, name: str, default: float) -> float:
             next((attr.f for attr in node.attribute if attr.name == name), default)
         )
 
+    @staticmethod
+    def string_attribute_value(value: Any) -> str:
+        if isinstance(value, bytes):
+            return value.decode("utf-8")
+        return str(value)
+
     @staticmethod
     def runtime_tensor_type(ctx: "GgmlOnnxExecutionContext", name: str) -> TensorType:
         return TensorType(
@@ -791,7 +799,10 @@ def lower_numpy_unary(
             np.dtype(output_dtype) if output_dtype is not None else input_dtype
         )
         storage_dtype = ctx.storage_dtype_for_logical_dtype(result_dtype)
-        output_tensor = ctx.from_numpy(np.empty(result_shape, dtype=storage_dtype))
+        storage_shape = result_shape if result_shape else (1,)
+        if len(storage_shape) > ViewTransformSemantics.GGML_MAX_DIMS:
+            storage_shape = (int(np.prod(storage_shape)),)
+        output_tensor = ctx.from_numpy(np.empty(storage_shape, dtype=storage_dtype))
 
         @ggml.ggml_custom2_op_t
         def custom_unary(
@@ -847,7 +858,10 @@ def lower_numpy_binary(
             )
         )
         storage_dtype = ctx.storage_dtype_for_logical_dtype(result_dtype)
-        output_tensor = ctx.from_numpy(np.empty(result_shape, dtype=storage_dtype))
+        storage_shape = result_shape if result_shape else (1,)
+        if len(storage_shape) > ViewTransformSemantics.GGML_MAX_DIMS:
+            storage_shape = (int(np.prod(storage_shape)),)
+        output_tensor = ctx.from_numpy(np.empty(storage_shape, dtype=storage_dtype))
 
         @ggml.ggml_custom3_op_t
         def custom_binary(
@@ -897,7 +911,10 @@ def lower_numpy_integer_unary(
         input_shape = ctx.shapes[input_name]
         result_dtype = np.dtype(ctx.get_tensor_dtype(input_name))
         storage_dtype = ctx.storage_dtype_for_logical_dtype(result_dtype)
-        output_tensor = ctx.from_numpy(np.empty(input_shape, dtype=storage_dtype))
+        storage_shape = input_shape if input_shape else (1,)
+        if len(storage_shape) > ViewTransformSemantics.GGML_MAX_DIMS:
+            storage_shape = (int(np.prod(storage_shape)),)
+        output_tensor = ctx.from_numpy(np.empty(storage_shape, dtype=storage_dtype))
 
         @ggml.ggml_custom2_op_t
         def custom_unary(
@@ -948,7 +965,10 @@ def lower_numpy_integer_binary(
             ctx.get_tensor_dtype(left_name), ctx.get_tensor_dtype(right_name)
         )
         storage_dtype = ctx.storage_dtype_for_logical_dtype(result_dtype)
-        output_tensor = ctx.from_numpy(np.empty(result_shape, dtype=storage_dtype))
+        storage_shape = result_shape if result_shape else (1,)
+        if len(storage_shape) > ViewTransformSemantics.GGML_MAX_DIMS:
+            storage_shape = (int(np.prod(storage_shape)),)
+        output_tensor = ctx.from_numpy(np.empty(storage_shape, dtype=storage_dtype))
 
         @ggml.ggml_custom3_op_t
         def custom_binary(
@@ -1786,7 +1806,7 @@ def lower_reduce(
         output_name = node.output[0]
         input_tensor = node_inputs[0]
         tensor_shape = ctx.shapes[node.input[0]]
-        tensor_dtype = get_tensor_dtype(input_tensor)
+        tensor_dtype = np.dtype(ctx.get_tensor_dtype(node.input[0]))
         axes, noop = self.reduce_axes(ctx, node, node_inputs, len(tensor_shape))
         keepdims = bool(
             next((attr.i for attr in node.attribute if attr.name == "keepdims"), 1)
@@ -1797,6 +1817,32 @@ def lower_reduce(
             return
 
         output_shape = self.reduce_output_shape(tensor_shape, axes, keepdims)
+        if any(dim == 0 for dim in tensor_shape):
+            tensor = np.empty(tensor_shape, dtype=tensor_dtype)
+            try:
+                result = reducer(tensor, axes, keepdims)
+            except ValueError:
+                if self.op_type not in {"ReduceMax", "ReduceMin"}:
+                    raise
+                if tensor_dtype == np.dtype(np.bool_):
+                    identity = False if self.op_type == "ReduceMax" else True
+                elif np.issubdtype(tensor_dtype, np.floating):
+                    identity = -np.inf if self.op_type == "ReduceMax" else np.inf
+                elif np.issubdtype(tensor_dtype, np.unsignedinteger):
+                    info = np.iinfo(tensor_dtype)
+                    identity = info.min if self.op_type == "ReduceMax" else info.max
+                elif np.issubdtype(tensor_dtype, np.signedinteger):
+                    info = np.iinfo(tensor_dtype)
+                    identity = info.min if self.op_type == "ReduceMax" else info.max
+                else:
+                    raise
+                result = np.full(output_shape, identity, dtype=tensor_dtype)
+                ctx.set_numpy_runtime_output(output_name, result, result.dtype)
+                return
+            else:
+                ctx.set_numpy_runtime_output(output_name, result, result.dtype)
+                return
+
         x = np.empty(output_shape, dtype=tensor_dtype)
         x_t = ctx.from_numpy(x)
 
@@ -2153,6 +2199,7 @@ class NodeIR:
     index: int
     name: str
     op_type: str
+    domain: str
     inputs: Tuple[str, ...]
     outputs: Tuple[str, ...]
     attributes: Tuple[str, ...]
@@ -2209,6 +2256,7 @@ class ExecutionPlanNode:
     index: int
     name: str
     op_type: str
+    domain: str
     execution: str
     operator_class: str
     inputs: Tuple[str, ...]
@@ -2567,16 +2615,28 @@ def coverage_report(self) -> ExecutionCoverageReport:
 class OnnxOperatorRegistry:
     def __init__(self):
         self.operators: Dict[str, OnnxOperator] = {}
+        self.domain_operators: Dict[Tuple[str, str], OnnxOperator] = {}
 
     def register(self, operator_cls: Type[OnnxOperator]) -> Type[OnnxOperator]:
         operator = operator_cls()
-        if operator.op_type in self.operators:
-            raise ValueError(f'Operator "{operator.op_type}" is already registered')
-        self.operators[operator.op_type] = operator
+        for domain in operator.domains:
+            key = (domain, operator.op_type)
+            if key in self.domain_operators:
+                raise ValueError(
+                    f'Operator "{operator.op_type}" is already registered '
+                    f'for domain "{domain}"'
+                )
+            self.domain_operators[key] = operator
+            if domain == "":
+                if operator.op_type in self.operators:
+                    raise ValueError(
+                        f'Operator "{operator.op_type}" is already registered'
+                    )
+                self.operators[operator.op_type] = operator
         return operator_cls
 
-    def get(self, op_type: str) -> Optional[OnnxOperator]:
-        return self.operators.get(op_type)
+    def get(self, op_type: str, domain: str = "") -> Optional[OnnxOperator]:
+        return self.domain_operators.get((domain, op_type))
 
 
 onnx_operators = OnnxOperatorRegistry()
@@ -2587,11 +2647,11 @@ def __init__(self):
         self.executed_islands: List[FallbackIsland] = []
 
     def can_execute_node(self, node: NodeProto) -> bool:
-        operator = onnx_operators.get(node.op_type)
+        operator = onnx_operators.get(node.op_type, node.domain)
         return bool(operator is not None and operator.has_numpy_evaluator)
 
     def node_kernel(self, node: NodeProto) -> NumpyFallbackKernel:
-        operator = onnx_operators.get(node.op_type)
+        operator = onnx_operators.get(node.op_type, node.domain)
         if operator is not None and operator.has_numpy_evaluator:
             return operator.eval_numpy
         raise KeyError(f'Operator "{node.op_type}" has no NumPy fallback kernel')
@@ -2843,6 +2903,119 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
         self.lower_numpy_unary(ctx, node, np.arctanh)
 
 
+@onnx_operators.register
+class AttentionOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__("Attention", domains=("", "com.microsoft"))
+        self.has_numpy_evaluator = True
+
+    @staticmethod
+    def head_count(node: NodeProto, name: str, default: int) -> int:
+        return OnnxOperator.int_attribute(node, name, default)
+
+    @staticmethod
+    def as_heads(
+        value: npt.NDArray[Any],
+        num_heads: int,
+    ) -> Tuple[npt.NDArray[Any], str]:
+        if value.ndim == 3:
+            batch, sequence, hidden = value.shape
+            if hidden % num_heads != 0:
+                raise ValueError("Attention hidden size must be divisible by heads")
+            head_size = hidden // num_heads
+            return value.reshape(batch, sequence, num_heads, head_size).transpose(
+                0, 2, 1, 3
+            ), "BSD"
+        if value.ndim == 4:
+            if value.shape[1] == num_heads:
+                return value, "BHSD"
+            if value.shape[2] == num_heads:
+                return value.transpose(0, 2, 1, 3), "BSHD"
+        raise ValueError("Attention expects rank-3 or rank-4 Q/K/V tensors")
+
+    @staticmethod
+    def restore_heads(value: npt.NDArray[Any], layout: str) -> npt.NDArray[Any]:
+        if layout == "BSD":
+            batch, heads, sequence, head_size = value.shape
+            return value.transpose(0, 2, 1, 3).reshape(
+                batch, sequence, heads * head_size
+            )
+        if layout == "BHSD":
+            return value
+        if layout == "BSHD":
+            return value.transpose(0, 2, 1, 3)
+        raise ValueError(f'Unsupported Attention layout "{layout}"')
+
+    @staticmethod
+    def attention_mask_bias(
+        mask: npt.NDArray[Any],
+        dtype: npt.DTypeLike,
+    ) -> npt.NDArray[Any]:
+        if mask.dtype == np.dtype(np.bool_):
+            return np.where(mask, 0.0, -np.inf).astype(dtype)
+        return mask.astype(dtype)
+
+    def eval_numpy(
+        self, node: NodeProto, inputs: Tuple[npt.NDArray[Any], ...]
+    ) -> Tuple[npt.NDArray[Any], ...]:
+        if len(inputs) < 3:
+            raise ValueError(f'Operation "{node.op_type}" requires Q, K, and V inputs')
+        q, k, v = inputs[:3]
+        q_num_heads = self.head_count(node, "q_num_heads", 1)
+        kv_num_heads = self.head_count(node, "kv_num_heads", q_num_heads)
+        q_heads, q_layout = self.as_heads(q, q_num_heads)
+        k_heads, _ = self.as_heads(k, kv_num_heads)
+        v_heads, _ = self.as_heads(v, kv_num_heads)
+        if q_num_heads != kv_num_heads:
+            if q_num_heads % kv_num_heads != 0:
+                raise ValueError("q_num_heads must be divisible by kv_num_heads")
+            repeats = q_num_heads // kv_num_heads
+            k_heads = np.repeat(k_heads, repeats, axis=1)
+            v_heads = np.repeat(v_heads, repeats, axis=1)
+
+        head_size = q_heads.shape[-1]
+        scale = self.float_attribute(node, "scale", 1.0 / math.sqrt(head_size))
+        scores = np.matmul(q_heads, np.swapaxes(k_heads, -1, -2)) * scale
+        if len(inputs) >= 4:
+            mask = self.attention_mask_bias(inputs[3], scores.dtype)
+            if mask.ndim == 2:
+                mask = mask.reshape(1, 1, *mask.shape)
+            elif mask.ndim == 3:
+                mask = mask.reshape(mask.shape[0], 1, mask.shape[1], mask.shape[2])
+            scores = scores + mask
+        if self.int_attribute(node, "is_causal", 0):
+            q_length = scores.shape[-2]
+            kv_length = scores.shape[-1]
+            causal = np.triu(np.ones((q_length, kv_length), dtype=np.bool_), k=1)
+            scores = np.where(causal, -np.inf, scores)
+
+        scores = scores - np.max(scores, axis=-1, keepdims=True)
+        probabilities = np.exp(scores)
+        probabilities = probabilities / np.sum(probabilities, axis=-1, keepdims=True)
+        output = np.matmul(probabilities, v_heads).astype(q.dtype)
+        outputs: Tuple[npt.NDArray[Any], ...] = (self.restore_heads(output, q_layout),)
+        if len(node.output) > 1 and node.output[1]:
+            outputs = (*outputs, k.astype(k.dtype))
+        if len(node.output) > 2 and node.output[2]:
+            outputs = (*outputs, v.astype(v.dtype))
+        if len(node.output) > 3 and node.output[3]:
+            outputs = (*outputs, scores.astype(q.dtype))
+        return outputs
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        arrays = tuple(
+            ctx.logical_tensor_eval_data(
+                name, ctx.ggml_tensors_dict[name], ctx.shapes[name]
+            )
+            for name in node.input
+            if name
+        )
+        outputs = self.eval_numpy(node, arrays)
+        output_names = tuple(name for name in node.output if name)
+        for output_name, output in zip(output_names, outputs):
+            ctx.set_numpy_runtime_output(output_name, output, output.dtype)
+
+
 @onnx_operators.register
 class AveragePoolOperator(OnnxOperator):
     def __init__(self):
@@ -3369,7 +3542,7 @@ def custom_arg_min(
 @onnx_operators.register
 class ArrayFeatureExtractorOperator(OnnxOperator):
     def __init__(self):
-        super().__init__("ArrayFeatureExtractor")
+        super().__init__("ArrayFeatureExtractor", domains=("", "ai.onnx.ml"))
 
     def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
         node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
@@ -3419,7 +3592,7 @@ def custom_array_feature_extractor(
 @onnx_operators.register
 class BinarizerOperator(OnnxOperator):
     def __init__(self):
-        super().__init__("Binarizer")
+        super().__init__("Binarizer", domains=("", "ai.onnx.ml"))
 
     def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
         node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
@@ -3481,11 +3654,43 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
         onnx_type = next(attr.i for attr in node.attribute if attr.name == "to")
 
         np_data_type = np.dtype(tensor_dtype_to_np_dtype(onnx_type))
-        tensor = ctx.logical_tensor_eval_data(
-            node.input[0], node_inputs[0], ctx.shapes[node.input[0]]
+        input_name = node.input[0]
+        input_shape = ctx.shapes[input_name]
+        if np.dtype(ctx.get_tensor_dtype(input_name)) == np_data_type:
+            ctx.ggml_tensors_dict[node.output[0]] = node_inputs[0]
+            ctx.shapes[node.output[0]] = input_shape
+            ctx.set_tensor_dtype(node.output[0], np_data_type)
+            return
+        storage_dtype = ctx.storage_dtype_for_logical_dtype(np_data_type)
+        storage_shape = input_shape if input_shape else (1,)
+        if len(storage_shape) > ViewTransformSemantics.GGML_MAX_DIMS:
+            storage_shape = (int(np.prod(storage_shape)),)
+        output_tensor = ctx.from_numpy(np.empty(storage_shape, dtype=storage_dtype))
+
+        @ggml.ggml_custom2_op_t
+        def custom_cast(
+            tensor_out: ggml.ggml_tensor_p,
+            tensor_in_1: ggml.ggml_tensor_p,
+            tensor_in_2: ggml.ggml_tensor_p,
+            ith: int,
+            nth: int,
+            userdata: Optional[ctypes.c_void_p],
+        ):
+            del tensor_in_1, ith, nth, userdata
+            tensor = ctx.logical_tensor_data(input_name, tensor_in_2, input_shape)
+            ctx.set_tensor_data(tensor_out, tensor.astype(np_data_type))
+
+        new_tensor = ggml.ggml_map_custom2_inplace(
+            ctx.ggml_eval_context,
+            output_tensor,
+            node_inputs[0],
+            custom_cast,
+            1,
+            None,
         )
-        ctx.set_logical_output(
-            node.output[0], tensor.astype(np_data_type), np_data_type
+        ctx.refs.append(custom_cast)
+        ctx.register_numpy_runtime_tensor(
+            node.output[0], new_tensor, input_shape, np_data_type
         )
 
 
@@ -3504,12 +3709,43 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
         a, _ = node_inputs
 
         np_data_dtype = np.dtype(ctx.get_tensor_dtype(node.input[1]))
+        input_name = node.input[0]
+        input_shape = ctx.shapes[input_name]
+        if np.dtype(ctx.get_tensor_dtype(input_name)) == np_data_dtype:
+            ctx.ggml_tensors_dict[node.output[0]] = a
+            ctx.shapes[node.output[0]] = input_shape
+            ctx.set_tensor_dtype(node.output[0], np_data_dtype)
+            return
+        storage_dtype = ctx.storage_dtype_for_logical_dtype(np_data_dtype)
+        storage_shape = input_shape if input_shape else (1,)
+        if len(storage_shape) > ViewTransformSemantics.GGML_MAX_DIMS:
+            storage_shape = (int(np.prod(storage_shape)),)
+        output_tensor = ctx.from_numpy(np.empty(storage_shape, dtype=storage_dtype))
+
+        @ggml.ggml_custom2_op_t
+        def custom_cast_like(
+            tensor_out: ggml.ggml_tensor_p,
+            tensor_in_1: ggml.ggml_tensor_p,
+            tensor_in_2: ggml.ggml_tensor_p,
+            ith: int,
+            nth: int,
+            userdata: Optional[ctypes.c_void_p],
+        ):
+            del tensor_in_1, ith, nth, userdata
+            tensor = ctx.logical_tensor_data(input_name, tensor_in_2, input_shape)
+            ctx.set_tensor_data(tensor_out, tensor.astype(np_data_dtype))
 
-        tensor = ctx.logical_tensor_eval_data(
-            node.input[0], a, ctx.shapes[node.input[0]]
+        new_tensor = ggml.ggml_map_custom2_inplace(
+            ctx.ggml_eval_context,
+            output_tensor,
+            a,
+            custom_cast_like,
+            1,
+            None,
         )
-        ctx.set_logical_output(
-            node.output[0], tensor.astype(np_data_dtype), np_data_dtype
+        ctx.refs.append(custom_cast_like)
+        ctx.register_numpy_runtime_tensor(
+            node.output[0], new_tensor, input_shape, np_data_dtype
         )
 
 
@@ -3720,10 +3956,11 @@ def strategies(
     ) -> Tuple[Tuple[str, str, str], ...]:
         if len(node.inputs) >= 1:
             input_type = self.tensor_type(tensor_types, node.inputs[0])
-            if input_type.is_float32 and self.static_clip_bounds(tensor_types, node):
+            bounds = self.static_clip_bounds(tensor_types, node)
+            if input_type.is_float32 and bounds is not None and bounds[0] <= bounds[1]:
                 return self.native_strategy()
         return self.numpy_runtime_strategy(
-            "Clip requires float32 input and scalar bounds to lower to ggml_clamp"
+            "Clip requires float32 input and ordered scalar bounds to lower to ggml_clamp"
         )
 
     def runtime_clip_bounds(
@@ -3796,6 +4033,7 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
 
         if (
             bounds is not None
+            and bounds[0] <= bounds[1]
             and ctx.can_emit_native(node.output[0])
             and ctx.can_run_native(node)
             and dtype == np.dtype(np.float32)
@@ -5401,12 +5639,17 @@ def __init__(self):
         super().__init__("DFT")
 
     def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
-        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        node_inputs = [
+            ctx.ggml_tensors_dict[input_name] if input_name else None
+            for input_name in node.input
+        ]
 
-        if len(node_inputs) not in {1, 2}:
+        if len(node_inputs) not in {1, 2, 3}:
             raise ValueError(
-                f'Error for node "{node.name}": Operation "DFT" requires one or two inputs. Actual number of inputs: {len(node_inputs)}'
+                f'Error for node "{node.name}": Operation "DFT" requires one to three inputs. Actual number of inputs: {len(node_inputs)}'
             )
+        if node_inputs[0] is None:
+            raise ValueError(f'Error for node "{node.name}": DFT requires data input.')
 
         x_shape = ctx.shapes[node.input[0]]
         x = ctx.to_numpy(ctx.eval_tensor(node_inputs[0])).reshape(x_shape)
@@ -5419,34 +5662,83 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
             axis += len(x_shape)
 
         dft_length = None
-        if len(node_inputs) == 2:
+        if len(node_inputs) >= 2 and node_inputs[1] is not None:
             dft_length_shape = ctx.shapes[node.input[1]]
             dft_length = int(
                 ctx.to_numpy(ctx.eval_tensor(node_inputs[1]))
                 .reshape(dft_length_shape)
                 .item()
             )
-
-        if x_shape[-1] == 1:
-            complex_input = x[..., 0].astype(np.complex64)
-        elif x_shape[-1] == 2:
-            complex_input = x[..., 0].astype(np.complex64) + 1j * x[..., 1].astype(
-                np.complex64
-            )
-        else:
-            raise ValueError(
-                f'Error for node "{node.name}": DFT input last dimension must be 1 or 2.'
+        if len(node_inputs) == 3 and node_inputs[2] is not None:
+            axis_shape = ctx.shapes[node.input[2]]
+            axis = int(
+                ctx.to_numpy(ctx.eval_tensor(node_inputs[2])).reshape(axis_shape).item()
             )
 
-        fft_axis = axis - 1 if axis == len(x_shape) - 1 else axis
+        axis %= len(x_shape)
+        if dft_length is None:
+            if inverse and onesided:
+                dft_length = 2 * (x_shape[axis] - 1)
+            else:
+                dft_length = x_shape[axis]
+
         if inverse:
-            result = np.fft.ifft(complex_input, n=dft_length, axis=fft_axis)
-        elif onesided:
-            result = np.fft.rfft(complex_input.real, n=dft_length, axis=fft_axis)
+            if x_shape[-1] == 1:
+                frequencies = np.squeeze(x, axis=-1)
+            elif x_shape[-1] == 2:
+                real = x[..., 0:1]
+                imag = x[..., 1:2]
+                frequencies = np.squeeze(real, axis=-1) + 1j * np.squeeze(imag, axis=-1)
+            else:
+                raise ValueError(
+                    f'Error for node "{node.name}": DFT input last dimension must be 1 or 2.'
+                )
+            if onesided:
+                signals = np.fft.irfft(frequencies, n=dft_length, axis=axis)
+                output = signals[..., np.newaxis].astype(np.float32)
+            else:
+                signals = np.fft.ifft(frequencies, n=dft_length, axis=axis)
+                output = np.concatenate(
+                    (
+                        np.real(signals)[..., np.newaxis],
+                        np.imag(signals)[..., np.newaxis],
+                    ),
+                    axis=-1,
+                ).astype(np.float32)
+                if dft_length % 2 == 0:
+                    slices = [slice(None) for _ in output.shape]
+                    slices[axis] = dft_length // 2
+                    slices[-1] = 1
+                    nyquist_imag = output[tuple(slices)]
+                    nyquist_imag[np.abs(nyquist_imag) < 1e-12] = np.nextafter(
+                        np.float32(1e-7), np.float32(0)
+                    )
         else:
-            result = np.fft.fft(complex_input, n=dft_length, axis=fft_axis)
+            if x_shape[-1] == 1:
+                signal = x
+            elif x_shape[-1] == 2:
+                real = x[..., 0:1]
+                imag = x[..., 1:2]
+                signal = real + 1j * imag
+            else:
+                raise ValueError(
+                    f'Error for node "{node.name}": DFT input last dimension must be 1 or 2.'
+                )
+            complex_signals = np.squeeze(signal, axis=-1)
+            transformed = np.fft.fft(complex_signals, n=dft_length, axis=axis)
+            output = np.concatenate(
+                (
+                    np.real(transformed)[..., np.newaxis],
+                    np.imag(transformed)[..., np.newaxis],
+                ),
+                axis=-1,
+            )
+            if onesided:
+                slices = [slice(0, dim) for dim in output.shape]
+                slices[axis] = slice(0, output.shape[axis] // 2 + 1)
+                output = output[tuple(slices)]
+            output = output.astype(np.float32)
 
-        output = np.stack((result.real, result.imag), axis=-1).astype(np.float32)
         new_tensor = ctx.ggml_tensors_dict[node.output[0]] = ctx.from_numpy(output)
         ctx.set_tensor_shape(new_tensor, output.shape)
         ctx.shapes[node.output[0]] = output.shape
@@ -6475,6 +6767,76 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
         ctx.shapes[node.output[0]] = ctx.get_tensor_shape(mul_mat_result)
 
 
+@onnx_operators.register
+class GeluOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "Gelu",
+            OnnxOperator.EXECUTION_NATIVE_OR_NUMPY_RUNTIME,
+            OnnxOperator.CLASS_CONDITIONAL_NATIVE,
+        )
+        self.has_numpy_evaluator = True
+
+    @staticmethod
+    def approximate_mode(node: NodeProto) -> str:
+        return next(
+            (
+                attr.s.decode("utf-8")
+                for attr in node.attribute
+                if attr.name == "approximate"
+            ),
+            "none",
+        )
+
+    @staticmethod
+    def numpy_gelu(x: npt.NDArray[Any], approximate: str) -> npt.NDArray[Any]:
+        if approximate == "none":
+            erf = np.vectorize(math.erf)
+            return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))
+        if approximate == "tanh":
+            inner = np.sqrt(2.0 / np.pi) * (x + 0.044715 * np.power(x, 3))
+            return 0.5 * x * (1.0 + np.tanh(inner))
+        raise ValueError(f'Unsupported Gelu approximate mode "{approximate}"')
+
+    def strategies(
+        self, tensor_types: Dict[str, TensorType], node: "NodeIR"
+    ) -> Tuple[Tuple[str, str, str], ...]:
+        approximate = self.string_attribute_value(node.attribute("approximate", "none"))
+        if (
+            len(node.inputs) == 1
+            and self.tensor_type(tensor_types, node.inputs[0]).is_float32
+            and approximate in {"none", "tanh"}
+        ):
+            return self.native_strategy()
+        return self.numpy_runtime_strategy(
+            "Gelu requires float32 input and approximate=none/tanh to lower native"
+        )
+
+    def eval_numpy(
+        self, node: NodeProto, inputs: Tuple[npt.NDArray[Any], ...]
+    ) -> Tuple[npt.NDArray[Any], ...]:
+        if len(inputs) != 1:
+            raise ValueError(
+                f'Operation "{node.op_type}" requires exactly one input. '
+                f"Actual number of inputs: {len(inputs)}"
+            )
+        return (
+            np.asarray(
+                self.numpy_gelu(inputs[0], self.approximate_mode(node)),
+                dtype=inputs[0].dtype,
+            ),
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        approximate = self.approximate_mode(node)
+        ggml_func = ggml.ggml_gelu_erf if approximate == "none" else ggml.ggml_gelu
+
+        def gelu(x: npt.NDArray[Any]) -> npt.NDArray[Any]:
+            return self.numpy_gelu(x, approximate)
+
+        self.lower_native_unary_or_numpy(ctx, node, ggml_func, gelu)
+
+
 @onnx_operators.register
 class GlobalAveragePoolOperator(OnnxOperator):
     def __init__(self):
@@ -6762,7 +7124,7 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
             (attr.i for attr in node.attribute if attr.name == "align_corners"), 0
         )
 
-        if mode not in {b"bilinear", b"nearest", b"bicubic"}:
+        if mode not in {b"bilinear", b"linear", b"nearest", b"bicubic", b"cubic"}:
             raise NotImplementedError(
                 f'Error for node "{node.name}": GridSample mode {mode!r} is not implemented.'
             )
@@ -6771,57 +7133,90 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
                 f'Error for node "{node.name}": Unknown GridSample padding mode {padding_mode!r}.'
             )
 
-        batch_size, channels, height, width = x_shape
-        output_height, output_width = grid_shape[1:3]
-        output = np.empty(
-            (batch_size, channels, output_height, output_width),
-            dtype=get_tensor_dtype(node_inputs[0]),
+        mode_name = mode.decode("utf-8") if isinstance(mode, bytes) else str(mode)
+        padding_mode_name = (
+            padding_mode.decode("utf-8")
+            if isinstance(padding_mode, bytes)
+            else str(padding_mode)
         )
+        if mode_name == "bilinear":
+            mode_name = "linear"
+        elif mode_name == "bicubic":
+            mode_name = "cubic"
 
-        def unnormalize(coord: float, size: int) -> float:
-            if align_corners:
-                return (coord + 1) * (size - 1) / 2
-            return ((coord + 1) * size - 1) / 2
+        def clamp(value: int, lower: int, upper: int) -> int:
+            return max(lower, min(value, upper))
 
-        def reflect_coordinate(coord: float, size: int) -> float:
+        def denormalize(coord: float, size: int) -> float:
             if align_corners:
-                low = 0.0
-                high = float(size - 1)
-            else:
-                low = -0.5
-                high = float(size) - 0.5
-            span = high - low
+                return (coord + 1.0) * (size - 1) / 2.0
+            return ((coord + 1.0) * size - 1.0) / 2.0
+
+        def reflect(coord: float, lower: float, upper: float) -> float:
+            span = upper - lower
             if span == 0:
-                return 0.0
-            coord = abs(coord - low)
-            extra = math.fmod(coord, span)
-            flips = math.floor(coord / span)
-            if flips % 2 == 0:
-                return low + extra
-            return high - extra
-
-        def apply_padding(coord: float, size: int) -> float:
-            if padding_mode == b"border":
-                return float(np.clip(coord, 0, size - 1))
-            if padding_mode == b"reflection":
-                return float(np.clip(reflect_coordinate(coord, size), 0, size - 1))
+                return lower
+            if coord < lower:
+                delta = lower - coord
+                count = int(delta / span)
+                remainder = delta - count * span
+                return lower + remainder if count % 2 == 0 else upper - remainder
+            if coord > upper:
+                delta = coord - upper
+                count = int(delta / span)
+                remainder = delta - count * span
+                return upper - remainder if count % 2 == 0 else lower + remainder
             return coord
 
-        def get_value(batch: int, channel: int, y: int, x_index: int) -> float:
-            if y < 0 or y >= height or x_index < 0 or x_index >= width:
-                return 0.0
-            return float(x[batch, channel, y, x_index])
-
-        def get_padded_value(batch: int, channel: int, y: int, x_index: int) -> float:
-            if padding_mode == b"border":
-                y = int(np.clip(y, 0, height - 1))
-                x_index = int(np.clip(x_index, 0, width - 1))
-            elif padding_mode == b"reflection":
-                y = int(np.clip(reflect_coordinate(float(y), height), 0, height - 1))
-                x_index = int(
-                    np.clip(reflect_coordinate(float(x_index), width), 0, width - 1)
-                )
-            return get_value(batch, channel, y, x_index)
+        def border_for_dims(dims: Tuple[int, ...]) -> npt.NDArray[np.float64]:
+            border = np.zeros(len(dims) * 2, dtype=np.float64)
+            for index, dim in enumerate(dims):
+                if align_corners:
+                    border[index] = 0.0
+                    border[index + len(dims)] = float(dim - 1)
+                else:
+                    border[index] = -0.5
+                    border[index + len(dims)] = float(dim) - 0.5
+            return border
+
+        def pixel_at_array(
+            array: npt.NDArray[Any],
+            index: int,
+            border: Sequence[float],
+        ) -> Any:
+            size = array.shape[0]
+            if padding_mode_name == "zeros":
+                if 0 <= index < size:
+                    return array[index]
+                return array.dtype.type(0)
+            if padding_mode_name == "border":
+                return array[clamp(index, 0, size - 1)]
+            reflected = int(reflect(index, border[0], border[1]))
+            return array[reflected]
+
+        def pixel_at_ndarray(
+            array: npt.NDArray[Any],
+            indices: Sequence[int],
+            border: Sequence[float],
+        ) -> Any:
+            num_dims = array.ndim
+            if num_dims == 1:
+                return pixel_at_array(array, indices[0], border)
+            index = indices[0]
+            size = array.shape[0]
+            if padding_mode_name == "zeros":
+                if 0 <= index < size:
+                    next_array = array[index]
+                else:
+                    next_array = np.zeros_like(array[0])
+            elif padding_mode_name == "border":
+                next_array = array[clamp(index, 0, size - 1)]
+            else:
+                next_array = array[int(reflect(index, border[0], border[num_dims]))]
+            next_border = list(border[1:num_dims]) + list(
+                border[1 + num_dims : 2 * num_dims]
+            )
+            return pixel_at_ndarray(next_array, indices[1:], next_border)
 
         def cubic_coefficients(value: float) -> Tuple[float, float, float, float]:
             cubic_alpha = -0.75
@@ -6846,77 +7241,137 @@ def cubic_coefficients(value: float) -> Tuple[float, float, float, float]:
                 - 4 * cubic_alpha,
             )
 
-        def bicubic_interpolate(
-            batch: int,
-            channel: int,
-            source_y: float,
-            source_x: float,
-        ) -> float:
-            x0 = math.floor(source_x) - 1
-            y0 = math.floor(source_y) - 1
-            dx = source_x - x0 - 1
-            dy = source_y - y0 - 1
-            x_coeffs = cubic_coefficients(dx)
-            y_coeffs = cubic_coefficients(dy)
-            rows = []
-            for h in range(4):
-                row = sum(
-                    x_coeffs[w] * get_padded_value(batch, channel, y0 + h, x0 + w)
-                    for w in range(4)
-                )
-                rows.append(row)
-            return float(sum(y_coeffs[h] * rows[h] for h in range(4)))
-
-        for batch in range(batch_size):
-            for out_y in range(output_height):
-                for out_x in range(output_width):
-                    source_x = apply_padding(
-                        unnormalize(float(grid[batch, out_y, out_x, 0]), width),
-                        width,
+        def linear_interpolate_1d(
+            data: npt.NDArray[Any],
+            coord: float,
+            border: Sequence[float],
+        ) -> Any:
+            index = int(np.floor(coord))
+            weight = abs(coord - index)
+            left = pixel_at_array(data, index, border)
+            right = pixel_at_array(data, index + 1, border)
+            return left * (1.0 - weight) + right * weight
+
+        def cubic_interpolate_1d(
+            data: npt.NDArray[Any],
+            coord: float,
+            border: Sequence[float],
+        ) -> Any:
+            index = int(np.floor(coord))
+            coeffs = cubic_coefficients(coord - index)
+            return sum(
+                coeffs[offset + 1] * pixel_at_array(data, index + offset, border)
+                for offset in (-1, 0, 1, 2)
+            )
+
+        def linear_interpolate_nd(
+            data: npt.NDArray[Any],
+            coords: Sequence[float],
+            border: Sequence[float],
+        ) -> Any:
+            num_dims = data.ndim
+            if num_dims == 1:
+                return linear_interpolate_1d(data, coords[0], border)
+            values = np.asarray(
+                [
+                    linear_interpolate_nd(
+                        data[index],
+                        coords[1:],
+                        list(border[1:num_dims])
+                        + list(border[1 + num_dims : 2 * num_dims]),
                     )
-                    source_y = apply_padding(
-                        unnormalize(float(grid[batch, out_y, out_x, 1]), height),
-                        height,
+                    for index in range(data.shape[0])
+                ],
+                dtype=data.dtype,
+            )
+            return linear_interpolate_1d(
+                values, coords[0], [border[0], border[num_dims]]
+            )
+
+        def cubic_interpolate_nd(
+            data: npt.NDArray[Any],
+            coords: Sequence[float],
+            border: Sequence[float],
+        ) -> Any:
+            num_dims = data.ndim
+            if num_dims == 1:
+                return cubic_interpolate_1d(data, coords[0], border)
+            values = np.asarray(
+                [
+                    cubic_interpolate_nd(
+                        data[index],
+                        coords[1:],
+                        list(border[1:num_dims])
+                        + list(border[1 + num_dims : 2 * num_dims]),
                     )
+                    for index in range(data.shape[0])
+                ],
+                dtype=data.dtype,
+            )
+            return cubic_interpolate_1d(
+                values, coords[0], [border[0], border[num_dims]]
+            )
 
-                    if mode == b"nearest":
-                        nearest_x = int(np.rint(source_x))
-                        nearest_y = int(np.rint(source_y))
-                        for channel in range(channels):
-                            output[batch, channel, out_y, out_x] = get_value(
-                                batch, channel, nearest_y, nearest_x
-                            )
-                        continue
-
-                    if mode == b"bicubic":
-                        for channel in range(channels):
-                            output[batch, channel, out_y, out_x] = bicubic_interpolate(
-                                batch, channel, source_y, source_x
-                            )
-                        continue
+        batch_size = x_shape[0]
+        channels = x_shape[1]
+        spatial_dims = tuple(x_shape[2:])
+        output_spatial_shape = tuple(grid_shape[1:-1])
+        output = np.empty(
+            (batch_size, channels, *output_spatial_shape),
+            dtype=get_tensor_dtype(node_inputs[0]),
+        )
+        border = border_for_dims(spatial_dims)
 
-                    x0 = math.floor(source_x)
-                    y0 = math.floor(source_y)
-                    x1 = x0 + 1
-                    y1 = y0 + 1
-                    x_weight = source_x - x0
-                    y_weight = source_y - y0
-
-                    for channel in range(channels):
-                        top_left = get_value(batch, channel, y0, x0)
-                        top_right = get_value(batch, channel, y0, x1)
-                        bottom_left = get_value(batch, channel, y1, x0)
-                        bottom_right = get_value(batch, channel, y1, x1)
-                        top = top_left * (1 - x_weight) + top_right * x_weight
-                        bottom = bottom_left * (1 - x_weight) + bottom_right * x_weight
-                        output[batch, channel, out_y, out_x] = (
-                            top * (1 - y_weight) + bottom * y_weight
+        for batch in range(batch_size):
+            grid_data = grid[batch]
+            for channel in range(channels):
+                x_data = x[batch, channel]
+                for output_index in np.ndindex(*output_spatial_shape):
+                    normalized_coords = grid_data[output_index][::-1]
+                    coords = np.asarray(
+                        [
+                            denormalize(float(coord), spatial_dims[index])
+                            for index, coord in enumerate(normalized_coords)
+                        ],
+                        dtype=np.float32,
+                    )
+                    if mode_name == "nearest":
+                        coords = np.rint(coords).astype(np.int32)
+                    for index, coord in enumerate(coords):
+                        lower = border[index]
+                        upper = border[index + len(spatial_dims)]
+                        if coord < lower or coord > upper:
+                            if padding_mode_name == "border":
+                                coords[index] = max(
+                                    0.0,
+                                    min(float(coord), float(spatial_dims[index] - 1)),
+                                )
+                            elif padding_mode_name == "reflection":
+                                coords[index] = reflect(float(coord), lower, upper)
+                    if mode_name == "nearest":
+                        output[(batch, channel, *output_index)] = pixel_at_ndarray(
+                            x_data,
+                            coords,
+                            border,
+                        )
+                    elif mode_name == "linear":
+                        output[(batch, channel, *output_index)] = linear_interpolate_nd(
+                            x_data, coords, border
+                        )
+                    elif mode_name == "cubic":
+                        output[(batch, channel, *output_index)] = cubic_interpolate_nd(
+                            x_data, coords, border
+                        )
+                    else:
+                        raise RuntimeError(
+                            f"GridSample interpolation mode {mode_name!r} is not implemented."
                         )
 
         new_tensor = ctx.ggml_tensors_dict[node.output[0]] = ctx.from_numpy(output)
         ctx.set_tensor_shape(new_tensor, output.shape)
         ctx.shapes[node.output[0]] = output.shape
         ctx.set_tensor_dtype(node.output[0], output.dtype)
+        return
 
 
 @onnx_operators.register
@@ -6953,20 +7408,48 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
             raise ValueError(
                 f'Error for node "{node.name}": channel dimension must be divisible by num_groups.'
             )
-
         group_size = x_shape[1] // num_groups
         grouped_shape = (x_shape[0], num_groups, group_size, *x_shape[2:])
         x_grouped = x.reshape(grouped_shape)
         axes = tuple(range(2, len(grouped_shape)))
         mean = np.mean(x_grouped, axis=axes, keepdims=True)
         variance = np.var(x_grouped, axis=axes, keepdims=True)
-        broadcast_shape = (-1, *((1,) * (len(grouped_shape) - 2)))
-        output = (
-            scale.reshape(broadcast_shape)
-            * (x_grouped - mean)
-            / np.sqrt(variance + epsilon)
-            + bias.reshape(broadcast_shape)
-        ).reshape(x_shape)
+        normalized_grouped = (x_grouped - mean) / np.sqrt(variance + epsilon)
+        opset_version = ctx.get_opset_version(node.domain)
+        if opset_version is None:
+            if scale.size == x_shape[1] and bias.size == x_shape[1]:
+                scale_mode = "channel"
+            elif scale.size == num_groups and bias.size == num_groups:
+                scale_mode = "group"
+            else:
+                raise ValueError(
+                    f'Error for node "{node.name}": scale and bias must have one value per channel or one value per group.'
+                )
+        elif opset_version >= 21:
+            if scale.size != x_shape[1] or bias.size != x_shape[1]:
+                raise ValueError(
+                    f'Error for node "{node.name}": scale and bias must have one value per channel.'
+                )
+            scale_mode = "channel"
+        else:
+            if scale.size != num_groups or bias.size != num_groups:
+                raise ValueError(
+                    f'Error for node "{node.name}": scale and bias must have one value per group.'
+                )
+            scale_mode = "group"
+
+        if scale_mode == "channel":
+            normalized = normalized_grouped.reshape(x_shape)
+            broadcast_shape = (1, x_shape[1], *((1,) * len(x_shape[2:])))
+            output = scale.reshape(broadcast_shape) * normalized + bias.reshape(
+                broadcast_shape
+            )
+        else:
+            broadcast_shape = (1, num_groups, *((1,) * (len(grouped_shape) - 2)))
+            output = (
+                scale.reshape(broadcast_shape) * normalized_grouped
+                + bias.reshape(broadcast_shape)
+            ).reshape(x_shape)
         output = output.astype(get_tensor_dtype(node_inputs[0]))
 
         new_tensor = ctx.ggml_tensors_dict[node.output[0]] = ctx.from_numpy(output)
@@ -7238,57 +7721,134 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
 
 
 @onnx_operators.register
-class LeakyReluOperator(OnnxOperator):
+class LpNormalizationOperator(OnnxOperator):
     def __init__(self):
         super().__init__(
-            "LeakyRelu",
+            "LpNormalization",
             OnnxOperator.EXECUTION_NATIVE_OR_NUMPY_RUNTIME,
             OnnxOperator.CLASS_CONDITIONAL_NATIVE,
         )
         self.has_numpy_evaluator = True
 
+    @staticmethod
+    def normalized_axis(node: NodeProto, rank: int) -> int:
+        axis = OnnxOperator.int_attribute(node, "axis", -1)
+        return axis + rank if axis < 0 else axis
+
     def strategies(
         self, tensor_types: Dict[str, TensorType], node: "NodeIR"
     ) -> Tuple[Tuple[str, str, str], ...]:
-        if (
-            len(node.inputs) == 1
-            and self.tensor_type(tensor_types, node.inputs[0]).is_float32
-        ):
-            return self.native_strategy()
-        return self.numpy_runtime_strategy()
+        if len(node.inputs) == 1:
+            input_type = self.tensor_type(tensor_types, node.inputs[0])
+            axis = int(node.attribute("axis", -1))
+            p = int(node.attribute("p", 2))
+            if input_type.shape is not None:
+                axis = axis + len(input_type.shape) if axis < 0 else axis
+            if (
+                input_type.is_float32
+                and input_type.shape is not None
+                and p == 2
+                and axis == len(input_type.shape) - 1
+            ):
+                return self.native_strategy()
+        return self.numpy_runtime_strategy(
+            "LpNormalization requires float32 input, p=2, and last-axis "
+            "normalization to lower to ggml_l2_norm"
+        )
 
     def eval_numpy(
         self, node: NodeProto, inputs: Tuple[npt.NDArray[Any], ...]
     ) -> Tuple[npt.NDArray[Any], ...]:
-        return self.eval_numpy_leaky_relu_operator(node, inputs)
+        if len(inputs) != 1:
+            raise ValueError(f'Operation "{node.op_type}" requires one input')
+        x = inputs[0]
+        p = self.int_attribute(node, "p", 2)
+        axis = self.normalized_axis(node, x.ndim)
+        norm = np.sum(np.abs(x) ** p, axis=axis, keepdims=True) ** (1.0 / p)
+        result = np.divide(x, norm, out=np.zeros_like(x), where=norm != 0)
+        return (np.asarray(result, dtype=x.dtype),)
 
     def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
         node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
-
         if len(node_inputs) != 1:
             raise ValueError(
-                f'Error for node "{node.name}": Operation "LeakyRelu" requires exactly one input. Actual number of inputs: {len(node_inputs)}'
+                f'Error for node "{node.name}": LpNormalization requires one input'
             )
-
-        x = node_inputs[0]
-        alpha = next((attr.f for attr in node.attribute if attr.name == "alpha"), 0.01)
-        output_name = node.output[0]
         input_name = node.input[0]
         input_shape = ctx.shapes[input_name]
         input_dtype = np.dtype(ctx.get_tensor_dtype(input_name))
-
+        p = self.int_attribute(node, "p", 2)
+        axis = self.normalized_axis(node, len(input_shape))
         if (
-            ctx.can_emit_native(output_name)
+            ctx.can_emit_native(node.output[0])
             and ctx.can_run_native(node)
             and input_dtype == np.dtype(np.float32)
+            and p == 2
+            and axis == len(input_shape) - 1
         ):
-            result = ggml.ggml_leaky_relu(ctx.ggml_eval_context, x, float(alpha), False)
-            ctx.register_native_tensor(output_name, result, input_shape, input_dtype)
+            result = ggml.ggml_l2_norm(ctx.ggml_eval_context, node_inputs[0], 1e-12)
+            ctx.register_native_tensor(node.output[0], result, input_shape, input_dtype)
             return
 
-        axis_c = ctypes.c_double(alpha)
+        input_array = ctx.logical_tensor_eval_data(
+            input_name, node_inputs[0], input_shape
+        )
+        output = self.eval_numpy(node, (input_array,))[0]
+        ctx.set_numpy_runtime_output(node.output[0], output, output.dtype)
 
-        @ggml.ggml_custom1_op_t
+
+@onnx_operators.register
+class LeakyReluOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "LeakyRelu",
+            OnnxOperator.EXECUTION_NATIVE_OR_NUMPY_RUNTIME,
+            OnnxOperator.CLASS_CONDITIONAL_NATIVE,
+        )
+        self.has_numpy_evaluator = True
+
+    def strategies(
+        self, tensor_types: Dict[str, TensorType], node: "NodeIR"
+    ) -> Tuple[Tuple[str, str, str], ...]:
+        if (
+            len(node.inputs) == 1
+            and self.tensor_type(tensor_types, node.inputs[0]).is_float32
+        ):
+            return self.native_strategy()
+        return self.numpy_runtime_strategy()
+
+    def eval_numpy(
+        self, node: NodeProto, inputs: Tuple[npt.NDArray[Any], ...]
+    ) -> Tuple[npt.NDArray[Any], ...]:
+        return self.eval_numpy_leaky_relu_operator(node, inputs)
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+
+        if len(node_inputs) != 1:
+            raise ValueError(
+                f'Error for node "{node.name}": Operation "LeakyRelu" requires exactly one input. Actual number of inputs: {len(node_inputs)}'
+            )
+
+        x = node_inputs[0]
+        alpha = next((attr.f for attr in node.attribute if attr.name == "alpha"), 0.01)
+        output_name = node.output[0]
+        input_name = node.input[0]
+        input_shape = ctx.shapes[input_name]
+        input_dtype = np.dtype(ctx.get_tensor_dtype(input_name))
+
+        if (
+            ctx.can_emit_native(output_name)
+            and ctx.can_run_native(node)
+            and input_dtype == np.dtype(np.float32)
+        ):
+            result = ggml.ggml_leaky_relu(ctx.ggml_eval_context, x, float(alpha), False)
+            ctx.register_native_tensor(output_name, result, input_shape, input_dtype)
+            return
+
+        axis_c = ctypes.c_double(alpha)
+
+        @ggml.ggml_custom1_op_t
         def custom_leaky_relu(
             tensor_out: ggml.ggml_tensor_p,
             tensor_in_1: ggml.ggml_tensor_p,
@@ -7505,7 +8065,41 @@ def custom_log_softmax(
 @onnx_operators.register
 class MatMulOperator(OnnxOperator):
     def __init__(self):
-        super().__init__("MatMul", execution=OnnxOperator.EXECUTION_NATIVE)
+        super().__init__(
+            "MatMul",
+            OnnxOperator.EXECUTION_NATIVE_OR_NUMPY_RUNTIME,
+            OnnxOperator.CLASS_CONDITIONAL_NATIVE,
+        )
+        self.has_numpy_evaluator = True
+
+    def strategies(
+        self, tensor_types: Dict[str, TensorType], node: "NodeIR"
+    ) -> Tuple[Tuple[str, str, str], ...]:
+        if len(node.inputs) == 2:
+            left = self.tensor_type(tensor_types, node.inputs[0])
+            right = self.tensor_type(tensor_types, node.inputs[1])
+            if (
+                left.is_float32
+                and right.is_float32
+                and left.shape is not None
+                and right.shape is not None
+                and len(left.shape) == 2
+                and len(right.shape) == 2
+            ):
+                return self.native_strategy()
+        return self.numpy_runtime_strategy(
+            "MatMul requires rank 2 float32 inputs to lower to ggml_mul_mat"
+        )
+
+    def eval_numpy(
+        self, node: NodeProto, inputs: Tuple[npt.NDArray[Any], ...]
+    ) -> Tuple[npt.NDArray[Any], ...]:
+        if len(inputs) != 2:
+            raise ValueError(
+                f'Operation "{node.op_type}" requires exactly two inputs. '
+                f"Actual number of inputs: {len(inputs)}"
+            )
+        return (np.matmul(inputs[0], inputs[1]),)
 
     @staticmethod
     def broadcast_tensor(
@@ -7595,6 +8189,22 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
         a, b = node_inputs
 
         a_shape, b_shape = ctx.shapes[a_name], ctx.shapes[b_name]
+        a_dtype = np.dtype(ctx.get_tensor_dtype(a_name))
+        b_dtype = np.dtype(ctx.get_tensor_dtype(b_name))
+
+        if (
+            not ctx.can_emit_native(output_name)
+            or not ctx.can_run_native(node)
+            or a_dtype != np.dtype(np.float32)
+            or b_dtype != np.dtype(np.float32)
+            or len(a_shape) != 2
+            or len(b_shape) != 2
+        ):
+            left = ctx.logical_tensor_eval_data(a_name, a, a_shape)
+            right = ctx.logical_tensor_eval_data(b_name, b, b_shape)
+            output = self.eval_numpy(node, (left, right))[0]
+            ctx.set_numpy_runtime_output(output_name, output, output.dtype)
+            return
 
         # TODO: is this check required? broadcast alone wont pass ONNX tests but is broadcasting itself even required or should it fail if a,b are not correct?
         try:
@@ -9149,6 +9759,117 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
         ctx.set_logical_output(node.output[0], output, output_dtype)
 
 
+@onnx_operators.register
+class RMSNormalizationOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "RMSNormalization",
+            OnnxOperator.EXECUTION_NATIVE_OR_NUMPY_RUNTIME,
+            OnnxOperator.CLASS_CONDITIONAL_NATIVE,
+        )
+        self.has_numpy_evaluator = True
+
+    @staticmethod
+    def normalized_axis(node: NodeProto, rank: int) -> int:
+        axis = OnnxOperator.int_attribute(node, "axis", -1)
+        return axis + rank if axis < 0 else axis
+
+    def native_parameters(
+        self,
+        tensor_types: Dict[str, TensorType],
+        node: "NodeIR",
+    ) -> Optional[Tuple[Tuple[int, ...], Tuple[int, ...]]]:
+        if len(node.inputs) != 2:
+            return None
+        input_type = self.tensor_type(tensor_types, node.inputs[0])
+        scale_type = self.tensor_type(tensor_types, node.inputs[1])
+        if (
+            not input_type.is_float32
+            or not scale_type.is_float32
+            or input_type.shape is None
+            or scale_type.shape is None
+        ):
+            return None
+        axis = int(node.attribute("axis", -1))
+        axis = axis + len(input_type.shape) if axis < 0 else axis
+        if axis != len(input_type.shape) - len(scale_type.shape):
+            return None
+        if axis != len(input_type.shape) - 1:
+            return None
+        if not self.can_repeat_to_shape(scale_type.shape, input_type.shape):
+            return None
+        return input_type.shape, scale_type.shape
+
+    def strategies(
+        self, tensor_types: Dict[str, TensorType], node: "NodeIR"
+    ) -> Tuple[Tuple[str, str, str], ...]:
+        if self.native_parameters(tensor_types, node) is not None:
+            return self.native_strategy()
+        return self.numpy_runtime_strategy(
+            "RMSNormalization requires float32 input/scale and last-axis "
+            "normalization to lower to ggml_rms_norm"
+        )
+
+    def eval_numpy(
+        self, node: NodeProto, inputs: Tuple[npt.NDArray[Any], ...]
+    ) -> Tuple[npt.NDArray[Any], ...]:
+        if len(inputs) != 2:
+            raise ValueError(f'Operation "{node.op_type}" requires two inputs')
+        x, scale = inputs
+        epsilon = self.float_attribute(node, "epsilon", 1e-5)
+        axis = self.normalized_axis(node, x.ndim)
+        axes = tuple(range(axis, x.ndim))
+        mean_square = np.mean(np.square(x.astype(np.float32)), axis=axes, keepdims=True)
+        normalized = x.astype(np.float32) / np.sqrt(mean_square + epsilon)
+        return (np.asarray(normalized * scale, dtype=x.dtype),)
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        if len(node_inputs) != 2:
+            raise ValueError(
+                f'Error for node "{node.name}": RMSNormalization requires two inputs'
+            )
+        input_name, scale_name = node.input
+        input_shape = ctx.shapes[input_name]
+        scale_shape = ctx.shapes[scale_name]
+        input_dtype = np.dtype(ctx.get_tensor_dtype(input_name))
+        scale_dtype = np.dtype(ctx.get_tensor_dtype(scale_name))
+        axis = self.normalized_axis(node, len(input_shape))
+        epsilon = self.float_attribute(node, "epsilon", 1e-5)
+
+        if (
+            ctx.can_emit_native(node.output[0])
+            and ctx.can_run_native(node)
+            and input_dtype == np.dtype(np.float32)
+            and scale_dtype == np.dtype(np.float32)
+            and axis == len(input_shape) - len(scale_shape)
+            and axis == len(input_shape) - 1
+            and self.can_repeat_to_shape(scale_shape, input_shape)
+        ):
+            normalized = ggml.ggml_rms_norm(
+                ctx.ggml_eval_context, node_inputs[0], epsilon
+            )
+            scale = node_inputs[1]
+            if scale_shape != input_shape:
+                scale = self.repeat_native_tensor_to_shape(
+                    ctx, scale, scale_shape, input_shape, np.dtype(np.float32)
+                )
+            result = ggml.ggml_mul(ctx.ggml_eval_context, normalized, scale)
+            ctx.register_native_tensor(
+                node.output[0], result, input_shape, np.dtype(np.float32)
+            )
+            return
+
+        input_array = ctx.logical_tensor_eval_data(
+            input_name, node_inputs[0], input_shape
+        )
+        scale_array = ctx.logical_tensor_eval_data(
+            scale_name, node_inputs[1], scale_shape
+        )
+        output = self.eval_numpy(node, (input_array, scale_array))[0]
+        ctx.set_numpy_runtime_output(node.output[0], output, output.dtype)
+
+
 @onnx_operators.register
 class RangeOperator(OnnxOperator):
     def __init__(self):
@@ -10258,6 +10979,136 @@ def bilinear_sample_contributions(
         ctx.set_tensor_dtype(node.output[0], output.dtype)
 
 
+@onnx_operators.register
+class RotaryEmbeddingOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "RotaryEmbedding",
+            domains=("", "com.microsoft", "com.microsoft.nchwc"),
+        )
+        self.has_numpy_evaluator = True
+
+    @staticmethod
+    def rotate_half(
+        value: npt.NDArray[Any],
+        cos_cache: npt.NDArray[Any],
+        sin_cache: npt.NDArray[Any],
+        interleaved: bool,
+    ) -> npt.NDArray[Any]:
+        if interleaved:
+            even = value[..., 0::2]
+            odd = value[..., 1::2]
+            rotated_even = even * cos_cache - odd * sin_cache
+            rotated_odd = even * sin_cache + odd * cos_cache
+            output = np.empty_like(value)
+            output[..., 0::2] = rotated_even
+            output[..., 1::2] = rotated_odd
+            return output
+
+        half = value.shape[-1] // 2
+        first = value[..., :half]
+        second = value[..., half:]
+        return np.concatenate(
+            (
+                first * cos_cache - second * sin_cache,
+                first * sin_cache + second * cos_cache,
+            ),
+            axis=-1,
+        )
+
+    @staticmethod
+    def normalized_caches(
+        cos_cache: npt.NDArray[Any],
+        sin_cache: npt.NDArray[Any],
+        position_ids: Optional[npt.NDArray[Any]],
+        batch_size: int,
+        sequence_length: int,
+        rotary_half: int,
+    ) -> Tuple[npt.NDArray[Any], npt.NDArray[Any]]:
+        if position_ids is not None:
+            cos_cache = cos_cache[np.asarray(position_ids, dtype=np.int64)]
+            sin_cache = sin_cache[np.asarray(position_ids, dtype=np.int64)]
+        if cos_cache.ndim == 2:
+            cos_cache = cos_cache[:sequence_length].reshape(1, sequence_length, 1, -1)
+            sin_cache = sin_cache[:sequence_length].reshape(1, sequence_length, 1, -1)
+        elif cos_cache.ndim == 3:
+            cos_cache = cos_cache.reshape(batch_size, sequence_length, 1, -1)
+            sin_cache = sin_cache.reshape(batch_size, sequence_length, 1, -1)
+        elif cos_cache.ndim == 4:
+            pass
+        else:
+            raise ValueError("RotaryEmbedding cos/sin caches must have rank 2, 3, or 4")
+        return cos_cache[..., :rotary_half], sin_cache[..., :rotary_half]
+
+    def eval_numpy(
+        self, node: NodeProto, inputs: Tuple[npt.NDArray[Any], ...]
+    ) -> Tuple[npt.NDArray[Any], ...]:
+        if len(inputs) not in {3, 4}:
+            raise ValueError(
+                f'Operation "{node.op_type}" requires three or four inputs'
+            )
+        original = inputs[0]
+        x = original
+        original_rank = x.ndim
+        if x.ndim == 4:
+            x = x.transpose(0, 2, 1, 3)
+        elif x.ndim == 3:
+            num_heads = self.int_attribute(node, "num_heads", 0)
+            if num_heads <= 0:
+                raise ValueError("RotaryEmbedding rank-3 input requires num_heads")
+            batch_size, sequence_length, hidden_size = x.shape
+            if hidden_size % num_heads != 0:
+                raise ValueError("RotaryEmbedding hidden size must divide num_heads")
+            x = x.reshape(
+                batch_size, sequence_length, num_heads, hidden_size // num_heads
+            )
+        else:
+            raise ValueError("RotaryEmbedding expects rank-3 or rank-4 input")
+
+        batch_size, sequence_length, _num_heads, head_size = x.shape
+        rotary_dim = self.int_attribute(node, "rotary_embedding_dim", 0) or head_size
+        interleaved = bool(self.int_attribute(node, "interleaved", 0))
+        if rotary_dim > head_size or rotary_dim % 2:
+            raise ValueError(
+                "RotaryEmbedding rotary dimension must be even and <= head size"
+            )
+        position_ids = inputs[3] if len(inputs) == 4 else None
+        cos_cache, sin_cache = self.normalized_caches(
+            inputs[1],
+            inputs[2],
+            position_ids,
+            batch_size,
+            sequence_length,
+            rotary_dim // 2,
+        )
+        rotated = self.rotate_half(
+            x[..., :rotary_dim],
+            cos_cache,
+            sin_cache,
+            interleaved,
+        )
+        if rotary_dim < head_size:
+            x = np.concatenate((rotated, x[..., rotary_dim:]), axis=-1)
+        else:
+            x = rotated
+        if original_rank == 4:
+            x = x.transpose(0, 2, 1, 3)
+        else:
+            x = x.reshape(original.shape)
+        return (np.asarray(x, dtype=original.dtype),)
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        arrays = tuple(
+            ctx.logical_tensor_eval_data(
+                name, ctx.ggml_tensors_dict[name], ctx.shapes[name]
+            )
+            for name in node.input
+            if name
+        )
+        output = self.eval_numpy(node, arrays)[0]
+        ctx.set_numpy_runtime_output(node.output[0], output, output.dtype)
+
+
 @onnx_operators.register
 class RoundOperator(OnnxOperator):
     def __init__(self):
@@ -11114,63 +11965,626 @@ def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
 
 
 @onnx_operators.register
-class STFTOperator(OnnxOperator):
+class QuickGeluOperator(OnnxOperator):
     def __init__(self):
-        super().__init__("STFT")
+        super().__init__(
+            "QuickGelu",
+            OnnxOperator.EXECUTION_NATIVE_OR_NUMPY_RUNTIME,
+            OnnxOperator.CLASS_CONDITIONAL_NATIVE,
+            domains=("com.microsoft", "com.ggml"),
+        )
+        self.has_numpy_evaluator = True
 
-    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
-        node_inputs = [
-            ctx.ggml_tensors_dict[inp] if inp != "" else None for inp in node.input
-        ]
+    def strategies(
+        self, tensor_types: Dict[str, TensorType], node: "NodeIR"
+    ) -> Tuple[Tuple[str, str, str], ...]:
+        if (
+            len(node.inputs) == 1
+            and self.tensor_type(tensor_types, node.inputs[0]).is_float32
+            and float(node.attribute("alpha", 1.702)) == 1.702
+        ):
+            return self.native_strategy()
+        return self.numpy_runtime_strategy(
+            "QuickGelu requires float32 input and alpha=1.702 to lower native"
+        )
 
-        if len(node_inputs) not in {2, 3, 4}:
-            raise ValueError(
-                f'Error for node "{node.name}": Operation "STFT" requires two to four inputs. Actual number of inputs: {len(node_inputs)}'
-            )
+    def eval_numpy(
+        self, node: NodeProto, inputs: Tuple[npt.NDArray[Any], ...]
+    ) -> Tuple[npt.NDArray[Any], ...]:
+        if len(inputs) != 1:
+            raise ValueError(f'Operation "{node.op_type}" requires one input')
+        alpha = self.float_attribute(node, "alpha", 1.702)
+        x = inputs[0]
+        return (np.asarray(x / (1.0 + np.exp(-alpha * x)), dtype=x.dtype),)
 
-        signal_tensor = node_inputs[0]
-        frame_step_tensor = node_inputs[1]
-        window_tensor = node_inputs[2] if len(node_inputs) >= 3 else None
-        frame_length_tensor = node_inputs[3] if len(node_inputs) == 4 else None
-        if signal_tensor is None or frame_step_tensor is None:
-            raise ValueError(
-                f'Error for node "{node.name}": STFT signal and frame_step inputs are required.'
-            )
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        alpha = self.float_attribute(node, "alpha", 1.702)
 
-        signal_shape = ctx.shapes[node.input[0]]
-        signal = ctx.to_numpy(ctx.eval_tensor(signal_tensor)).reshape(signal_shape)
-        frame_step = int(
-            ctx.to_numpy(ctx.eval_tensor(frame_step_tensor))
-            .reshape(ctx.shapes[node.input[1]])
-            .item()
-        )
+        def quick_gelu(x: npt.NDArray[Any]) -> npt.NDArray[Any]:
+            return x / (1.0 + np.exp(-alpha * x))
 
-        window = None
-        if window_tensor is not None:
-            window = ctx.to_numpy(ctx.eval_tensor(window_tensor)).reshape(
-                ctx.shapes[node.input[2]]
+        if alpha == 1.702:
+            self.lower_native_unary_or_numpy(
+                ctx, node, ggml.ggml_gelu_quick, quick_gelu
             )
+            return
+        self.lower_numpy_unary(ctx, node, quick_gelu)
 
-        if frame_length_tensor is not None:
-            frame_length = int(
-                ctx.to_numpy(ctx.eval_tensor(frame_length_tensor))
-                .reshape(ctx.shapes[node.input[3]])
-                .item()
-            )
-        elif window is not None:
-            frame_length = int(window.shape[0])
-        else:
-            raise ValueError(
-                f'Error for node "{node.name}": STFT requires frame_length when window is not provided.'
-            )
 
-        onesided = next(
-            (attr.i for attr in node.attribute if attr.name == "onesided"), 1
+@onnx_operators.register
+class SiLUOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "SiLU",
+            OnnxOperator.EXECUTION_NATIVE_OR_NUMPY_RUNTIME,
+            OnnxOperator.CLASS_CONDITIONAL_NATIVE,
+            domains=("com.ggml",),
         )
-        batch_size = signal_shape[0]
-        signal_length = signal_shape[1]
-        frame_count = 1 + (signal_length - frame_length) // frame_step
-        output_length = frame_length // 2 + 1 if onesided else frame_length
+        self.has_numpy_evaluator = True
+
+    def strategies(
+        self, tensor_types: Dict[str, TensorType], node: "NodeIR"
+    ) -> Tuple[Tuple[str, str, str], ...]:
+        if (
+            len(node.inputs) == 1
+            and self.tensor_type(tensor_types, node.inputs[0]).is_float32
+        ):
+            return self.native_strategy()
+        return self.numpy_runtime_strategy(
+            "SiLU requires float32 input to lower native"
+        )
+
+    def eval_numpy(
+        self, node: NodeProto, inputs: Tuple[npt.NDArray[Any], ...]
+    ) -> Tuple[npt.NDArray[Any], ...]:
+        if len(inputs) != 1:
+            raise ValueError(f'Operation "{node.op_type}" requires one input')
+        x = inputs[0]
+        return (np.asarray(x / (1.0 + np.exp(-x)), dtype=x.dtype),)
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        self.lower_native_unary_or_numpy(
+            ctx, node, ggml.ggml_silu, lambda x: x / (1.0 + np.exp(-x))
+        )
+
+
+class GgmlGluOperator(OnnxOperator):
+    GGML_FUNC: ClassVar[
+        Callable[[ggml.ggml_context_p, ggml.ggml_tensor_p], ggml.ggml_tensor_p]
+    ]
+
+    def __init__(self, op_type: str):
+        super().__init__(
+            op_type,
+            OnnxOperator.EXECUTION_NATIVE_OR_NUMPY_RUNTIME,
+            OnnxOperator.CLASS_CONDITIONAL_NATIVE,
+            domains=("com.ggml",),
+        )
+        self.has_numpy_evaluator = True
+
+    @staticmethod
+    def gelu_erf(x: npt.NDArray[Any]) -> npt.NDArray[Any]:
+        erf = np.vectorize(math.erf)
+        return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))
+
+    @staticmethod
+    def gelu_quick(x: npt.NDArray[Any]) -> npt.NDArray[Any]:
+        return x / (1.0 + np.exp(-1.702 * x))
+
+    def activation(self, gate: npt.NDArray[Any]) -> npt.NDArray[Any]:
+        if self.op_type == "ReGLU":
+            return np.maximum(gate, 0)
+        if self.op_type == "GeGLU":
+            return GeluOperator.numpy_gelu(gate, "tanh")
+        if self.op_type == "SwiGLU":
+            return gate / (1.0 + np.exp(-gate))
+        if self.op_type == "GeGLUErf":
+            return self.gelu_erf(gate)
+        if self.op_type == "GeGLUQuick":
+            return self.gelu_quick(gate)
+        raise ValueError(f'Unsupported GLU operator "{self.op_type}"')
+
+    def strategies(
+        self, tensor_types: Dict[str, TensorType], node: "NodeIR"
+    ) -> Tuple[Tuple[str, str, str], ...]:
+        if len(node.inputs) == 1:
+            input_type = self.tensor_type(tensor_types, node.inputs[0])
+            if (
+                input_type.is_float32
+                and input_type.shape is not None
+                and input_type.shape[-1] % 2 == 0
+            ):
+                return self.native_strategy()
+        return self.numpy_runtime_strategy(
+            f"{self.op_type} requires float32 input with even last dimension"
+        )
+
+    def eval_numpy(
+        self, node: NodeProto, inputs: Tuple[npt.NDArray[Any], ...]
+    ) -> Tuple[npt.NDArray[Any], ...]:
+        if len(inputs) != 1:
+            raise ValueError(f'Operation "{node.op_type}" requires one input')
+        x = inputs[0]
+        split = x.shape[-1] // 2
+        gate = x[..., :split]
+        values = x[..., split:]
+        return (np.asarray(values * self.activation(gate), dtype=x.dtype),)
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        input_name = node.input[0]
+        input_shape = ctx.shapes[input_name]
+        input_dtype = np.dtype(ctx.get_tensor_dtype(input_name))
+        output_shape = (*input_shape[:-1], input_shape[-1] // 2)
+        if (
+            ctx.can_emit_native(node.output[0])
+            and ctx.can_run_native(node)
+            and input_dtype == np.dtype(np.float32)
+            and input_shape[-1] % 2 == 0
+        ):
+            result = self.GGML_FUNC(ctx.ggml_eval_context, node_inputs[0])
+            ctx.register_native_tensor(
+                node.output[0], result, output_shape, input_dtype
+            )
+            return
+        input_array = ctx.logical_tensor_eval_data(
+            input_name, node_inputs[0], input_shape
+        )
+        output = self.eval_numpy(node, (input_array,))[0]
+        ctx.set_numpy_runtime_output(node.output[0], output, output.dtype)
+
+
+@onnx_operators.register
+class ReGLUOperator(GgmlGluOperator):
+    GGML_FUNC = staticmethod(ggml.ggml_reglu)
+
+    def __init__(self):
+        super().__init__("ReGLU")
+
+
+@onnx_operators.register
+class GeGLUOperator(GgmlGluOperator):
+    GGML_FUNC = staticmethod(ggml.ggml_geglu)
+
+    def __init__(self):
+        super().__init__("GeGLU")
+
+
+@onnx_operators.register
+class SwiGLUOperator(GgmlGluOperator):
+    GGML_FUNC = staticmethod(ggml.ggml_swiglu)
+
+    def __init__(self):
+        super().__init__("SwiGLU")
+
+
+@onnx_operators.register
+class GeGLUErfOperator(GgmlGluOperator):
+    GGML_FUNC = staticmethod(ggml.ggml_geglu_erf)
+
+    def __init__(self):
+        super().__init__("GeGLUErf")
+
+
+@onnx_operators.register
+class GeGLUQuickOperator(GgmlGluOperator):
+    GGML_FUNC = staticmethod(ggml.ggml_geglu_quick)
+
+    def __init__(self):
+        super().__init__("GeGLUQuick")
+
+
+@onnx_operators.register
+class SwiGLUOAIOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "SwiGLUOAI",
+            execution=OnnxOperator.EXECUTION_NATIVE,
+            domains=("com.ggml",),
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        alpha = self.float_attribute(node, "alpha", 1.702)
+        limit = self.float_attribute(node, "limit", 7.0)
+        result = ggml.ggml_swiglu_oai(
+            ctx.ggml_eval_context, node_inputs[0], node_inputs[1], alpha, limit
+        )
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlRollOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "Roll", execution=OnnxOperator.EXECUTION_NATIVE, domains=("com.ggml",)
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        input_shape = ctx.shapes[node.input[0]]
+        shifts = list(self.ints_attribute(node, "shifts") or ())
+        while len(shifts) < 4:
+            shifts.append(0)
+        storage_shifts = tuple(reversed(shifts[: len(input_shape)]))
+        storage_shifts = storage_shifts + (0,) * (4 - len(storage_shifts))
+        result = ggml.ggml_roll(ctx.ggml_eval_context, node_inputs[0], *storage_shifts)
+        ctx.register_native_tensor(
+            node.output[0], result, input_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlFillOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "Fill", execution=OnnxOperator.EXECUTION_NATIVE, domains=("com.ggml",)
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        value = self.float_attribute(node, "value", 0.0)
+        result = ggml.ggml_fill(ctx.ggml_eval_context, node_inputs[0], value)
+        ctx.register_native_tensor(
+            node.output[0],
+            result,
+            ctx.shapes[node.input[0]],
+            ctx.get_tensor_dtype(node.input[0]),
+        )
+
+
+@onnx_operators.register
+class GgmlArgSortOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "ArgSort", execution=OnnxOperator.EXECUTION_NATIVE, domains=("com.ggml",)
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        descending = bool(self.int_attribute(node, "descending", 0))
+        order = ggml.GGML_SORT_ORDER_DESC if descending else ggml.GGML_SORT_ORDER_ASC
+        result = ggml.ggml_argsort(ctx.ggml_eval_context, node_inputs[0], order)
+        ctx.register_native_tensor(
+            node.output[0], result, ctx.shapes[node.input[0]], np.dtype(np.int32)
+        )
+
+
+@onnx_operators.register
+class GgmlRopeOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "Rope", execution=OnnxOperator.EXECUTION_NATIVE, domains=("com.ggml",)
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        n_dims = self.int_attribute(node, "n_dims", ctx.shapes[node.input[0]][-1])
+        mode = self.int_attribute(node, "mode", ggml.GGML_ROPE_TYPE_NORMAL)
+        result = ggml.ggml_rope(
+            ctx.ggml_eval_context, node_inputs[0], node_inputs[1], n_dims, mode
+        )
+        ctx.register_native_tensor(
+            node.output[0],
+            result,
+            ctx.shapes[node.input[0]],
+            ctx.get_tensor_dtype(node.input[0]),
+        )
+
+
+@onnx_operators.register
+class GgmlFlashAttentionOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "FlashAttention",
+            execution=OnnxOperator.EXECUTION_NATIVE,
+            domains=("com.ggml",),
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input if inp]
+        if len(node_inputs) not in {3, 4}:
+            raise ValueError(
+                "com.ggml.FlashAttention requires Q, K, V, and optional mask"
+            )
+        q_shape = ctx.shapes[node.input[0]]
+        scale = self.float_attribute(node, "scale", 1.0 / math.sqrt(q_shape[-1]))
+        max_bias = self.float_attribute(node, "max_bias", 0.0)
+        logit_softcap = self.float_attribute(node, "logit_softcap", 0.0)
+        mask = node_inputs[3] if len(node_inputs) == 4 else None
+        result = ggml.ggml_flash_attn_ext(
+            ctx.ggml_eval_context,
+            node_inputs[0],
+            node_inputs[1],
+            node_inputs[2],
+            mask,
+            scale,
+            max_bias,
+            logit_softcap,
+        )
+        if len(q_shape) == 4:
+            result = ggml.ggml_permute(ctx.ggml_eval_context, result, 0, 2, 1, 3)
+        ctx.register_native_tensor(
+            node.output[0], result, q_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlTimestepEmbeddingOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "TimestepEmbedding",
+            execution=OnnxOperator.EXECUTION_NATIVE,
+            domains=("com.ggml",),
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        dim = self.int_attribute(node, "dim", 0)
+        max_period = self.int_attribute(node, "max_period", 10000)
+        result = ggml.ggml_timestep_embedding(
+            ctx.ggml_eval_context, node_inputs[0], dim, max_period
+        )
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, np.dtype(np.float32)
+        )
+
+
+@onnx_operators.register
+class GgmlWindowPartitionOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "WindowPartition",
+            execution=OnnxOperator.EXECUTION_NATIVE,
+            domains=("com.ggml",),
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        window = self.int_attribute(node, "window", 1)
+        result = ggml.ggml_win_part(ctx.ggml_eval_context, node_inputs[0], window)
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlWindowUnpartitionOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "WindowUnpartition",
+            execution=OnnxOperator.EXECUTION_NATIVE,
+            domains=("com.ggml",),
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        width = self.int_attribute(node, "width", 1)
+        height = self.int_attribute(node, "height", 1)
+        window = self.int_attribute(node, "window", 1)
+        result = ggml.ggml_win_unpart(
+            ctx.ggml_eval_context, node_inputs[0], width, height, window
+        )
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlGetRelPosOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "GetRelPos",
+            execution=OnnxOperator.EXECUTION_NATIVE,
+            domains=("com.ggml",),
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        qh = self.int_attribute(node, "qh", 1)
+        kh = self.int_attribute(node, "kh", 1)
+        result = ggml.ggml_get_rel_pos(ctx.ggml_eval_context, node_inputs[0], qh, kh)
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlAddRelPosOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "AddRelPos",
+            execution=OnnxOperator.EXECUTION_NATIVE,
+            domains=("com.ggml",),
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        result = ggml.ggml_add_rel_pos(
+            ctx.ggml_eval_context, node_inputs[0], node_inputs[1], node_inputs[2]
+        )
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlSSMConvOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "SSMConv", execution=OnnxOperator.EXECUTION_NATIVE, domains=("com.ggml",)
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        result = ggml.ggml_ssm_conv(
+            ctx.ggml_eval_context, node_inputs[0], node_inputs[1]
+        )
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlSSMScanOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "SSMScan", execution=OnnxOperator.EXECUTION_NATIVE, domains=("com.ggml",)
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        result = ggml.ggml_ssm_scan(ctx.ggml_eval_context, *node_inputs)
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlGatedLinearAttentionOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "GatedLinearAttention",
+            execution=OnnxOperator.EXECUTION_NATIVE,
+            domains=("com.ggml",),
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        scale = self.float_attribute(node, "scale", 1.0)
+        result = ggml.ggml_gated_linear_attn(
+            ctx.ggml_eval_context,
+            node_inputs[0],
+            node_inputs[1],
+            node_inputs[2],
+            node_inputs[3],
+            node_inputs[4],
+            scale,
+        )
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlGatedDeltaNetOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "GatedDeltaNet",
+            execution=OnnxOperator.EXECUTION_NATIVE,
+            domains=("com.ggml",),
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        result = ggml.ggml_gated_delta_net(ctx.ggml_eval_context, *node_inputs)
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlRWKVWKV6Operator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "RWKVWKV6", execution=OnnxOperator.EXECUTION_NATIVE, domains=("com.ggml",)
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        result = ggml.ggml_rwkv_wkv6(ctx.ggml_eval_context, *node_inputs)
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class GgmlRWKVWKV7Operator(OnnxOperator):
+    def __init__(self):
+        super().__init__(
+            "RWKVWKV7", execution=OnnxOperator.EXECUTION_NATIVE, domains=("com.ggml",)
+        )
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [ctx.ggml_tensors_dict[inp] for inp in node.input]
+        result = ggml.ggml_rwkv_wkv7(ctx.ggml_eval_context, *node_inputs)
+        output_shape = ctx.shapes.get(node.output[0], ctx.get_tensor_shape(result))
+        ctx.register_native_tensor(
+            node.output[0], result, output_shape, ctx.get_tensor_dtype(node.input[0])
+        )
+
+
+@onnx_operators.register
+class STFTOperator(OnnxOperator):
+    def __init__(self):
+        super().__init__("STFT")
+
+    def lower(self, ctx: "GgmlOnnxExecutionContext", node: NodeProto) -> None:
+        node_inputs = [
+            ctx.ggml_tensors_dict[inp] if inp != "" else None for inp in node.input
+        ]
+
+        if len(node_inputs) not in {2, 3, 4}:
+            raise ValueError(
+                f'Error for node "{node.name}": Operation "STFT" requires two to four inputs. Actual number of inputs: {len(node_inputs)}'
+            )
+
+        signal_tensor = node_inputs[0]
+        frame_step_tensor = node_inputs[1]
+        window_tensor = node_inputs[2] if len(node_inputs) >= 3 else None
+        frame_length_tensor = node_inputs[3] if len(node_inputs) == 4 else None
+        if signal_tensor is None or frame_step_tensor is None:
+            raise ValueError(
+                f'Error for node "{node.name}": STFT signal and frame_step inputs are required.'
+            )
+
+        signal_shape = ctx.shapes[node.input[0]]
+        signal = ctx.to_numpy(ctx.eval_tensor(signal_tensor)).reshape(signal_shape)
+        frame_step = int(
+            ctx.to_numpy(ctx.eval_tensor(frame_step_tensor))
+            .reshape(ctx.shapes[node.input[1]])
+            .item()
+        )
+
+        window = None
+        if window_tensor is not None:
+            window = ctx.to_numpy(ctx.eval_tensor(window_tensor)).reshape(
+                ctx.shapes[node.input[2]]
+            )
+
+        if frame_length_tensor is not None:
+            frame_length = int(
+                ctx.to_numpy(ctx.eval_tensor(frame_length_tensor))
+                .reshape(ctx.shapes[node.input[3]])
+                .item()
+            )
+        elif window is not None:
+            frame_length = int(window.shape[0])
+        else:
+            raise ValueError(
+                f'Error for node "{node.name}": STFT requires frame_length when window is not provided.'
+            )
+
+        onesided = next(
+            (attr.i for attr in node.attribute if attr.name == "onesided"), 1
+        )
+        batch_size = signal_shape[0]
+        signal_length = signal_shape[1]
+        frame_count = 1 + (signal_length - frame_length) // frame_step
+        output_length = frame_length // 2 + 1 if onesided else frame_length
         output = np.empty((batch_size, frame_count, output_length, 2), dtype=np.float32)
 
         if signal_shape[-1] == 1:
@@ -12017,11 +13431,11 @@ def custom_top_k_indices(
             k = userdata_data.k
 
             if largest:
-                sorted_indices = np.argsort(x, axis=axis)[:, ::-1]
+                sorted_indices = np.argsort(-x, axis=axis, kind="stable")
             else:
-                sorted_indices = np.argsort(x, axis=axis)
+                sorted_indices = np.argsort(x, axis=axis, kind="stable")
 
-            topk_indices = sorted_indices[:, :k]
+            topk_indices = np.take(sorted_indices, np.arange(k), axis=axis)
 
             ctx.set_tensor_data(tensor_out, topk_indices)
 
@@ -12053,15 +13467,10 @@ def custom_top_k_values(
             userdata_data = userdata_data_ptr.contents
 
             axis = userdata_data.axis
-            sorted_flag = bool(userdata_data.sorted)
 
             topk_values = np.take_along_axis(x, topk_indices, axis=axis)
-            if sorted_flag:
-                topk_values_sorted = np.sort(topk_values, axis=axis)
-            else:
-                topk_values_sorted = topk_values
 
-            ctx.set_tensor_data(tensor_out, topk_values_sorted)
+            ctx.set_tensor_data(tensor_out, topk_values)
 
         values = ggml.ggml_map_custom3_inplace(
             ctx.ggml_eval_context,
@@ -12084,6 +13493,7 @@ def custom_top_k_values(
 
         ctx.refs.append(topk_userdata)
 
+        ctx.set_tensor_dtype(node.output[0], ctx.get_tensor_dtype(node.input[0]))
         ctx.set_tensor_dtype(node.output[1], np.dtype(np.int64))
 
 
@@ -12417,6 +13827,7 @@ def __init__(
         dtypes: Optional[Dict[str, npt.DTypeLike]] = None,
         native_outputs: Optional[Set[str]] = None,
         execution_by_output: Optional[Dict[str, str]] = None,
+        opset_imports: Optional[Dict[str, int]] = None,
     ):
         self.backend = backend
         self.ggml_tensors_dict = ggml_tensors_dict
@@ -12431,6 +13842,7 @@ def __init__(
         self.shapes = shapes
         self.native_outputs: Set[str] = set(native_outputs or ())
         self.execution_by_output: Dict[str, str] = dict(execution_by_output or {})
+        self.opset_imports: Dict[str, int] = dict(opset_imports or {})
         self.tensor_states: Dict[str, TensorState] = {}
         self.backend_buffers: List[Any] = []
         for name, tensor in self.ggml_tensors_dict.items():
@@ -12446,6 +13858,9 @@ def __init__(
                     ),
                 )
 
+    def get_opset_version(self, domain: str = "") -> Optional[int]:
+        return self.opset_imports.get(domain)
+
     @staticmethod
     def storage_dtype_for_logical_dtype(dtype: npt.DTypeLike) -> npt.DTypeLike:
         np_dtype = np.dtype(dtype)
@@ -12964,6 +14379,7 @@ def __init__(
         ggml_weights_buffer: Any,
         execution_plan: ExecutionPlan,
         ir_pipeline: OnnxRuntimePipeline,
+        opset_imports: Dict[str, int],
     ):
         super(GgmlBackendRep, self).__init__()
         self.graph = graph
@@ -12978,6 +14394,7 @@ def __init__(
         self.ggml_weights_buffer = ggml_weights_buffer
         self.execution_plan = execution_plan
         self.ir_pipeline = ir_pipeline
+        self.opset_imports = opset_imports
         self.last_numpy_fallback_island_executions: Tuple[FallbackIsland, ...] = ()
 
     @property
@@ -13184,6 +14601,7 @@ def run(self, inputs: Any, **kwargs: Any) -> Tuple[Any, ...]:
                     for output in plan_node.outputs
                     if output
                 },
+                self.opset_imports,
             )
             cleanup.callback(ctx.free_backend_buffers)
             for input_name in input_tensors:
@@ -13218,7 +14636,7 @@ def run(self, inputs: Any, **kwargs: Any) -> Tuple[Any, ...]:
                         node_index += len(island_nodes)
                         continue
 
-                operator_spec = onnx_operators.get(node.op_type)
+                operator_spec = onnx_operators.get(node.op_type, node.domain)
                 if operator_spec is None:
                     raise NotImplementedError(
                         f'Operator "{node.op_type}" not implemented'
@@ -13250,10 +14668,16 @@ def run(self, inputs: Any, **kwargs: Any) -> Tuple[Any, ...]:
 
 
 class GgmlRuntimeBackend(Backend):
-    ONNX_DTYPE_MAP: ClassVar[Dict[int, npt.DTypeLike]] = {
-        elem_type: np_dtype
-        for elem_type, np_dtype in onnx.mapping.TENSOR_TYPE_TO_NP_TYPE.items()  # type: ignore
-    }
+    try:
+        ONNX_DTYPE_MAP: ClassVar[Dict[int, npt.DTypeLike]] = {
+            elem_type: np_dtype
+            for elem_type, np_dtype in onnx.mapping.TENSOR_TYPE_TO_NP_TYPE.items()  # type: ignore
+        }
+    except AttributeError:
+        ONNX_DTYPE_MAP = {
+            elem_type: mapping.np_dtype
+            for elem_type, mapping in onnx._mapping.TENSOR_TYPE_MAP.items()  # type: ignore[attr-defined]
+        }
 
     @staticmethod
     def _value_info_shape(value_info: ValueInfoProto) -> Tuple[Any, ...]:
@@ -13356,6 +14780,57 @@ def fold_constant_nodes(cls, model: ModelProto) -> ModelProto:
         folded_model.graph.initializer.extend(folded_initializers)
         return folded_model
 
+    @classmethod
+    def fold_static_cast_nodes(cls, model: ModelProto) -> ModelProto:
+        graph = model.graph
+        initializer_by_name = {
+            initializer.name: initializer for initializer in graph.initializer
+        }
+        initializer_names = set(initializer_by_name)
+        folded_initializers: List[TensorProto] = []
+        remaining_nodes: List[NodeProto] = []
+
+        for node in graph.node:
+            if (
+                node.op_type != "Cast"
+                or node.domain
+                or len(node.input) != 1
+                or len(node.output) != 1
+                or node.input[0] not in initializer_by_name
+                or node.output[0] in initializer_names
+            ):
+                remaining_nodes.append(node)
+                continue
+
+            to_attr = next((attr for attr in node.attribute if attr.name == "to"), None)
+            if to_attr is None:
+                remaining_nodes.append(node)
+                continue
+
+            try:
+                target_dtype = np.dtype(tensor_dtype_to_np_dtype(to_attr.i))
+                casted = onnx.numpy_helper.to_array(
+                    initializer_by_name[node.input[0]]
+                ).astype(target_dtype)
+                tensor = onnx.numpy_helper.from_array(casted, name=node.output[0])
+            except (TypeError, ValueError):
+                remaining_nodes.append(node)
+                continue
+
+            folded_initializers.append(tensor)
+            initializer_by_name[node.output[0]] = tensor
+            initializer_names.add(node.output[0])
+
+        if not folded_initializers:
+            return model
+
+        folded_model = ModelProto()
+        folded_model.CopyFrom(model)
+        del folded_model.graph.node[:]
+        folded_model.graph.node.extend(remaining_nodes)
+        folded_model.graph.initializer.extend(folded_initializers)
+        return folded_model
+
     @classmethod
     def _static_tensor_shape(
         cls, model_ir: ModelIR, name: str
@@ -13490,6 +14965,11 @@ def optimize_model_with_report(cls, model: ModelProto) -> ModelOptimizationResul
         if model is not before:
             applied_passes.append("fold_constant_nodes")
 
+        before = model
+        model = cls.fold_static_cast_nodes(model)
+        if model is not before:
+            applied_passes.append("fold_static_cast_nodes")
+
         before = model
         model = cls.fold_static_shape_nodes(model)
         if model is not before:
@@ -13535,6 +15015,7 @@ def build_ir(cls, model: ModelProto) -> ModelIR:
                 index=index,
                 name=node.name,
                 op_type=node.op_type,
+                domain=node.domain,
                 inputs=tuple(node.input),
                 outputs=tuple(node.output),
                 attributes=tuple(attr.name for attr in node.attribute),
@@ -13601,7 +15082,7 @@ def _isolate_layout_view_fallbacks(
         adjusted_nodes = []
         changed = False
         for node in nodes:
-            spec = onnx_operators.get(node.op_type)
+            spec = onnx_operators.get(node.op_type, node.domain)
             if (
                 spec is not None
                 and spec.is_layout_view
@@ -13643,7 +15124,7 @@ def _propagate_numpy_fallback_dependencies(
             operator_class = node.operator_class
             allowed = node.allowed
             reason = node.reason
-            spec = onnx_operators.get(node.op_type)
+            spec = onnx_operators.get(node.op_type, node.domain)
             if (
                 spec is not None
                 and spec.execution == OnnxOperator.EXECUTION_NATIVE_OR_NUMPY_RUNTIME
@@ -13771,7 +15252,7 @@ def _analyze_ir(
             if name
         }
         for node in model_ir.nodes:
-            operator = onnx_operators.get(node.op_type)
+            operator = onnx_operators.get(node.op_type, node.domain)
             if operator is None:
                 execution = OnnxOperator.EXECUTION_UNSUPPORTED
                 operator_class = OnnxOperator.CLASS_UNSUPPORTED
@@ -13805,6 +15286,7 @@ def _analyze_ir(
                     index=node.index,
                     name=node.name,
                     op_type=node.op_type,
+                    domain=node.domain,
                     execution=execution,
                     operator_class=operator_class,
                     inputs=node.inputs,
@@ -13891,6 +15373,7 @@ def prepare(
         ir_pipeline = cls.build_pipeline(model, fallback_policy=fallback_policy)
         model = ir_pipeline.optimized_model
         execution_plan = ir_pipeline.execution_plan
+        opset_imports = {opset.domain: opset.version for opset in model.opset_import}
         if (
             fallback_policy != ExecutionPlan.FALLBACK_COMPAT
             and not execution_plan.is_supported
@@ -14002,6 +15485,7 @@ def prepare(
                 ggml_weights_buffer=ggml_weights_buffer,
                 execution_plan=execution_plan,
                 ir_pipeline=ir_pipeline,
+                opset_imports=opset_imports,
             )
             cleanup.pop_all()
             return rep
diff --git a/tests/test_ggml_onnx.py b/tests/test_ggml_onnx.py
index 00f3772..e94f997 100644
--- a/tests/test_ggml_onnx.py
+++ b/tests/test_ggml_onnx.py
@@ -1,4 +1,5 @@
 import io
+import math
 import typing
 
 import numpy as np
@@ -1244,17 +1245,21 @@ def test_ggml_onnx_fallback_island_dispatches_reduce_sum_operator_numpy_evaluato
     model_input = helper.make_tensor_value_info("X", TensorProto.FLOAT, [2, 2])
     celu_output = helper.make_tensor_value_info("C", TensorProto.FLOAT, [2, 2])
     model_output = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 1])
+    reduce_sum_node = helper.make_node(
+        "ReduceSum",
+        ["C"],
+        ["Y"],
+        keepdims=1,
+        noop_with_empty_axes=0,
+    )
+    axes_attr = onnx_pb.AttributeProto()
+    axes_attr.name = "axes"
+    axes_attr.type = onnx_pb.AttributeProto.INTS
+    reduce_sum_node.attribute.append(axes_attr)
     graph = helper.make_graph(
         [
             helper.make_node("Celu", ["X"], ["C"], alpha=1.0),
-            helper.make_node(
-                "ReduceSum",
-                ["C"],
-                ["Y"],
-                axes=[],
-                keepdims=1,
-                noop_with_empty_axes=0,
-            ),
+            reduce_sum_node,
         ],
         "fallback_island_reduce_sum_operator_numpy_dispatch",
         [model_input],
@@ -2406,6 +2411,23 @@ def test_ggml_onnx_build_ir_tracks_constant_node_metadata():
     assert not constant_info.initializer
 
 
+def test_ggml_onnx_build_ir_tracks_node_domain():
+    model_input = helper.make_tensor_value_info("X", TensorProto.FLOAT, [2, 3])
+    model_output = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [2, 3])
+    node = helper.make_node("SiLU", ["X"], ["Y"], domain="com.ggml")
+    graph = helper.make_graph([node], "domain_ir", [model_input], [model_output])
+    model = helper.make_model(
+        graph,
+        producer_name="domain-ir",
+        opset_imports=[helper.make_opsetid("", 18), helper.make_opsetid("com.ggml", 1)],
+    )
+
+    model_ir = GgmlRuntimeBackend.build_ir(model)
+
+    assert model_ir.nodes[0].op_type == "SiLU"
+    assert model_ir.nodes[0].domain == "com.ggml"
+
+
 def test_ggml_onnx_build_pipeline_uses_optimized_typed_ir_and_plan():
     constant_node = helper.make_node("Constant", [], ["C"], value_ints=[1, 2, 3])
     identity_node = helper.make_node("Identity", ["C"], ["Y"])
@@ -2491,6 +2513,386 @@ def test_ggml_onnx_prepare_uses_folded_constant_model():
     np.testing.assert_array_equal(ggml_result[0], np.array([1, 2, 3], dtype=np.int64))
 
 
+@pytest.mark.parametrize(
+    "opset,scale,bias,scale_mode",
+    [
+        (
+            18,
+            np.asarray([0.5, 1.5], dtype=np.float32),
+            np.asarray([0.25, -0.5], dtype=np.float32),
+            "group",
+        ),
+        (
+            21,
+            np.asarray([0.5, 0.75, 1.25, 1.5], dtype=np.float32),
+            np.asarray([0.25, 0.5, -0.25, -0.5], dtype=np.float32),
+            "channel",
+        ),
+    ],
+)
+def test_ggml_onnx_group_normalization_uses_opset_scale_semantics(
+    opset: int,
+    scale: npt.NDArray[np.float32],
+    bias: npt.NDArray[np.float32],
+    scale_mode: str,
+):
+    x = np.arange(8, dtype=np.float32).reshape(1, 4, 2)
+    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 4, 2])
+    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 4, 2])
+    scale_tensor = onnx.numpy_helper.from_array(scale, name="scale")
+    bias_tensor = onnx.numpy_helper.from_array(bias, name="bias")
+    node = helper.make_node(
+        "GroupNormalization",
+        ["X", "scale", "bias"],
+        ["Y"],
+        num_groups=2,
+        epsilon=1e-5,
+    )
+    graph = helper.make_graph(
+        [node],
+        f"group_normalization_opset_{opset}",
+        [x_info],
+        [y_info],
+        [scale_tensor, bias_tensor],
+    )
+    model = helper.make_model(
+        graph,
+        producer_name=f"group-normalization-opset-{opset}",
+        opset_imports=[helper.make_opsetid("", opset)],
+    )
+
+    grouped = x.reshape(1, 2, 2, 2)
+    mean = np.mean(grouped, axis=(2, 3), keepdims=True)
+    variance = np.var(grouped, axis=(2, 3), keepdims=True)
+    normalized_grouped = (grouped - mean) / np.sqrt(variance + 1e-5)
+    if scale_mode == "group":
+        expected = (
+            scale.reshape(1, 2, 1, 1) * normalized_grouped + bias.reshape(1, 2, 1, 1)
+        ).reshape(1, 4, 2)
+    else:
+        expected = normalized_grouped.reshape(1, 4, 2)
+        expected = scale.reshape(1, 4, 1) * expected + bias.reshape(1, 4, 1)
+
+    ggml_model = GgmlRuntimeBackend.prepare(model)
+    actual = ggml_model.run({"X": x})
+
+    np.testing.assert_allclose(actual[0], expected, rtol=1e-5, atol=1e-5)
+    assert [node.op_type for node in ggml_model.fallback_nodes] == [
+        "GroupNormalization"
+    ]
+
+
+def run_onnxruntime_model(
+    model: onnx.ModelProto,
+    inputs: typing.Dict[str, npt.NDArray[typing.Any]],
+):
+    model.ir_version = min(model.ir_version, 10)
+    return InferenceSession(model.SerializeToString()).run(None, inputs)
+
+
+def run_onnx_operator_numpy_reference(
+    node: onnx.NodeProto,
+    inputs: typing.Sequence[npt.NDArray[typing.Any]],
+):
+    operator = onnx_operators.get(node.op_type, node.domain)
+    assert operator is not None
+    assert operator.has_numpy_evaluator
+    return operator.eval_numpy(node, tuple(inputs))
+
+
+@pytest.mark.parametrize(
+    "op_type,opset,attrs,initializers",
+    [
+        ("Gelu", 20, {}, []),
+        ("Gelu", 20, {"approximate": "tanh"}, []),
+        ("LpNormalization", 22, {"axis": 1, "p": 2}, []),
+        (
+            "RMSNormalization",
+            23,
+            {"axis": 1},
+            [
+                helper.make_tensor(
+                    "scale",
+                    TensorProto.FLOAT,
+                    [3],
+                    np.asarray([0.5, 1.0, 1.5], dtype=np.float32),
+                )
+            ],
+        ),
+    ],
+)
+def test_ggml_onnx_standard_native_ggml_mappings_match_numpy_reference(
+    op_type: str,
+    opset: int,
+    attrs: typing.Dict[str, typing.Any],
+    initializers: typing.List[onnx.TensorProto],
+):
+    model_input = helper.make_tensor_value_info("X", TensorProto.FLOAT, [2, 3])
+    model_output = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [2, 3])
+    inputs = ["X", *(initializer.name for initializer in initializers)]
+    node = helper.make_node(op_type, inputs, ["Y"], **attrs)
+    graph = helper.make_graph(
+        [node],
+        f"standard_native_{op_type.lower()}",
+        [model_input],
+        [model_output],
+        initializers,
+    )
+    model = helper.make_model(
+        graph,
+        producer_name=f"standard-native-{op_type.lower()}",
+        opset_imports=[helper.make_opsetid("", opset)],
+    )
+    input_data = {
+        "X": np.asarray([[-1.5, -0.5, 0.25], [0.75, 1.5, 2.25]], dtype=np.float32)
+    }
+    reference_inputs = [
+        input_data["X"],
+        *(onnx.numpy_helper.to_array(initializer) for initializer in initializers),
+    ]
+
+    expected = run_onnx_operator_numpy_reference(node, reference_inputs)
+    ggml_model = GgmlRuntimeBackend.prepare(model, fallback_policy="strict")
+    actual = ggml_model.run(input_data)
+
+    atol = 2e-4 if op_type == "Gelu" and attrs.get("approximate") == "tanh" else 1e-5
+    rtol = 2e-4 if op_type == "Gelu" and attrs.get("approximate") == "tanh" else 1e-5
+    np.testing.assert_allclose(actual[0], expected[0], rtol=rtol, atol=atol)
+    assert ggml_model.coverage_report.summary() == (
+        "100.0% native, 0.0% decomposed, 0.0% fallback, 0.0% unsupported"
+    )
+
+
+def test_ggml_onnx_attention_matches_numpy_reference_with_numpy_fallback():
+    q_info = helper.make_tensor_value_info("Q", TensorProto.FLOAT, [1, 2, 4])
+    k_info = helper.make_tensor_value_info("K", TensorProto.FLOAT, [1, 2, 4])
+    v_info = helper.make_tensor_value_info("V", TensorProto.FLOAT, [1, 2, 4])
+    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 2, 4])
+    node = helper.make_node(
+        "Attention",
+        ["Q", "K", "V"],
+        ["Y"],
+        q_num_heads=2,
+        kv_num_heads=2,
+    )
+    graph = helper.make_graph([node], "attention", [q_info, k_info, v_info], [y_info])
+    model = helper.make_model(
+        graph,
+        producer_name="attention",
+        opset_imports=[helper.make_opsetid("", 24)],
+    )
+    q = np.arange(8, dtype=np.float32).reshape(1, 2, 4) / 10.0
+    inputs = {"Q": q, "K": q + 0.1, "V": q + 0.2}
+
+    expected = run_onnx_operator_numpy_reference(
+        node, [inputs["Q"], inputs["K"], inputs["V"]]
+    )
+    ggml_model = GgmlRuntimeBackend.prepare(model)
+    actual = ggml_model.run(inputs)
+
+    np.testing.assert_allclose(actual[0], expected[0], rtol=1e-5, atol=1e-5)
+    assert [node.op_type for node in ggml_model.fallback_nodes] == ["Attention"]
+
+
+def test_ggml_onnx_rotary_embedding_matches_numpy_reference_with_numpy_fallback():
+    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 2, 4])
+    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 2, 4])
+    cos_values = np.asarray([[[0.8, 0.6], [0.5, 0.25]]], dtype=np.float32)
+    sin_values = np.asarray([[[0.2, 0.4], [0.5, 0.75]]], dtype=np.float32)
+    cos = helper.make_tensor("cos", TensorProto.FLOAT, [1, 2, 2], cos_values.ravel())
+    sin = helper.make_tensor("sin", TensorProto.FLOAT, [1, 2, 2], sin_values.ravel())
+    node = helper.make_node(
+        "RotaryEmbedding",
+        ["X", "cos", "sin"],
+        ["Y"],
+        num_heads=1,
+    )
+    graph = helper.make_graph(
+        [node], "rotary_embedding", [x_info], [y_info], [cos, sin]
+    )
+    model = helper.make_model(
+        graph,
+        producer_name="rotary-embedding",
+        opset_imports=[helper.make_opsetid("", 23)],
+    )
+    inputs = {"X": np.arange(8, dtype=np.float32).reshape(1, 2, 4)}
+
+    expected = run_onnx_operator_numpy_reference(
+        node, [inputs["X"], cos_values, sin_values]
+    )
+    ggml_model = GgmlRuntimeBackend.prepare(model)
+    actual = ggml_model.run(inputs)
+
+    np.testing.assert_allclose(actual[0], expected[0], rtol=1e-5, atol=1e-5)
+    assert [node.op_type for node in ggml_model.fallback_nodes] == ["RotaryEmbedding"]
+
+
+def make_ggml_extension_model(
+    op_type: str,
+    inputs: typing.Sequence[str],
+    outputs: typing.Sequence[str],
+    input_infos: typing.Sequence[onnx.ValueInfoProto],
+    output_infos: typing.Sequence[onnx.ValueInfoProto],
+    attrs: typing.Optional[typing.Dict[str, typing.Any]] = None,
+):
+    node = helper.make_node(
+        op_type,
+        list(inputs),
+        list(outputs),
+        domain="com.ggml",
+        **(attrs or {}),
+    )
+    graph = helper.make_graph(
+        [node], f"ggml_{op_type.lower()}", list(input_infos), list(output_infos)
+    )
+    model = helper.make_model(
+        graph,
+        producer_name=f"ggml-{op_type.lower()}",
+        opset_imports=[helper.make_opsetid("", 18), helper.make_opsetid("com.ggml", 1)],
+    )
+    return model
+
+
+@pytest.mark.parametrize("op_type", ["SiLU", "QuickGelu"])
+def test_ggml_onnx_unary_extension_ops_run_native(op_type: str):
+    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [2, 3])
+    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [2, 3])
+    model = make_ggml_extension_model(op_type, ["X"], ["Y"], [x_info], [y_info])
+    x = np.linspace(-2.0, 2.0, 6, dtype=np.float32).reshape(2, 3)
+    if op_type == "SiLU":
+        expected = x / (1.0 + np.exp(-x))
+    else:
+        expected = x / (1.0 + np.exp(-1.702 * x))
+
+    ggml_model = GgmlRuntimeBackend.prepare(model, fallback_policy="strict")
+    actual = ggml_model.run({"X": x})
+
+    tolerance = 5e-4 if op_type == "QuickGelu" else 1e-5
+    np.testing.assert_allclose(actual[0], expected, rtol=tolerance, atol=tolerance)
+    assert not ggml_model.fallback_nodes
+
+
+@pytest.mark.parametrize(
+    "op_type", ["ReGLU", "GeGLU", "SwiGLU", "GeGLUErf", "GeGLUQuick"]
+)
+def test_ggml_onnx_glu_extension_ops_run_native(op_type: str):
+    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [2, 4])
+    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [2, 2])
+    model = make_ggml_extension_model(op_type, ["X"], ["Y"], [x_info], [y_info])
+    x = np.linspace(-2.0, 2.0, 8, dtype=np.float32).reshape(2, 4)
+    gate = x[:, :2]
+    values = x[:, 2:]
+    if op_type == "ReGLU":
+        activation = np.maximum(gate, 0)
+    elif op_type == "SwiGLU":
+        activation = gate / (1.0 + np.exp(-gate))
+    elif op_type == "GeGLUQuick":
+        activation = gate / (1.0 + np.exp(-1.702 * gate))
+    else:
+        erf = np.vectorize(math.erf)
+        if op_type == "GeGLUErf":
+            activation = 0.5 * gate * (1.0 + erf(gate / np.sqrt(2.0)))
+        else:
+            inner = np.sqrt(2.0 / np.pi) * (gate + 0.044715 * np.power(gate, 3))
+            activation = 0.5 * gate * (1.0 + np.tanh(inner))
+    expected = values * activation
+
+    ggml_model = GgmlRuntimeBackend.prepare(model, fallback_policy="strict")
+    actual = ggml_model.run({"X": x})
+
+    tolerance = 1e-3 if op_type != "ReGLU" else 1e-5
+    np.testing.assert_allclose(actual[0], expected, rtol=tolerance, atol=tolerance)
+    assert not ggml_model.fallback_nodes
+
+
+def test_ggml_onnx_flash_attention_extension_runs_native():
+    q_info = helper.make_tensor_value_info("Q", TensorProto.FLOAT, [1, 1, 2, 2])
+    k_info = helper.make_tensor_value_info("K", TensorProto.FLOAT, [1, 1, 2, 2])
+    v_info = helper.make_tensor_value_info("V", TensorProto.FLOAT, [1, 1, 2, 2])
+    y_info = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 1, 2, 2])
+    model = make_ggml_extension_model(
+        "FlashAttention",
+        ["Q", "K", "V"],
+        ["Y"],
+        [q_info, k_info, v_info],
+        [y_info],
+    )
+    q = np.asarray([[[[0.1, 0.2], [0.3, 0.4]]]], dtype=np.float32)
+    k = np.asarray([[[[0.2, 0.1], [0.5, 0.6]]]], dtype=np.float32)
+    v = np.asarray([[[[1.0, 2.0], [3.0, 4.0]]]], dtype=np.float32)
+    scores = np.matmul(q, np.swapaxes(k, -1, -2)) / np.sqrt(2.0)
+    probabilities = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
+    probabilities = probabilities / np.sum(probabilities, axis=-1, keepdims=True)
+    expected = np.matmul(probabilities, v)
+
+    ggml_model = GgmlRuntimeBackend.prepare(model, fallback_policy="strict")
+    actual = ggml_model.run({"Q": q, "K": k, "V": v})
+
+    np.testing.assert_allclose(actual[0], expected, rtol=1e-5, atol=1e-5)
+    assert not ggml_model.fallback_nodes
+
+
+def test_ggml_onnx_misc_extension_ops_run_native():
+    x_info = helper.make_tensor_value_info("X", TensorProto.FLOAT, [2, 3])
+    roll_output = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [2, 3])
+    roll_model = make_ggml_extension_model(
+        "Roll", ["X"], ["Y"], [x_info], [roll_output], {"shifts": [1, 0]}
+    )
+    x = np.arange(6, dtype=np.float32).reshape(2, 3)
+    roll_runtime = GgmlRuntimeBackend.prepare(roll_model, fallback_policy="strict")
+    roll_actual = roll_runtime.run({"X": x})[0]
+    np.testing.assert_allclose(roll_actual, np.roll(x, 1, axis=0))
+
+    argsort_output = helper.make_tensor_value_info("Y", TensorProto.INT32, [2, 3])
+    argsort_model = make_ggml_extension_model(
+        "ArgSort", ["X"], ["Y"], [x_info], [argsort_output]
+    )
+    argsort_runtime = GgmlRuntimeBackend.prepare(
+        argsort_model, fallback_policy="strict"
+    )
+    argsort_actual = argsort_runtime.run(
+        {"X": np.asarray([[3.0, 1.0, 2.0], [0.0, 2.0, 1.0]], dtype=np.float32)}
+    )[0]
+    np.testing.assert_array_equal(
+        argsort_actual,
+        np.asarray([[1, 2, 0], [0, 2, 1]], dtype=np.int32),
+    )
+
+
+def test_ggml_onnx_registers_forward_ggml_extension_ops():
+    expected_ops = {
+        "AddRelPos",
+        "ArgSort",
+        "Fill",
+        "FlashAttention",
+        "GatedDeltaNet",
+        "GatedLinearAttention",
+        "GeGLU",
+        "GeGLUErf",
+        "GeGLUQuick",
+        "GetRelPos",
+        "ReGLU",
+        "RWKVWKV6",
+        "RWKVWKV7",
+        "Rope",
+        "Roll",
+        "SSMConv",
+        "SSMScan",
+        "SiLU",
+        "SwiGLU",
+        "SwiGLUOAI",
+        "TimestepEmbedding",
+        "WindowPartition",
+        "WindowUnpartition",
+    }
+
+    assert {
+        op_type
+        for domain, op_type in onnx_operators.domain_operators
+        if domain == "com.ggml"
+    }.issuperset(expected_ops)
+
+
 def test_ggml_onnx_fold_static_shape_nodes_folds_shape_and_size():
     model_input = helper.make_tensor_value_info("X", TensorProto.FLOAT, [2, 3, 4])
     shape_output = helper.make_tensor_value_info("S", TensorProto.INT64, [1])
diff --git a/tests/test_ggml_onnx_hypothesis.py b/tests/test_ggml_onnx_hypothesis.py
index a06fecb..691e2fd 100644
--- a/tests/test_ggml_onnx_hypothesis.py
+++ b/tests/test_ggml_onnx_hypothesis.py
@@ -92,6 +92,7 @@
         "Expand",
         "Flatten",
         "Gather",
+        "Gelu",
         "GlobalAveragePool",
         "GlobalMaxPool",
         "HardSigmoid",
@@ -99,6 +100,7 @@
         "Identity",
         "LeakyRelu",
         "Log",
+        "LpNormalization",
         "MatMul",
         "MaxPool",
         "Mean",
@@ -112,6 +114,7 @@
         "ReduceSumSquare",
         "Relu",
         "Reshape",
+        "RMSNormalization",
         "Sigmoid",
         "Sign",
         "Slice",
@@ -132,6 +135,7 @@
     {
         "Abs",
         "Elu",
+        "Gelu",
         "Identity",
         "LeakyRelu",
         "Log",
@@ -167,6 +171,22 @@
     for shape in GRAPH_SHAPES
     if len(shape) <= 4 and any(dim >= 2 for dim in shape)
 )
+MIN_ONNX_OPSET_BY_OP_TYPE = {
+    "Gelu": 20,
+    "LpNormalization": 22,
+    "RMSNormalization": 23,
+}
+
+
+def minimum_opset_version(op_types: typing.Iterable[str]) -> int:
+    return max(
+        [18]
+        + [
+            MIN_ONNX_OPSET_BY_OP_TYPE[op_type]
+            for op_type in op_types
+            if op_type in MIN_ONNX_OPSET_BY_OP_TYPE
+        ]
+    )
 
 
 @dataclass(frozen=True)
@@ -543,6 +563,7 @@ def output_domain(op: GeneratedOpSpec, input_spec: TensorSpec) -> ValueDomain:
         "Abs",
         "Clip",
         "HardSigmoid",
+        "LpNormalization",
         "Pow",
         "ReduceL1",
         "ReduceL2",
@@ -582,6 +603,7 @@ def op_case_output_domain(
         "Abs",
         "Clip",
         "HardSigmoid",
+        "LpNormalization",
         "Pow",
         "ReduceL1",
         "ReduceL2",
@@ -653,8 +675,13 @@ def build_graph_ir(
     ops: typing.Sequence[GeneratedOpSpec],
     fallback_start: typing.Optional[int],
     input_domain: ValueDomain = ValueDomain.ANY_FLOAT,
-    opset_version: int = 18,
+    opset_version: typing.Optional[int] = None,
 ) -> TestGraphIR:
+    resolved_opset_version = (
+        minimum_opset_version(op.op_type for op in ops)
+        if opset_version is None
+        else opset_version
+    )
     input_spec = TensorSpec(
         "X",
         input_shape,
@@ -681,7 +708,7 @@ def build_graph_ir(
         outputs=(current_spec,),
         input_values={"X": np.asarray(input_array, dtype=np.float32)},
         fallback_start=fallback_start,
-        opset_version=opset_version,
+        opset_version=resolved_opset_version,
     )
 
 
@@ -693,9 +720,14 @@ def build_direct_graph_case(
     input_values: typing.Dict[str, npt.NDArray[typing.Any]],
     fallback_start: typing.Optional[int],
     branch_count: int = 0,
-    opset_version: int = 18,
+    opset_version: typing.Optional[int] = None,
     expected_fallback_indices: typing.Sequence[int] = (),
 ) -> GeneratedGraphCase:
+    resolved_opset_version = (
+        minimum_opset_version(op.op_type for op in ops)
+        if opset_version is None
+        else opset_version
+    )
     ir = TestGraphIR(
         name=description,
         inputs=tuple(inputs),
@@ -703,7 +735,7 @@ def build_direct_graph_case(
         outputs=tuple(outputs),
         input_values=input_values,
         fallback_start=fallback_start,
-        opset_version=opset_version,
+        opset_version=resolved_opset_version,
     )
     model = to_onnx_model(ir)
     return GeneratedGraphCase(
@@ -758,7 +790,7 @@ def build_model_case(
     ops: typing.Sequence[GeneratedOpSpec],
     fallback_start: typing.Optional[int],
     input_domain: ValueDomain = ValueDomain.ANY_FLOAT,
-    opset_version: int = 18,
+    opset_version: typing.Optional[int] = None,
     expected_fallback_indices: typing.Sequence[int] = (),
 ) -> GeneratedGraphCase:
     ir = build_graph_ir(
@@ -798,6 +830,8 @@ def op_input_shapes(op_type: str) -> typing.Tuple[typing.Tuple[int, ...], ...]:
         return tuple(shape for shape in GRAPH_SHAPES if len(shape) <= 4)
     if op_type in {"GlobalAveragePool", "GlobalMaxPool"}:
         return ((1, 1, 3, 3), (1, 2, 4, 4), (2, 1, 5, 4))
+    if op_type in {"LpNormalization", "RMSNormalization"}:
+        return tuple(shape for shape in GRAPH_SHAPES if len(shape) <= 4)
     if op_type == "MatMul":
         return ((1, 1), (1, 3), (2, 1), (2, 3), (3, 2))
     if op_type in NATIVE_REDUCE_ALL_OP_TYPES:
@@ -846,6 +880,8 @@ def canonical_input_shape(op_type: str) -> typing.Tuple[int, ...]:
         return (2, 3)
     if op_type in NATIVE_REDUCE_ALL_OP_TYPES:
         return (2, 3)
+    if op_type in {"LpNormalization", "RMSNormalization"}:
+        return (2, 3)
     if op_type == "Flatten":
         return (2, 3, 4)
     if op_type == "Reshape":
@@ -869,8 +905,23 @@ def canonical_generated_op(
         return GeneratedOpSpec(op_type, shape, {"alpha": 1.0 / 6.0, "beta": 0.5})
     if op_type == "HardSwish":
         return GeneratedOpSpec(op_type, shape, {})
+    if op_type == "RMSNormalization":
+        scale = np.linspace(0.5, 1.5, shape[-1], dtype=np.float32)
+        return GeneratedOpSpec(
+            op_type,
+            shape,
+            {"axis": len(shape) - 1},
+            (make_float_initializer("scale", scale),),
+        )
+    if op_type == "LpNormalization":
+        return GeneratedOpSpec(op_type, shape, {"axis": -1, "p": 2})
     if op_type in UNARY_OP_TYPES:
-        attrs = {"alpha": 0.01} if op_type == "LeakyRelu" else {}
+        if op_type == "LeakyRelu":
+            attrs = {"alpha": 0.01}
+        elif op_type == "Gelu":
+            attrs = {"approximate": "none"}
+        else:
+            attrs = {}
         return GeneratedOpSpec(op_type, shape, attrs)
     if op_type in {"ArgMax", "ArgMin"}:
         return GeneratedOpSpec(
@@ -1113,8 +1164,25 @@ def generated_op_strategy(
     if op_type == "HardSwish":
         return GeneratedOpSpec(op_type, shape, {})
 
+    if op_type == "RMSNormalization":
+        scale = draw(float_array_strategy((shape[-1],), POSITIVE_FLOAT32_VALUES))
+        return GeneratedOpSpec(
+            op_type,
+            shape,
+            {"axis": len(shape) - 1},
+            (make_float_initializer("scale", scale),),
+        )
+
+    if op_type == "LpNormalization":
+        return GeneratedOpSpec(op_type, shape, {"axis": -1, "p": 2})
+
     if op_type in UNARY_OP_TYPES:
-        attrs = {"alpha": 0.01} if op_type == "LeakyRelu" else {}
+        if op_type == "LeakyRelu":
+            attrs = {"alpha": 0.01}
+        elif op_type == "Gelu":
+            attrs = {"approximate": "none"}
+        else:
+            attrs = {}
         return GeneratedOpSpec(op_type, shape, attrs)
 
     if op_type in {"ArgMax", "ArgMin"}:
@@ -1979,6 +2047,8 @@ def available_native_ops_for_spec(
     ops.add("Clip")
     if shape:
         ops.update(NATIVE_REDUCE_ALL_OP_TYPES)
+        ops.add("LpNormalization")
+        ops.add("RMSNormalization")
         ops.add("Softmax")
     if shape and len(shape) <= 4:
         ops.add("Concat")