From 71f5447652249d6f66c20ce0da419ddb2a48b11c Mon Sep 17 00:00:00 2001
From: mygitljf <2410316423@qq.com>
Date: Mon, 18 May 2026 12:46:53 +0000
Subject: [PATCH 1/2] [2026 spring][T1-1-1] add
 rad2deg/copysign/lcm/nextafter/lgamma Python dispatch and tests

Wires the five T1-1-1 operators through infinicore.ntops.torch on CUDA:

- python/infinicore/ops/{rad2deg,copysign,lcm,nextafter,lgamma}.py: thin
  dispatchers calling infinicore.ntops.torch.<op>.
- python/infinicore/__init__.py: re-export the five ops.
- test/infinicore/ops/{rad2deg,copysign,lcm,nextafter,lgamma}.py: framework
  tests covering OUT_OF_PLACE and INPLACE(out=c) on float16/bfloat16/float32
  (lcm: int8/int16/int32/int64). copysign, lcm and nextafter are asserted
  bit-exact against torch (atol = rtol = 0); lgamma also ran bit-exact, but
  its test, like rad2deg's, only enforces per-dtype tolerances.

Verified on NVIDIA A100 80GB PCIe with --nvidia (172/172 passed).
---
 python/infinicore/__init__.py      |  10 +++
 python/infinicore/ops/copysign.py  |   8 +++
 python/infinicore/ops/lcm.py       |   8 +++
 python/infinicore/ops/lgamma.py    |   8 +++
 python/infinicore/ops/nextafter.py |   8 +++
 python/infinicore/ops/rad2deg.py   |   8 +++
 test/infinicore/ops/copysign.py    |  96 +++++++++++++++++++++++++++
 test/infinicore/ops/lcm.py         | 102 +++++++++++++++++++++++++++++
 test/infinicore/ops/lgamma.py      |  91 +++++++++++++++++++++++++
 test/infinicore/ops/nextafter.py   |  96 +++++++++++++++++++++++++++
 test/infinicore/ops/rad2deg.py     |  91 +++++++++++++++++++++++++
 11 files changed, 526 insertions(+)
 create mode 100644 python/infinicore/ops/copysign.py
 create mode 100644 python/infinicore/ops/lcm.py
 create mode 100644 python/infinicore/ops/lgamma.py
 create mode 100644 python/infinicore/ops/nextafter.py
 create mode 100644 python/infinicore/ops/rad2deg.py
 create mode 100644 test/infinicore/ops/copysign.py
 create mode 100644 test/infinicore/ops/lcm.py
 create mode 100644 test/infinicore/ops/lgamma.py
 create mode 100644 test/infinicore/ops/nextafter.py
 create mode 100644 test/infinicore/ops/rad2deg.py
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 8c9adc64c..48d3a6662 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -76,6 +76,7 @@
 from infinicore.ops.broadcast_to import broadcast_to
 from infinicore.ops.cat import cat
 from infinicore.ops.cdist import cdist
+from infinicore.ops.copysign import copysign
 from infinicore.ops.cross_entropy import cross_entropy
 from infinicore.ops.diff import diff
 from infinicore.ops.digamma import digamma
@@ -94,8 +95,10 @@
 from infinicore.ops.kron import kron
 from infinicore.ops.kthvalue import kthvalue
 from infinicore.ops.kv_caching import kv_caching
+from infinicore.ops.lcm import lcm
 from infinicore.ops.ldexp import ldexp
 from infinicore.ops.lerp import lerp
+from infinicore.ops.lgamma import lgamma
 from infinicore.ops.logaddexp import logaddexp
 from infinicore.ops.logaddexp2 import logaddexp2
 from infinicore.ops.logcumsumexp import logcumsumexp
@@ -108,10 +111,12 @@
 from infinicore.ops.mha_varlen import mha_varlen
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
+from infinicore.ops.nextafter import nextafter
 from infinicore.ops.nrm2 import nrm2
 from infinicore.ops.paged_attention import paged_attention
 from infinicore.ops.paged_attention_prefill import paged_attention_prefill
 from infinicore.ops.paged_caching import paged_caching
+from infinicore.ops.rad2deg import rad2deg
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.reciprocal import reciprocal
 from infinicore.ops.rot import rot
@@ -279,6 +284,11 @@
     "var",
     "topk",
     "all",
+    "copysign",
+    "lcm",
+    "lgamma",
+    "nextafter",
+    "rad2deg",
     "set_printoptions",
     "printoptions",
 ]
diff --git a/python/infinicore/ops/copysign.py b/python/infinicore/ops/copysign.py
new file mode 100644
index 000000000..a0067aede
--- /dev/null
+++ b/python/infinicore/ops/copysign.py
@@ -0,0 +1,25 @@
+import infinicore
+from infinicore.tensor import Tensor
+
+
+def copysign(input: Tensor, other: Tensor, *, out=None) -> Tensor:
+    r"""Computes element-wise copysign: magnitude of input with sign of other.
+
+    Thin dispatcher: the call is forwarded unchanged to
+    ``infinicore.ntops.torch.copysign``; no other backend is wired up here.
+
+    Args:
+        input: Tensor supplying the magnitudes.
+        other: Tensor supplying the signs.
+        out: Optional destination, passed through as ``out=``.
+
+    Returns:
+        The result of ``infinicore.ntops.torch.copysign``.
+
+    Raises:
+        RuntimeError: If ``infinicore.use_ntops`` is falsy.
+    """
+    # Explicit check instead of ``assert`` so the guard survives ``python -O``.
+    if not infinicore.use_ntops:
+        raise RuntimeError("infinicore.copysign requires the ntops backend")
+    return infinicore.ntops.torch.copysign(input, other, out=out)
diff --git a/python/infinicore/ops/lcm.py b/python/infinicore/ops/lcm.py
new file mode 100644
index 000000000..835d7721d
--- /dev/null
+++ b/python/infinicore/ops/lcm.py
@@ -0,0 +1,25 @@
+import infinicore
+from infinicore.tensor import Tensor
+
+
+def lcm(input: Tensor, other: Tensor, *, out=None) -> Tensor:
+    r"""Computes element-wise least common multiple. Integer dtypes only.
+
+    Thin dispatcher: the call is forwarded unchanged to
+    ``infinicore.ntops.torch.lcm``; no other backend is wired up here.
+
+    Args:
+        input: First operand.
+        other: Second operand.
+        out: Optional destination, passed through as ``out=``.
+
+    Returns:
+        The result of ``infinicore.ntops.torch.lcm``.
+
+    Raises:
+        RuntimeError: If ``infinicore.use_ntops`` is falsy.
+    """
+    # Explicit check instead of ``assert`` so the guard survives ``python -O``.
+    if not infinicore.use_ntops:
+        raise RuntimeError("infinicore.lcm requires the ntops backend")
+    return infinicore.ntops.torch.lcm(input, other, out=out)
diff --git a/python/infinicore/ops/lgamma.py b/python/infinicore/ops/lgamma.py
new file mode 100644
index 000000000..516002d98
--- /dev/null
+++ b/python/infinicore/ops/lgamma.py
@@ -0,0 +1,24 @@
+import infinicore
+from infinicore.tensor import Tensor
+
+
+def lgamma(input: Tensor, *, out=None) -> Tensor:
+    r"""Computes element-wise natural logarithm of the absolute value of the gamma function.
+
+    Thin dispatcher: the call is forwarded unchanged to
+    ``infinicore.ntops.torch.lgamma``; no other backend is wired up here.
+
+    Args:
+        input: Tensor to evaluate element-wise.
+        out: Optional destination, passed through as ``out=``.
+
+    Returns:
+        The result of ``infinicore.ntops.torch.lgamma``.
+
+    Raises:
+        RuntimeError: If ``infinicore.use_ntops`` is falsy.
+    """
+    # Explicit check instead of ``assert`` so the guard survives ``python -O``.
+    if not infinicore.use_ntops:
+        raise RuntimeError("infinicore.lgamma requires the ntops backend")
+    return infinicore.ntops.torch.lgamma(input, out=out)
diff --git a/python/infinicore/ops/nextafter.py b/python/infinicore/ops/nextafter.py
new file mode 100644
index 000000000..12d9b6a6c
--- /dev/null
+++ b/python/infinicore/ops/nextafter.py
@@ -0,0 +1,25 @@
+import infinicore
+from infinicore.tensor import Tensor
+
+
+def nextafter(input: Tensor, other: Tensor, *, out=None) -> Tensor:
+    r"""Returns the next representable float value of input toward other, element-wise.
+
+    Thin dispatcher: the call is forwarded unchanged to
+    ``infinicore.ntops.torch.nextafter``; no other backend is wired up here.
+
+    Args:
+        input: Tensor of starting values.
+        other: Tensor giving the direction to step toward.
+        out: Optional destination, passed through as ``out=``.
+
+    Returns:
+        The result of ``infinicore.ntops.torch.nextafter``.
+
+    Raises:
+        RuntimeError: If ``infinicore.use_ntops`` is falsy.
+    """
+    # Explicit check instead of ``assert`` so the guard survives ``python -O``.
+    if not infinicore.use_ntops:
+        raise RuntimeError("infinicore.nextafter requires the ntops backend")
+    return infinicore.ntops.torch.nextafter(input, other, out=out)
diff --git a/python/infinicore/ops/rad2deg.py b/python/infinicore/ops/rad2deg.py
new file mode 100644
index 000000000..98d28d487
--- /dev/null
+++ b/python/infinicore/ops/rad2deg.py
@@ -0,0 +1,24 @@
+import infinicore
+from infinicore.tensor import Tensor
+
+
+def rad2deg(input: Tensor, *, out=None) -> Tensor:
+    r"""Converts angles in radians to degrees, element-wise.
+
+    Thin dispatcher: the call is forwarded unchanged to
+    ``infinicore.ntops.torch.rad2deg``; no other backend is wired up here.
+
+    Args:
+        input: Tensor of angles in radians.
+        out: Optional destination, passed through as ``out=``.
+
+    Returns:
+        The result of ``infinicore.ntops.torch.rad2deg``.
+
+    Raises:
+        RuntimeError: If ``infinicore.use_ntops`` is falsy.
+    """
+    # Explicit check instead of ``assert`` so the guard survives ``python -O``.
+    if not infinicore.use_ntops:
+        raise RuntimeError("infinicore.rad2deg requires the ntops backend")
+    return infinicore.ntops.torch.rad2deg(input, out=out)
diff --git a/test/infinicore/ops/copysign.py b/test/infinicore/ops/copysign.py
new file mode 100644
index 000000000..cf1d6a597
--- /dev/null
+++ b/test/infinicore/ops/copysign.py
@@ -0,0 +1,108 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import infinicore
+import torch
+from framework import (
+    BaseOperatorTest,
+    GenericTestRunner,
+    TensorSpec,
+    TestCase,
+    is_broadcast,
+)
+
+
+# Each entry is (shape, a_strides, b_strides, c_strides); None strides are
+# handed to TensorSpec.from_tensor unchanged.
+_TEST_CASES_DATA = [
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), None),
+    ((16, 5632), None, None, None),
+]
+
+# atol = rtol = 0 for every dtype: no deviation from torch is tolerated.
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 0, "rtol": 0},
+    infinicore.float32: {"atol": 0, "rtol": 0},
+    infinicore.bfloat16: {"atol": 0, "rtol": 0},
+}
+
+_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
+
+
+def parse_test_cases():
+    """Build the copysign TestCase list for every shape/stride/dtype combo.
+
+    Each combination yields an OUT_OF_PLACE case (no output_spec) and, unless
+    is_broadcast(c_strides) is true, an INPLACE(out) case using ``c_spec``.
+    """
+    test_cases = []
+    for data in _TEST_CASES_DATA:
+        shape = data[0]
+        a_strides = data[1] if len(data) > 1 else None
+        b_strides = data[2] if len(data) > 2 else None
+        c_strides = data[3] if len(data) > 3 else None
+
+        c_supports_inplace = not is_broadcast(c_strides)
+
+        for dtype in _TENSOR_DTYPES:
+            # Fall back to exact comparison as well, so a dtype missing from
+            # the map can never silently loosen the zero-tolerance check.
+            tol = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0})
+            a_spec = TensorSpec.from_tensor(shape, a_strides, dtype, name="a")
+            b_spec = TensorSpec.from_tensor(shape, b_strides, dtype, name="b")
+            c_spec = TensorSpec.from_tensor(shape, c_strides, dtype, name="c")
+
+            test_cases.append(
+                TestCase(
+                    inputs=[a_spec, b_spec],
+                    kwargs={},
+                    output_spec=None,
+                    comparison_target=None,
+                    tolerance=tol,
+                    description="copysign - OUT_OF_PLACE",
+                )
+            )
+
+            if c_supports_inplace:
+                test_cases.append(
+                    TestCase(
+                        inputs=[a_spec, b_spec],
+                        kwargs=None,
+                        output_spec=c_spec,
+                        comparison_target="out",
+                        tolerance=tol,
+                        description="copysign - INPLACE(out)",
+                    )
+                )
+
+    return test_cases
+
+
+class OpTest(BaseOperatorTest):
+    """Exposes torch.copysign and infinicore.copysign to the test framework."""
+
+    def __init__(self):
+        super().__init__("Copysign")
+
+    def get_test_cases(self):
+        return parse_test_cases()
+
+    def torch_operator(self, *args, **kwargs):
+        return torch.copysign(*args, **kwargs)
+
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.copysign(*args, **kwargs)
+
+
+def main():
+    runner = GenericTestRunner(OpTest)
+    runner.run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/lcm.py b/test/infinicore/ops/lcm.py
new file mode 100644
index 000000000..4278231c5
--- /dev/null
+++ b/test/infinicore/ops/lcm.py
@@ -0,0 +1,112 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import infinicore
+import torch
+from framework import (
+    BaseOperatorTest,
+    GenericTestRunner,
+    TensorSpec,
+    TestCase,
+    is_broadcast,
+)
+
+
+# Each entry is (shape, a_strides, b_strides, c_strides); None strides are
+# handed to TensorSpec.from_tensor unchanged.
+_TEST_CASES_DATA = [
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), None),
+    ((16, 5632), None, None, None),
+]
+
+# atol = rtol = 0 for every dtype: no deviation from torch is tolerated.
+_TOLERANCE_MAP = {
+    infinicore.int8: {"atol": 0, "rtol": 0},
+    infinicore.int16: {"atol": 0, "rtol": 0},
+    infinicore.int32: {"atol": 0, "rtol": 0},
+    infinicore.int64: {"atol": 0, "rtol": 0},
+}
+
+_TENSOR_DTYPES = [
+    infinicore.int8,
+    infinicore.int16,
+    infinicore.int32,
+    infinicore.int64,
+]
+
+
+def parse_test_cases():
+    """Build the lcm TestCase list for every shape/stride/dtype combo.
+
+    Each combination yields an OUT_OF_PLACE case (no output_spec) and, unless
+    is_broadcast(c_strides) is true, an INPLACE(out) case using ``c_spec``.
+    """
+    test_cases = []
+    for data in _TEST_CASES_DATA:
+        shape = data[0]
+        a_strides = data[1] if len(data) > 1 else None
+        b_strides = data[2] if len(data) > 2 else None
+        c_strides = data[3] if len(data) > 3 else None
+
+        c_supports_inplace = not is_broadcast(c_strides)
+
+        for dtype in _TENSOR_DTYPES:
+            tol = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0})
+            a_spec = TensorSpec.from_tensor(shape, a_strides, dtype, name="a")
+            b_spec = TensorSpec.from_tensor(shape, b_strides, dtype, name="b")
+            c_spec = TensorSpec.from_tensor(shape, c_strides, dtype, name="c")
+
+            test_cases.append(
+                TestCase(
+                    inputs=[a_spec, b_spec],
+                    kwargs={},
+                    output_spec=None,
+                    comparison_target=None,
+                    tolerance=tol,
+                    description="lcm - OUT_OF_PLACE",
+                )
+            )
+
+            if c_supports_inplace:
+                test_cases.append(
+                    TestCase(
+                        inputs=[a_spec, b_spec],
+                        kwargs=None,
+                        output_spec=c_spec,
+                        comparison_target="out",
+                        tolerance=tol,
+                        description="lcm - INPLACE(out)",
+                    )
+                )
+
+    return test_cases
+
+
+class OpTest(BaseOperatorTest):
+    """Exposes torch.lcm and infinicore.lcm to the test framework."""
+
+    def __init__(self):
+        super().__init__("Lcm")
+
+    def get_test_cases(self):
+        return parse_test_cases()
+
+    def torch_operator(self, *args, **kwargs):
+        return torch.lcm(*args, **kwargs)
+
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.lcm(*args, **kwargs)
+
+
+def main():
+    runner = GenericTestRunner(OpTest)
+    runner.run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/lgamma.py b/test/infinicore/ops/lgamma.py
new file mode 100644
index 000000000..9bd6b5cf0
--- /dev/null
+++ b/test/infinicore/ops/lgamma.py
@@ -0,0 +1,102 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import infinicore
+import torch
+from framework import (
+    BaseOperatorTest,
+    GenericTestRunner,
+    TensorSpec,
+    TestCase,
+    is_broadcast,  # NOTE(review): not referenced anywhere in this file
+)
+
+
+# Each entry is (shape, in_strides); None strides are handed to
+# TensorSpec.from_tensor unchanged.
+_TEST_CASES_DATA = [
+    ((13, 4), None),
+    ((13, 4), (10, 1)),
+    ((8, 16), None),
+    ((8, 16), (40, 1)),
+    ((2, 3, 4), None),
+    ((16, 5632), None),
+]
+
+# Per-dtype comparison tolerances attached to every generated TestCase.
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 1e-2, "rtol": 1e-2},
+    infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
+    infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2},
+}
+
+_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
+
+
+def parse_test_cases():
+    """Build the lgamma TestCase list for every shape/stride/dtype combo.
+
+    Each combination yields an OUT_OF_PLACE case (no output_spec) and an
+    INPLACE(out) case whose output_spec is ``out_spec``.
+    """
+    test_cases = []
+    for data in _TEST_CASES_DATA:
+        shape = data[0]
+        in_strides = data[1] if len(data) > 1 else None
+
+        for dtype in _TENSOR_DTYPES:
+            tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
+            input_spec = TensorSpec.from_tensor(shape, in_strides, dtype)
+            # The output spec is always built with None in the strides slot.
+            out_spec = TensorSpec.from_tensor(shape, None, dtype)
+
+            test_cases.append(
+                TestCase(
+                    inputs=[input_spec],
+                    kwargs={},
+                    output_spec=None,
+                    comparison_target=None,
+                    tolerance=tol,
+                    description="lgamma - OUT_OF_PLACE",
+                )
+            )
+
+            test_cases.append(
+                TestCase(
+                    inputs=[input_spec],
+                    kwargs=None,
+                    output_spec=out_spec,
+                    comparison_target="out",
+                    tolerance=tol,
+                    description="lgamma - INPLACE(out)",
+                )
+            )
+
+    return test_cases
+
+
+class OpTest(BaseOperatorTest):
+    """Exposes torch.lgamma and infinicore.lgamma to the test framework."""
+
+    def __init__(self):
+        super().__init__("Lgamma")
+
+    def get_test_cases(self):
+        return parse_test_cases()
+
+    def torch_operator(self, *args, **kwargs):
+        return torch.lgamma(*args, **kwargs)
+
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.lgamma(*args, **kwargs)
+
+
+def main():
+    runner = GenericTestRunner(OpTest)
+    runner.run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/nextafter.py b/test/infinicore/ops/nextafter.py
new file mode 100644
index 000000000..68ed8ecc1
--- /dev/null
+++ b/test/infinicore/ops/nextafter.py
@@ -0,0 +1,106 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import infinicore
+import torch
+from framework import (
+    BaseOperatorTest,
+    GenericTestRunner,
+    TensorSpec,
+    TestCase,
+    is_broadcast,
+)
+
+
+# Each entry is (shape, a_strides, b_strides, c_strides); None strides are
+# handed to TensorSpec.from_tensor unchanged.
+_TEST_CASES_DATA = [
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), None),
+    ((16, 5632), None, None, None),
+]
+
+# atol = rtol = 0 for every dtype: no deviation from torch is tolerated.
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 0, "rtol": 0},
+    infinicore.float32: {"atol": 0, "rtol": 0},
+    infinicore.bfloat16: {"atol": 0, "rtol": 0},
+}
+
+_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
+
+
+def parse_test_cases():
+    """Build the nextafter TestCase list for every shape/stride/dtype combo.
+
+    Each combination yields an OUT_OF_PLACE case (no output_spec) and, unless
+    is_broadcast(c_strides) is true, an INPLACE(out) case using ``c_spec``.
+    """
+    test_cases = []
+    for data in _TEST_CASES_DATA:
+        shape = data[0]
+        a_strides = data[1] if len(data) > 1 else None
+        b_strides = data[2] if len(data) > 2 else None
+        c_strides = data[3] if len(data) > 3 else None
+
+        c_supports_inplace = not is_broadcast(c_strides)
+
+        for dtype in _TENSOR_DTYPES:
+            tol = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0})
+            a_spec = TensorSpec.from_tensor(shape, a_strides, dtype, name="a")
+            b_spec = TensorSpec.from_tensor(shape, b_strides, dtype, name="b")
+            c_spec = TensorSpec.from_tensor(shape, c_strides, dtype, name="c")
+
+            test_cases.append(
+                TestCase(
+                    inputs=[a_spec, b_spec],
+                    kwargs={},
+                    output_spec=None,
+                    comparison_target=None,
+                    tolerance=tol,
+                    description="nextafter - OUT_OF_PLACE",
+                )
+            )
+
+            if c_supports_inplace:
+                test_cases.append(
+                    TestCase(
+                        inputs=[a_spec, b_spec],
+                        kwargs=None,
+                        output_spec=c_spec,
+                        comparison_target="out",
+                        tolerance=tol,
+                        description="nextafter - INPLACE(out)",
+                    )
+                )
+
+    return test_cases
+
+
+class OpTest(BaseOperatorTest):
+    """Exposes torch.nextafter and infinicore.nextafter to the test framework."""
+
+    def __init__(self):
+        super().__init__("Nextafter")
+
+    def get_test_cases(self):
+        return parse_test_cases()
+
+    def torch_operator(self, *args, **kwargs):
+        return torch.nextafter(*args, **kwargs)
+
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nextafter(*args, **kwargs)
+
+
+def main():
+    runner = GenericTestRunner(OpTest)
+    runner.run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/rad2deg.py b/test/infinicore/ops/rad2deg.py
new file mode 100644
index 000000000..ae83bbef8
--- /dev/null
+++ b/test/infinicore/ops/rad2deg.py
@@ -0,0 +1,102 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import infinicore
+import torch
+from framework import (
+    BaseOperatorTest,
+    GenericTestRunner,
+    TensorSpec,
+    TestCase,
+    is_broadcast,  # NOTE(review): not referenced anywhere in this file
+)
+
+
+# Each entry is (shape, in_strides); None strides are handed to
+# TensorSpec.from_tensor unchanged.
+_TEST_CASES_DATA = [
+    ((13, 4), None),
+    ((13, 4), (10, 1)),
+    ((8, 16), None),
+    ((8, 16), (40, 1)),
+    ((2, 3, 4), None),
+    ((16, 5632), None),
+]
+
+# Per-dtype comparison tolerances attached to every generated TestCase.
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 1e-2, "rtol": 1e-2},
+    infinicore.float32: {"atol": 1e-5, "rtol": 1e-4},
+    infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2},
+}
+
+_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32]
+
+
+def parse_test_cases():
+    """Build the rad2deg TestCase list for every shape/stride/dtype combo.
+
+    Each combination yields an OUT_OF_PLACE case (no output_spec) and an
+    INPLACE(out) case whose output_spec is ``out_spec``.
+    """
+    test_cases = []
+    for data in _TEST_CASES_DATA:
+        shape = data[0]
+        in_strides = data[1] if len(data) > 1 else None
+
+        for dtype in _TENSOR_DTYPES:
+            tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4})
+            input_spec = TensorSpec.from_tensor(shape, in_strides, dtype)
+            # The output spec is always built with None in the strides slot.
+            out_spec = TensorSpec.from_tensor(shape, None, dtype)
+
+            test_cases.append(
+                TestCase(
+                    inputs=[input_spec],
+                    kwargs={},
+                    output_spec=None,
+                    comparison_target=None,
+                    tolerance=tol,
+                    description="rad2deg - OUT_OF_PLACE",
+                )
+            )
+
+            test_cases.append(
+                TestCase(
+                    inputs=[input_spec],
+                    kwargs=None,
+                    output_spec=out_spec,
+                    comparison_target="out",
+                    tolerance=tol,
+                    description="rad2deg - INPLACE(out)",
+                )
+            )
+
+    return test_cases
+
+
+class OpTest(BaseOperatorTest):
+    """Exposes torch.rad2deg and infinicore.rad2deg to the test framework."""
+
+    def __init__(self):
+        super().__init__("Rad2Deg")
+
+    def get_test_cases(self):
+        return parse_test_cases()
+
+    def torch_operator(self, *args, **kwargs):
+        return torch.rad2deg(*args, **kwargs)
+
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.rad2deg(*args, **kwargs)
+
+
+def main():
+    runner = GenericTestRunner(OpTest)
+    runner.run_and_exit()
+
+
+if __name__ == "__main__":
+    main()

From 05cf4bdee66acb053fd1266e9f8a1510dd78a72a Mon Sep 17 00:00:00 2001
From: mygitljf <2410316423@qq.com>
Date: Tue, 19 May 2026 17:13:58 +0000
Subject: [PATCH 2/2] [2026 spring][T1-1-1] benchmark: tolerate drivers without
 get_empty_cache_for_benchmark

Some Triton driver backends (e.g. MetaX MACA's MacaDriver) do not
implement Triton benchmark's `get_empty_cache_for_benchmark` /
`clear_cache` helpers. Calling them eagerly aborts the run before any
op is ever dispatched.

Probe the driver with `getattr` + `callable` and only install the
cache-clear hook when both helpers exist. Backends that expose them
(e.g. NVIDIA's CudaDriver) keep the original behavior; backends that
do not simply skip cache clearing - correctness and device-event
timing are unaffected.

Verification (MetaX C500, --metax):
  InfiniCore run.py --bench device --num_prerun 50 --num_iterations 1000
  for rad2deg copysign lcm nextafter lgamma:
    Total tests run: 5, Passed: 5
    [Device] PyTorch:    110695.750 ms
    [Device] InfiniCore: 108326.593 ms
    Device Speedup (PyTorch/InfiniCore): 1.022x
---
 test/infinicore/framework/benchmark.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/test/infinicore/framework/benchmark.py b/test/infinicore/framework/benchmark.py
index cc77d53de..def69fb58 100644
--- a/test/infinicore/framework/benchmark.py
+++ b/test/infinicore/framework/benchmark.py
@@ -117,10 +117,15 @@ def _clear_cache():
         if infinicore.use_ntops:
             import triton
 
-            cache = triton.runtime.driver.active.get_empty_cache_for_benchmark()
+            driver = triton.runtime.driver.active
+            get_cache = getattr(driver, "get_empty_cache_for_benchmark", None)
+            clear_cache = getattr(driver, "clear_cache", None)
 
-            def _clear_cache():
-                triton.runtime.driver.active.clear_cache(cache)
+            if callable(get_cache) and callable(clear_cache):
+                cache = get_cache()
+
+                def _clear_cache():
+                    clear_cache(cache)
 
         # Create pairs of DeviceEvents for each iteration
         start_events = [