From 71f5447652249d6f66c20ce0da419ddb2a48b11c Mon Sep 17 00:00:00 2001 From: mygitljf <2410316423@qq.com> Date: Mon, 18 May 2026 12:46:53 +0000 Subject: [PATCH 1/2] [2026 spring][T1-1-1] add rad2deg/copysign/lcm/nextafter/lgamma Python dispatch and tests Wires the five T1-1-1 operators through infinicore.ntops.torch on CUDA: - python/infinicore/ops/{rad2deg,copysign,lcm,nextafter,lgamma}.py: thin dispatchers calling infinicore.ntops.torch.<op>. - python/infinicore/__init__.py: re-export the five ops. - test/infinicore/ops/{rad2deg,copysign,lcm,nextafter,lgamma}.py: framework tests covering OUT_OF_PLACE and INPLACE(out=c) on float16/bfloat16/float32 (lcm: int8/int16/int32/int64). nextafter, copysign, lcm, lgamma run bit-exact against torch. Verified on NVIDIA A100 80GB PCIe with --nvidia (172/172 passed). --- python/infinicore/__init__.py | 10 +++ python/infinicore/ops/copysign.py | 8 +++ python/infinicore/ops/lcm.py | 8 +++ python/infinicore/ops/lgamma.py | 8 +++ python/infinicore/ops/nextafter.py | 8 +++ python/infinicore/ops/rad2deg.py | 8 +++ test/infinicore/ops/copysign.py | 96 +++++++++++++++++++++++++++ test/infinicore/ops/lcm.py | 102 +++++++++++++++++++++++++++++ test/infinicore/ops/lgamma.py | 91 +++++++++++++++++++++++++ test/infinicore/ops/nextafter.py | 96 +++++++++++++++++++++++++++ test/infinicore/ops/rad2deg.py | 91 +++++++++++++++++++++++++ 11 files changed, 526 insertions(+) create mode 100644 python/infinicore/ops/copysign.py create mode 100644 python/infinicore/ops/lcm.py create mode 100644 python/infinicore/ops/lgamma.py create mode 100644 python/infinicore/ops/nextafter.py create mode 100644 python/infinicore/ops/rad2deg.py create mode 100644 test/infinicore/ops/copysign.py create mode 100644 test/infinicore/ops/lcm.py create mode 100644 test/infinicore/ops/lgamma.py create mode 100644 test/infinicore/ops/nextafter.py create mode 100644 test/infinicore/ops/rad2deg.py diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 
8c9adc64c..48d3a6662 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -76,6 +76,7 @@ from infinicore.ops.broadcast_to import broadcast_to from infinicore.ops.cat import cat from infinicore.ops.cdist import cdist +from infinicore.ops.copysign import copysign from infinicore.ops.cross_entropy import cross_entropy from infinicore.ops.diff import diff from infinicore.ops.digamma import digamma @@ -94,8 +95,10 @@ from infinicore.ops.kron import kron from infinicore.ops.kthvalue import kthvalue from infinicore.ops.kv_caching import kv_caching +from infinicore.ops.lcm import lcm from infinicore.ops.ldexp import ldexp from infinicore.ops.lerp import lerp +from infinicore.ops.lgamma import lgamma from infinicore.ops.logaddexp import logaddexp from infinicore.ops.logaddexp2 import logaddexp2 from infinicore.ops.logcumsumexp import logcumsumexp @@ -108,10 +111,12 @@ from infinicore.ops.mha_varlen import mha_varlen from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow +from infinicore.ops.nextafter import nextafter from infinicore.ops.nrm2 import nrm2 from infinicore.ops.paged_attention import paged_attention from infinicore.ops.paged_attention_prefill import paged_attention_prefill from infinicore.ops.paged_caching import paged_caching +from infinicore.ops.rad2deg import rad2deg from infinicore.ops.rearrange import rearrange from infinicore.ops.reciprocal import reciprocal from infinicore.ops.rot import rot @@ -279,6 +284,11 @@ "var", "topk", "all", + "copysign", + "lcm", + "lgamma", + "nextafter", + "rad2deg", "set_printoptions", "printoptions", ] diff --git a/python/infinicore/ops/copysign.py b/python/infinicore/ops/copysign.py new file mode 100644 index 000000000..a0067aede --- /dev/null +++ b/python/infinicore/ops/copysign.py @@ -0,0 +1,8 @@ +import infinicore +from infinicore.tensor import Tensor + + +def copysign(input: Tensor, other: Tensor, *, out=None) -> Tensor: + r"""Computes element-wise copysign: magnitude of 
input with sign of other.""" + assert infinicore.use_ntops + return infinicore.ntops.torch.copysign(input, other, out=out) diff --git a/python/infinicore/ops/lcm.py b/python/infinicore/ops/lcm.py new file mode 100644 index 000000000..835d7721d --- /dev/null +++ b/python/infinicore/ops/lcm.py @@ -0,0 +1,8 @@ +import infinicore +from infinicore.tensor import Tensor + + +def lcm(input: Tensor, other: Tensor, *, out=None) -> Tensor: + r"""Computes element-wise least common multiple. Integer dtypes only.""" + assert infinicore.use_ntops + return infinicore.ntops.torch.lcm(input, other, out=out) diff --git a/python/infinicore/ops/lgamma.py b/python/infinicore/ops/lgamma.py new file mode 100644 index 000000000..516002d98 --- /dev/null +++ b/python/infinicore/ops/lgamma.py @@ -0,0 +1,8 @@ +import infinicore +from infinicore.tensor import Tensor + + +def lgamma(input: Tensor, *, out=None) -> Tensor: + r"""Computes element-wise natural logarithm of the absolute value of the gamma function.""" + assert infinicore.use_ntops + return infinicore.ntops.torch.lgamma(input, out=out) diff --git a/python/infinicore/ops/nextafter.py b/python/infinicore/ops/nextafter.py new file mode 100644 index 000000000..12d9b6a6c --- /dev/null +++ b/python/infinicore/ops/nextafter.py @@ -0,0 +1,8 @@ +import infinicore +from infinicore.tensor import Tensor + + +def nextafter(input: Tensor, other: Tensor, *, out=None) -> Tensor: + r"""Returns the next representable float value of input toward other, element-wise.""" + assert infinicore.use_ntops + return infinicore.ntops.torch.nextafter(input, other, out=out) diff --git a/python/infinicore/ops/rad2deg.py b/python/infinicore/ops/rad2deg.py new file mode 100644 index 000000000..98d28d487 --- /dev/null +++ b/python/infinicore/ops/rad2deg.py @@ -0,0 +1,8 @@ +import infinicore +from infinicore.tensor import Tensor + + +def rad2deg(input: Tensor, *, out=None) -> Tensor: + r"""Converts angles in radians to degrees, element-wise.""" + assert 
infinicore.use_ntops + return infinicore.ntops.torch.rad2deg(input, out=out) diff --git a/test/infinicore/ops/copysign.py b/test/infinicore/ops/copysign.py new file mode 100644 index 000000000..cf1d6a597 --- /dev/null +++ b/test/infinicore/ops/copysign.py @@ -0,0 +1,96 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import infinicore +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, + is_broadcast, +) + + +_TEST_CASES_DATA = [ + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), None), + ((16, 5632), None, None, None), +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 0, "rtol": 0}, + infinicore.float32: {"atol": 0, "rtol": 0}, + infinicore.bfloat16: {"atol": 0, "rtol": 0}, +} + +_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32] + + +def parse_test_cases(): + test_cases = [] + for data in _TEST_CASES_DATA: + shape = data[0] + a_strides = data[1] if len(data) > 1 else None + b_strides = data[2] if len(data) > 2 else None + c_strides = data[3] if len(data) > 3 else None + + c_supports_inplace = not is_broadcast(c_strides) + + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + a_spec = TensorSpec.from_tensor(shape, a_strides, dtype, name="a") + b_spec = TensorSpec.from_tensor(shape, b_strides, dtype, name="b") + c_spec = TensorSpec.from_tensor(shape, c_strides, dtype, name="c") + + test_cases.append( + TestCase( + inputs=[a_spec, b_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tol, + description="copysign - OUT_OF_PLACE", + ) + ) + + if c_supports_inplace: + test_cases.append( + TestCase( + inputs=[a_spec, b_spec], + kwargs=None, + output_spec=c_spec, + comparison_target="out", + tolerance=tol, + description="copysign - INPLACE(out)", + ) + ) + + return test_cases + + 
+class OpTest(BaseOperatorTest): + def __init__(self): + super().__init__("Copysign") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch.copysign(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.copysign(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/lcm.py b/test/infinicore/ops/lcm.py new file mode 100644 index 000000000..4278231c5 --- /dev/null +++ b/test/infinicore/ops/lcm.py @@ -0,0 +1,102 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import infinicore +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, + is_broadcast, +) + + +_TEST_CASES_DATA = [ + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), None), + ((16, 5632), None, None, None), +] + +_TOLERANCE_MAP = { + infinicore.int8: {"atol": 0, "rtol": 0}, + infinicore.int16: {"atol": 0, "rtol": 0}, + infinicore.int32: {"atol": 0, "rtol": 0}, + infinicore.int64: {"atol": 0, "rtol": 0}, +} + +_TENSOR_DTYPES = [ + infinicore.int8, + infinicore.int16, + infinicore.int32, + infinicore.int64, +] + + +def parse_test_cases(): + test_cases = [] + for data in _TEST_CASES_DATA: + shape = data[0] + a_strides = data[1] if len(data) > 1 else None + b_strides = data[2] if len(data) > 2 else None + c_strides = data[3] if len(data) > 3 else None + + c_supports_inplace = not is_broadcast(c_strides) + + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0}) + a_spec = TensorSpec.from_tensor(shape, a_strides, dtype, name="a") + b_spec = TensorSpec.from_tensor(shape, b_strides, dtype, name="b") + c_spec = TensorSpec.from_tensor(shape, c_strides, dtype, name="c") + + 
test_cases.append( + TestCase( + inputs=[a_spec, b_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tol, + description="lcm - OUT_OF_PLACE", + ) + ) + + if c_supports_inplace: + test_cases.append( + TestCase( + inputs=[a_spec, b_spec], + kwargs=None, + output_spec=c_spec, + comparison_target="out", + tolerance=tol, + description="lcm - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + def __init__(self): + super().__init__("Lcm") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch.lcm(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.lcm(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/lgamma.py b/test/infinicore/ops/lgamma.py new file mode 100644 index 000000000..9bd6b5cf0 --- /dev/null +++ b/test/infinicore/ops/lgamma.py @@ -0,0 +1,91 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import infinicore +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, + is_broadcast, +) + + +_TEST_CASES_DATA = [ + ((13, 4), None), + ((13, 4), (10, 1)), + ((8, 16), None), + ((8, 16), (40, 1)), + ((2, 3, 4), None), + ((16, 5632), None), +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-2, "rtol": 1e-2}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, + infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2}, +} + +_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32] + + +def parse_test_cases(): + test_cases = [] + for data in _TEST_CASES_DATA: + shape = data[0] + in_strides = data[1] if len(data) > 1 else None + + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + input_spec = TensorSpec.from_tensor(shape, in_strides, 
dtype) + out_spec = TensorSpec.from_tensor(shape, None, dtype) + + test_cases.append( + TestCase( + inputs=[input_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tol, + description="lgamma - OUT_OF_PLACE", + ) + ) + + test_cases.append( + TestCase( + inputs=[input_spec], + kwargs=None, + output_spec=out_spec, + comparison_target="out", + tolerance=tol, + description="lgamma - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + def __init__(self): + super().__init__("Lgamma") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch.lgamma(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.lgamma(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/nextafter.py b/test/infinicore/ops/nextafter.py new file mode 100644 index 000000000..68ed8ecc1 --- /dev/null +++ b/test/infinicore/ops/nextafter.py @@ -0,0 +1,96 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import infinicore +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, + is_broadcast, +) + + +_TEST_CASES_DATA = [ + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), None), + ((16, 5632), None, None, None), +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 0, "rtol": 0}, + infinicore.float32: {"atol": 0, "rtol": 0}, + infinicore.bfloat16: {"atol": 0, "rtol": 0}, +} + +_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32] + + +def parse_test_cases(): + test_cases = [] + for data in _TEST_CASES_DATA: + shape = data[0] + a_strides = data[1] if len(data) > 1 else None + b_strides = data[2] if len(data) > 2 else None + 
c_strides = data[3] if len(data) > 3 else None + + c_supports_inplace = not is_broadcast(c_strides) + + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 0}) + a_spec = TensorSpec.from_tensor(shape, a_strides, dtype, name="a") + b_spec = TensorSpec.from_tensor(shape, b_strides, dtype, name="b") + c_spec = TensorSpec.from_tensor(shape, c_strides, dtype, name="c") + + test_cases.append( + TestCase( + inputs=[a_spec, b_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tol, + description="nextafter - OUT_OF_PLACE", + ) + ) + + if c_supports_inplace: + test_cases.append( + TestCase( + inputs=[a_spec, b_spec], + kwargs=None, + output_spec=c_spec, + comparison_target="out", + tolerance=tol, + description="nextafter - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + def __init__(self): + super().__init__("Nextafter") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch.nextafter(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.nextafter(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/rad2deg.py b/test/infinicore/ops/rad2deg.py new file mode 100644 index 000000000..ae83bbef8 --- /dev/null +++ b/test/infinicore/ops/rad2deg.py @@ -0,0 +1,91 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import infinicore +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorSpec, + TestCase, + is_broadcast, +) + + +_TEST_CASES_DATA = [ + ((13, 4), None), + ((13, 4), (10, 1)), + ((8, 16), None), + ((8, 16), (40, 1)), + ((2, 3, 4), None), + ((16, 5632), None), +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 1e-2, "rtol": 1e-2}, + infinicore.float32: {"atol": 1e-5, "rtol": 1e-4}, + 
infinicore.bfloat16: {"atol": 1e-2, "rtol": 5e-2}, +} + +_TENSOR_DTYPES = [infinicore.float16, infinicore.bfloat16, infinicore.float32] + + +def parse_test_cases(): + test_cases = [] + for data in _TEST_CASES_DATA: + shape = data[0] + in_strides = data[1] if len(data) > 1 else None + + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP.get(dtype, {"atol": 1e-5, "rtol": 1e-4}) + input_spec = TensorSpec.from_tensor(shape, in_strides, dtype) + out_spec = TensorSpec.from_tensor(shape, None, dtype) + + test_cases.append( + TestCase( + inputs=[input_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tol, + description="rad2deg - OUT_OF_PLACE", + ) + ) + + test_cases.append( + TestCase( + inputs=[input_spec], + kwargs=None, + output_spec=out_spec, + comparison_target="out", + tolerance=tol, + description="rad2deg - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + def __init__(self): + super().__init__("Rad2Deg") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch.rad2deg(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.rad2deg(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() From 05cf4bdee66acb053fd1266e9f8a1510dd78a72a Mon Sep 17 00:00:00 2001 From: mygitljf <2410316423@qq.com> Date: Tue, 19 May 2026 17:13:58 +0000 Subject: [PATCH 2/2] [2026 spring][T1-1-1] benchmark: tolerate drivers without get_empty_cache_for_benchmark Some Triton driver backends (e.g. MetaX MACA's MacaDriver) do not implement Triton benchmark's `get_empty_cache_for_benchmark` / `clear_cache` helpers. Calling them eagerly aborts the run before any op is ever dispatched. Probe the driver with `getattr` + `callable` and only install the cache-clear hook when both helpers exist. Backends that expose them (e.g. 
NVIDIA's CudaDriver) keep the original behavior; backends that do not simply skip cache clearing - correctness and device-event timing are unaffected. Verification (MetaX C500, --metax): InfiniCore run.py --bench device --num_prerun 50 --num_iterations 1000 for rad2deg copysign lcm nextafter lgamma: Total tests run: 5, Passed: 5 [Device] PyTorch: 110695.750 ms [Device] InfiniCore: 108326.593 ms Device Speedup (PyTorch/InfiniCore): 1.022x --- test/infinicore/framework/benchmark.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/infinicore/framework/benchmark.py b/test/infinicore/framework/benchmark.py index cc77d53de..def69fb58 100644 --- a/test/infinicore/framework/benchmark.py +++ b/test/infinicore/framework/benchmark.py @@ -117,10 +117,15 @@ def _clear_cache(): if infinicore.use_ntops: import triton - cache = triton.runtime.driver.active.get_empty_cache_for_benchmark() + driver = triton.runtime.driver.active + get_cache = getattr(driver, "get_empty_cache_for_benchmark", None) + clear_cache = getattr(driver, "clear_cache", None) - def _clear_cache(): - triton.runtime.driver.active.clear_cache(cache) + if callable(get_cache) and callable(clear_cache): + cache = get_cache() + + def _clear_cache(): + clear_cache(cache) # Create pairs of DeviceEvents for each iteration start_events = [