diff --git a/HONOR_CODE.md b/HONOR_CODE.md new file mode 100644 index 0000000..a375b70 --- /dev/null +++ b/HONOR_CODE.md @@ -0,0 +1,72 @@ +# 2026 春季启元人工智能大赛诚信守则(Honor Code) + + +本人作为 2026 春季启元人工智能大赛(以下简称“比赛”)的参赛选手,郑重承诺严格遵守比赛规则及本诚信守则,秉持诚信、公正、廉洁的参赛原则,自觉维护比赛的公平性与严肃性。本人充分理解并认可,违反本准则将导致参赛资格被取消、比赛成绩作废等相应后果,且愿意承担由此产生的一切责任。 + +## 一、参赛诚信承诺 + +1. 本人保证所提交的赛题PR(Pull Request)中包含的算子实现代码及相关文档,均为本人(及参赛团队,如为团队参赛)在比赛期间独立完成或在明确标注参考来源的基础上进行开发,不存在任何欺诈、抄袭、作弊行为。 + +2. 本人承诺主动、全面、真实地披露赛题实现过程中所有参考的外部资源,尤其是开源代码资源,不隐瞒任何可能影响比赛公平性的信息。 + +3. 本人保证不采用任何不正当手段获取比赛优势,包括但不限于窃取其他参赛选手的代码成果、利用非比赛允许的工具或技术、与他人串通作弊等。 + +## 二、参考资源说明 + +本人确认已按比赛要求,将本次赛题实现过程中涉及的参考资源信息单独撰写至`REFERENCE.md`文件中,该文件将与本诚信守则一同作为PR附件提交。`REFERENCE.md`需根据实际参考情况,按以下要求完整填写,信息不完整或虚假填写将视为违反本准则: + +**情况1:无参考外部开源代码及核心实现思路** + +`REFERENCE.md`中需明确声明:“本次赛题提交的算子代码、核心算法逻辑及实现方案均为本人(及参赛团队)独立设计与开发,未参考任何外部开源项目、技术文档中的核心代码片段或实现思路,未接受任何第三方的技术指导或代码支持。” + +**情况2:有参考外部开源代码及相关资源** + +对每个参考资源提供以下信息陈述: +1. 参考开源项目/资源名称 + +2. 参考资源链接(GitHub/Gitee/论文/技术文档等) + +3. 参考的具体内容(请明确说明参考的代码片段、算法逻辑、实现思路等,需标注对应资源的具体位置,如文件路径、代码行数等) + +4. 本人对参考内容的修改与优化说明:(请详细说明在参考基础上,本人所做的独立开发、修改、优化工作,体现自身技术贡献) + +5. 若是开源项目,提供参考资源的开源协议类型:(如MIT、Apache 2.0、GPL等) + +6. 其他需要补充说明的信息 + + +## 三、禁止行为确认 + +本人明确知晓并承诺避免以下违反比赛公平性的行为,若存在以下任一情况,自愿接受比赛组委会的相应处罚: + +1. 未经授权复制、抄袭他人(包括其他参赛选手、开源项目、商业代码)的代码、算法或技术方案,且未进行明确标注; + +2. 隐瞒或虚假披露参考资源信息,包括遗漏重要参考来源、伪造参考内容说明等; + +3. 与其他参赛选手或第三方串通,进行代码共享、成果交换等违规协作; + +4. 利用比赛平台漏洞、技术缺陷或非比赛允许的工具获取不正当利益; + +5. 伪造比赛相关证明材料、提交虚假信息; + +6. 其他违反比赛规则及公序良俗的不诚信行为。 + + +## 四、责任与确认 + +1. 本人充分理解,比赛组委会将对所有提交的PR进行代码溯源、参考信息核查等公平性审查,若发现本人存在违反本准则的行为,有权随时取消本人的参赛资格、作废比赛成绩,情节严重的将在比赛相关平台进行公示。 + +2. 若因本人违反本准则导致比赛争议或第三方权益受损(如开源协议侵权等),本人将独立承担全部法律责任及相关损失,与比赛组委会无关。 + +3. 
本人确认已仔细阅读并完全理解本诚信守则的全部内容,自愿签署本准则,接受比赛组委会的监督与审查。 + +## 五、签署信息 + +参赛选手姓名(团队参赛需填写所有成员姓名) + + 练锦烽 + +签署日期 + +___2026___年__5__月__18__日 + diff --git a/src/ntops/kernels/__init__.py b/src/ntops/kernels/__init__.py index f6934ef..8d0c187 100644 --- a/src/ntops/kernels/__init__.py +++ b/src/ntops/kernels/__init__.py @@ -9,6 +9,7 @@ bmm, clamp, conv2d, + copysign, cos, div, dropout, @@ -20,14 +21,18 @@ isinf, isnan, layer_norm, + lcm, le, + lgamma, lt, max_pool2d, mm, mul, ne, neg, + nextafter, pow, + rad2deg, relu, rms_norm, rotary_position_embedding, @@ -52,6 +57,7 @@ "bmm", "clamp", "conv2d", + "copysign", "cos", "div", "dropout", @@ -63,14 +69,18 @@ "isinf", "isnan", "layer_norm", + "lcm", "le", + "lgamma", "lt", "max_pool2d", "mm", "mul", "ne", "neg", + "nextafter", "pow", + "rad2deg", "relu", "rms_norm", "rotary_position_embedding", diff --git a/src/ntops/kernels/copysign.py b/src/ntops/kernels/copysign.py new file mode 100644 index 0000000..ef5d1bf --- /dev/null +++ b/src/ntops/kernels/copysign.py @@ -0,0 +1,68 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application_int16(input, other, output): + # Pure bit manipulation: take magnitude bits of input, sign bit of other. + # Avoids the fp16/bf16 -> fp32 -> fp16/bf16 round-trip required by + # libdevice.copysign, which doesn't support narrow floats. 
+ dtype = output.dtype + int_dtype = ntl.int16 # integer type used to view the 16-bit float bits + + input_bits = ntl.cast(input, int_dtype, bitcast=True) + other_bits = ntl.cast(other, int_dtype, bitcast=True) + sign_bit = ntl.cast(1, int_dtype) << 15 # mask for the sign bit (bit 15) + magn_mask = sign_bit - ntl.cast(1, int_dtype) # intended as bits 0-14; assumes the int16 subtraction wraps (TODO confirm) + output = ntl.cast( # noqa: F841 + (input_bits & magn_mask) | (other_bits & sign_bit), dtype, bitcast=True + ) + + +def application_int32(input, other, output): # same masking as application_int16 on a 32-bit view + dtype = output.dtype + int_dtype = ntl.int32 + + input_bits = ntl.cast(input, int_dtype, bitcast=True) + other_bits = ntl.cast(other, int_dtype, bitcast=True) + sign_bit = ntl.cast(1, int_dtype) << 31 # mask for the sign bit (bit 31) + magn_mask = sign_bit - ntl.cast(1, int_dtype) # intended as bits 0-30 + output = ntl.cast( # noqa: F841 + (input_bits & magn_mask) | (other_bits & sign_bit), dtype, bitcast=True + ) + + +def application_int64(input, other, output): # same masking as application_int16 on a 64-bit view + dtype = output.dtype + int_dtype = ntl.int64 + + input_bits = ntl.cast(input, int_dtype, bitcast=True) + other_bits = ntl.cast(other, int_dtype, bitcast=True) + sign_bit = ntl.cast(1, int_dtype) << 63 # mask for the sign bit (bit 63) + magn_mask = sign_bit - ntl.cast(1, int_dtype) # intended as bits 0-62 + output = ntl.cast( # noqa: F841 + (input_bits & magn_mask) | (other_bits & sign_bit), dtype, bitcast=True + ) + + +def premake(ndim, dtype=None, block_size=None): # returns (arrangement, application, tensors) for copysign + arrangement_ = functools.partial(arrangement, block_size=block_size) + + if dtype in (ninetoothed.float16, ninetoothed.bfloat16): + application = application_int16 + elif dtype == ninetoothed.float32: + application = application_int32 + else: # any other dtype, including None + application = application_int64 + + tensors = ( # three tensors of identical ndim and dtype + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/lcm.py b/src/ntops/kernels/lcm.py new file mode 100644 index 0000000..3d05521 --- /dev/null +++ b/src/ntops/kernels/lcm.py @@ -0,0 +1,184 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor +from ninetoothed.language import libdevice + 
+from ntops.kernels.element_wise import arrangement + + +# T1-1-1 lcm: dtype-dispatched algorithm. +# +# int8/int16/int32 -> Stein binary GCD (no IDIV, cheap per-iter). +# int64 -> Dynamic Euclidean with grouped block-level early stop. +# +# Why two algorithms: +# Stein per-iter on A100: ~14 us at BLOCK=512/warps=8 (no IDIV; just +# ffs+shift+sub+min+abs + 1 where). +# Euclidean per-iter on A100: ~14 us at BLOCK=32/warps=1 (1 IDIV + +# 2 wheres; the int64 IDIV ~30 cycles, but BLOCK=32/warps=1 means +# 1 element per thread which maximizes the number of concurrent +# in-flight IDIVs across SMs). +# At BLOCK=512/warps=8 (= 2 elements per thread), Euclidean is ~43 +# us per iter because each thread's two dependent IDIV chains block +# each other -> 3x slower. The (32, 1) config is critical. +# For int8/16/32, Stein static unroll is unbeatable (no IDIV, no dynamic +# check overhead). For int64 with v2-style small inputs (values <= +# ~2^20), Euclidean dynamic averages ~14 outer iters vs Stein's +# fixed 60, giving ~4x speedup. +# +# Stein unroll counts (worst-case empirically validated): +# int8 (value range <= 127): max 5 -> use 8 +# int16 (value range <= 32767): max 13 -> use 16 +# int32 (value range <= 2^31): max 31 -> use 36 +# +# Euclidean (int64) uses grouped dynamic stop: +# outer cap = 12, inner unroll = 8 -> max 96 Euclidean iters. +# Block-level `ntl.max(b) != 0` check every 8 inner iters. +# N=96 covers Fibonacci adversarial worst case (~91 iters) for full +# int64 range. +# +# Sentinel-merge (iter05): one `where` per Stein iter using `a` (always +# odd, always >= 1) as the `b == 0` sentinel. +# +# History: +# iter05: sentinel-merge + flat 1D + explicit (512, warps_per_dt, 1) +# iter06: int64 Stein 72 -> 64 +# iter07: int64 Stein 64 -> 60 (1M+ sample empirical worst case = 57) +# iter08: int64 switch Stein -> dynamic Euclidean at (BLOCK=32, warps=1). 
+# Trade-off: int64 v2-style small-input launches ~2-4x faster +# than Stein; full-range int64 launches ~1.9x slower. Other +# dtypes unchanged. +def application_8(input, other, output): # Stein (binary) GCD with 8 fixed iterations, then the lcm + dtype = output.dtype + compute_dtype = ntl.int32 # all arithmetic below is done in int32 + abs_a = ntl.abs(ntl.cast(input, compute_dtype)) + abs_b = ntl.abs(ntl.cast(other, compute_dtype)) + or_ab = abs_a | abs_b # zero only when both inputs are zero + safe_or = ntl.where(or_ab != 0, or_ab, 1) # 1 stands in when both inputs are zero + k = ntl.cast(libdevice.ffs(safe_or) - 1, compute_dtype) # trailing zero bits shared by both magnitudes (ffs is 1-based) + a0 = abs_a >> k + b0 = abs_b >> k + nonzero_a0 = a0 != 0 + safe_a0 = ntl.where(nonzero_a0, a0, 1) + ctz_a0 = ntl.cast(libdevice.ffs(safe_a0) - 1, compute_dtype) + a = ntl.where(nonzero_a0, a0 >> ctz_a0, b0) # a0 with its trailing zeros removed, or b0 when a0 is zero + b = ntl.where(nonzero_a0, b0, ntl.cast(0, compute_dtype)) + a = ntl.where(a == 0, ntl.cast(1, compute_dtype), a) # keeps a nonzero when both inputs are zero + for _ in range(8): + b_for_calc = ntl.where(b != 0, b, a) # once b is zero, a stands in and the step leaves a and b unchanged + ctz_b = ntl.cast(libdevice.ffs(b_for_calc) - 1, compute_dtype) + b_odd = b_for_calc >> ctz_b + diff = b_odd - a + a = ntl.minimum(a, b_odd) + b = ntl.abs(diff) + gcd = a << k # put the shared power of two back + safe_gcd = ntl.where(gcd == 0, 1, gcd) + output = ntl.cast( # noqa: F841 + ntl.where(or_ab == 0, 0, ntl.abs((abs_a // safe_gcd) * abs_b)), dtype + ) + + +def application_16(input, other, output): # same steps as application_8, with 16 loop iterations + dtype = output.dtype + compute_dtype = ntl.int32 + abs_a = ntl.abs(ntl.cast(input, compute_dtype)) + abs_b = ntl.abs(ntl.cast(other, compute_dtype)) + or_ab = abs_a | abs_b + safe_or = ntl.where(or_ab != 0, or_ab, 1) + k = ntl.cast(libdevice.ffs(safe_or) - 1, compute_dtype) + a0 = abs_a >> k + b0 = abs_b >> k + nonzero_a0 = a0 != 0 + safe_a0 = ntl.where(nonzero_a0, a0, 1) + ctz_a0 = ntl.cast(libdevice.ffs(safe_a0) - 1, compute_dtype) + a = ntl.where(nonzero_a0, a0 >> ctz_a0, b0) + b = ntl.where(nonzero_a0, b0, ntl.cast(0, compute_dtype)) + a = ntl.where(a == 0, ntl.cast(1, compute_dtype), a) + for _ in range(16): + b_for_calc = ntl.where(b != 0, b, a) + ctz_b = ntl.cast(libdevice.ffs(b_for_calc) - 1, compute_dtype) + b_odd = b_for_calc >> ctz_b + diff = b_odd - a 
+ a = ntl.minimum(a, b_odd) + b = ntl.abs(diff) + gcd = a << k # put the shared power of two back + safe_gcd = ntl.where(gcd == 0, 1, gcd) + output = ntl.cast( # noqa: F841 + ntl.where(or_ab == 0, 0, ntl.abs((abs_a // safe_gcd) * abs_b)), dtype + ) + + +def application_36(input, other, output): # same steps as application_8, with 36 loop iterations + dtype = output.dtype + compute_dtype = ntl.int32 + abs_a = ntl.abs(ntl.cast(input, compute_dtype)) + abs_b = ntl.abs(ntl.cast(other, compute_dtype)) + or_ab = abs_a | abs_b # zero only when both inputs are zero + safe_or = ntl.where(or_ab != 0, or_ab, 1) + k = ntl.cast(libdevice.ffs(safe_or) - 1, compute_dtype) # trailing zero bits shared by both magnitudes (ffs is 1-based) + a0 = abs_a >> k + b0 = abs_b >> k + nonzero_a0 = a0 != 0 + safe_a0 = ntl.where(nonzero_a0, a0, 1) + ctz_a0 = ntl.cast(libdevice.ffs(safe_a0) - 1, compute_dtype) + a = ntl.where(nonzero_a0, a0 >> ctz_a0, b0) + b = ntl.where(nonzero_a0, b0, ntl.cast(0, compute_dtype)) + a = ntl.where(a == 0, ntl.cast(1, compute_dtype), a) + for _ in range(36): + b_for_calc = ntl.where(b != 0, b, a) # once b is zero, a stands in and the step leaves a and b unchanged + ctz_b = ntl.cast(libdevice.ffs(b_for_calc) - 1, compute_dtype) + b_odd = b_for_calc >> ctz_b + diff = b_odd - a + a = ntl.minimum(a, b_odd) + b = ntl.abs(diff) + gcd = a << k + safe_gcd = ntl.where(gcd == 0, 1, gcd) + output = ntl.cast( # noqa: F841 + ntl.where(or_ab == 0, 0, ntl.abs((abs_a // safe_gcd) * abs_b)), dtype + ) + + +def application_euclidean_dyn(input, other, output): + # Dynamic Euclidean for int64. + # Block-level early stop every 8 inner iters; outer cap 12 -> N=96. + # Convergence for random uniform full-range int64 averages ~36 iters; + # for v2-style range (<=2^20) averages ~14 iters. 
+ dtype = output.dtype + abs_a = ntl.abs(input) + abs_b = ntl.abs(other) + or_ab = abs_a | abs_b # zero only when both inputs are zero + a = ntl.where(abs_a >= abs_b, abs_a, abs_b) # larger magnitude of the pair + b = ntl.where(abs_a >= abs_b, abs_b, abs_a) # smaller magnitude of the pair + outer = 0 + while ntl.max(b) != 0 and outer < 12: # ends when max(b) is zero or after 12 passes of 8 steps + for _ in range(8): + b_safe = ntl.where(b != 0, b, ntl.cast(1, dtype)) # 1 replaces a zero divisor + r = a % b_safe + a = ntl.where(b != 0, b, a) # elements whose b is already zero keep their a + b = r + outer += 1 + gcd = a + safe_gcd = ntl.where(gcd == 0, 1, gcd) + output = ntl.cast( # noqa: F841 + ntl.where(or_ab == 0, 0, ntl.abs((abs_a // safe_gcd) * abs_b)), dtype + ) + + +def premake(ndim, dtype=None, block_size=None): # returns (arrangement, application, tensors) for lcm + arrangement_ = functools.partial(arrangement, block_size=block_size) + if dtype == ninetoothed.int64: + application = application_euclidean_dyn + elif dtype == ninetoothed.int32: + application = application_36 + elif dtype == ninetoothed.int16: + application = application_16 + else: # every remaining dtype, including None + application = application_8 + tensors = ( # three tensors of identical ndim and dtype + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + ) + return arrangement_, application, tensors diff --git a/src/ntops/kernels/lgamma.py b/src/ntops/kernels/lgamma.py new file mode 100644 index 0000000..920635d --- /dev/null +++ b/src/ntops/kernels/lgamma.py @@ -0,0 +1,38 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor +from ninetoothed.language import libdevice + +from ntops.kernels.element_wise import arrangement + + +def application_float32_compute(input, output): + # libdevice.lgamma only supports fp32/fp64; cast narrower floats up. 
+ dtype = output.dtype + output = ntl.cast( # noqa: F841 + libdevice.lgamma(ntl.cast(input, ntl.float32)), + dtype, + ) + + +def application_native(input, output): # passes input to libdevice.lgamma without widening it + dtype = output.dtype + output = ntl.cast( # noqa: F841 + libdevice.lgamma(input), + dtype, + ) + + +def premake(ndim, dtype=None, block_size=None): # returns (arrangement, application, tensors) for lgamma + arrangement_ = functools.partial(arrangement, block_size=block_size) + + if dtype in (ninetoothed.float16, ninetoothed.bfloat16): + application = application_float32_compute + else: # every other dtype, including None + application = application_native + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/nextafter.py b/src/ntops/kernels/nextafter.py new file mode 100644 index 0000000..4194eb0 --- /dev/null +++ b/src/ntops/kernels/nextafter.py @@ -0,0 +1,138 @@ +import functools + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application_int16(input, other, output): + # PyTorch nextafter spec, implemented via IEEE bit manipulation: + # if either is NaN: result is NaN + # if a == b: result is b (preserves sign of zero) + # if a == 0: result is smallest subnormal with sign of b + # otherwise: walk one ULP toward b in IEEE bit space + dtype = output.dtype + int_dtype = ntl.int16 + + a = input + b = other + a_cmp = ntl.cast(a, ntl.float32) # float32 copies are used for every comparison below + b_cmp = ntl.cast(b, ntl.float32) + a_i = ntl.cast(a, int_dtype, bitcast=True) # raw bit patterns, used for the stepping + b_i = ntl.cast(b, int_dtype, bitcast=True) + + one = ntl.cast(1, int_dtype) + zero = ntl.cast(0, int_dtype) + sign_bit = one << 15 + + is_nan = (a_cmp != a_cmp) | (b_cmp != b_cmp) # x != x holds only for NaN + eq = a_cmp == b_cmp + is_zero = a_cmp == ntl.cast(0, ntl.float32) + + b_sign = b_i & sign_bit + zero_result = b_sign | one # lowest magnitude bit combined with the sign of b + + a_neg = a_i < zero # true when the sign bit of a is set + a_lt_b = a_cmp < b_cmp + step_up = a_neg ^ a_lt_b # incrementing the bits grows the magnitude, so the direction flips for negative a + step = ntl.where(step_up, one, -one) + general = a_i + step + + nan_bits = ntl.cast(ntl.cast(float("nan"), 
dtype), int_dtype, bitcast=True) + result_i = ntl.where( + is_nan, + nan_bits, + ntl.where(eq, b_i, ntl.where(is_zero, zero_result, general)), # checked in order: a == b, a == 0, otherwise the one-step result + ) + output = ntl.cast(result_i, dtype, bitcast=True) # noqa: F841 + + +def application_int32(input, other, output): # same selection as application_int16, comparing a and b directly, int32 bit view + dtype = output.dtype + int_dtype = ntl.int32 + + a = input + b = other + a_i = ntl.cast(a, int_dtype, bitcast=True) + b_i = ntl.cast(b, int_dtype, bitcast=True) + + one = ntl.cast(1, int_dtype) + zero = ntl.cast(0, int_dtype) + sign_bit = one << 31 + + is_nan = (a != a) | (b != b) # x != x holds only for NaN + eq = a == b + is_zero = a == ntl.cast(0, dtype) + + b_sign = b_i & sign_bit + zero_result = b_sign | one # lowest magnitude bit combined with the sign of b + + a_neg = a_i < zero # true when the sign bit of a is set + a_lt_b = a < b + step_up = a_neg ^ a_lt_b # incrementing the bits grows the magnitude, so the direction flips for negative a + step = ntl.where(step_up, one, -one) + general = a_i + step + + nan_bits = ntl.cast(ntl.cast(float("nan"), dtype), int_dtype, bitcast=True) + result_i = ntl.where( + is_nan, + nan_bits, + ntl.where(eq, b_i, ntl.where(is_zero, zero_result, general)), + ) + output = ntl.cast(result_i, dtype, bitcast=True) # noqa: F841 + + +def application_int64(input, other, output): # same as application_int32 with an int64 bit view + dtype = output.dtype + int_dtype = ntl.int64 + + a = input + b = other + a_i = ntl.cast(a, int_dtype, bitcast=True) + b_i = ntl.cast(b, int_dtype, bitcast=True) + + one = ntl.cast(1, int_dtype) + zero = ntl.cast(0, int_dtype) + sign_bit = one << 63 + + is_nan = (a != a) | (b != b) + eq = a == b + is_zero = a == ntl.cast(0, dtype) + + b_sign = b_i & sign_bit + zero_result = b_sign | one + + a_neg = a_i < zero + a_lt_b = a < b + step_up = a_neg ^ a_lt_b + step = ntl.where(step_up, one, -one) + general = a_i + step + + nan_bits = ntl.cast(ntl.cast(float("nan"), dtype), int_dtype, bitcast=True) + result_i = ntl.where( + is_nan, + nan_bits, + ntl.where(eq, b_i, ntl.where(is_zero, zero_result, general)), + ) + output = ntl.cast(result_i, dtype, bitcast=True) # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): # returns (arrangement, application, tensors) for nextafter + arrangement_ = functools.partial(arrangement, block_size=block_size) + + if 
dtype in (ninetoothed.float16, ninetoothed.bfloat16): + application = application_int16 + elif dtype == ninetoothed.float32: + application = application_int32 + else: # any other dtype, including None + application = application_int64 + + tensors = ( # three tensors of identical ndim and dtype + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + Tensor(ndim, dtype=dtype), + ) + + return arrangement_, application, tensors diff --git a/src/ntops/kernels/rad2deg.py b/src/ntops/kernels/rad2deg.py new file mode 100644 index 0000000..9371f22 --- /dev/null +++ b/src/ntops/kernels/rad2deg.py @@ -0,0 +1,17 @@ +import functools + +from ninetoothed import Tensor + +from ntops.kernels.element_wise import arrangement + + +def application(input, output): # the constant below is 180 / pi, i.e. degrees per radian + output = input * 57.29577951308232 # noqa: F841 + + +def premake(ndim, dtype=None, block_size=None): # returns (arrangement, application, tensors) for rad2deg + arrangement_ = functools.partial(arrangement, block_size=block_size) + + tensors = (Tensor(ndim, dtype=dtype), Tensor(ndim, dtype=dtype)) + + return arrangement_, application, tensors diff --git a/src/ntops/torch/__init__.py b/src/ntops/torch/__init__.py index 82fc596..5732b72 100644 --- a/src/ntops/torch/__init__.py +++ b/src/ntops/torch/__init__.py @@ -8,6 +8,7 @@ from ntops.torch.bmm import bmm from ntops.torch.clamp import clamp from ntops.torch.conv2d import conv2d +from ntops.torch.copysign import copysign from ntops.torch.cos import cos from ntops.torch.div import div from ntops.torch.dropout import dropout @@ -19,7 +20,9 @@ from ntops.torch.isinf import isinf from ntops.torch.isnan import isnan from ntops.torch.layer_norm import layer_norm +from ntops.torch.lcm import lcm from ntops.torch.le import le +from ntops.torch.lgamma import lgamma from ntops.torch.lt import lt from ntops.torch.matmul import matmul from ntops.torch.max_pool2d import max_pool2d @@ -27,7 +30,9 @@ from ntops.torch.mul import mul from ntops.torch.ne import ne from ntops.torch.neg import neg +from ntops.torch.nextafter import nextafter from ntops.torch.pow import pow +from ntops.torch.rad2deg import rad2deg from 
ntops.torch.relu import relu from ntops.torch.rms_norm import rms_norm from ntops.torch.rotary_position_embedding import rotary_position_embedding @@ -51,6 +56,7 @@ "bmm", "clamp", "conv2d", + "copysign", "cos", "div", "dropout", @@ -62,7 +68,9 @@ "isinf", "isnan", "layer_norm", + "lcm", "le", + "lgamma", "lt", "matmul", "max_pool2d", @@ -70,7 +78,9 @@ "mul", "ne", "neg", + "nextafter", "pow", + "rad2deg", "relu", "rms_norm", "rotary_position_embedding", diff --git a/src/ntops/torch/copysign.py b/src/ntops/torch/copysign.py new file mode 100644 index 0000000..ae0bc40 --- /dev/null +++ b/src/ntops/torch/copysign.py @@ -0,0 +1,43 @@ +import torch + +import ntops +from ntops.torch.utils import ( + _cached_make, + _is_corex_compat_device, + _torch_binary_fallback, +) + + +def copysign(input, other, *, out=None): # writes the result into out (allocated here when None) and returns it + if out is None: + out = torch.empty_like(input) # result takes the shape and dtype of input + + if input.dtype in (torch.float16, torch.bfloat16) and _is_corex_compat_device( + input.device + ): + _torch_binary_fallback("copysign", input, other, out) # 16-bit floats on a CoreX-compatible device use torch.copysign + return out + + kernel = _cached_make( + ntops.kernels.copysign.premake, + input.ndim, + dtype=_to_nt(input.dtype), + block_size=1024, + num_warps=4, + ) + + kernel(input, other, out) + + return out + + +def _to_nt(torch_dtype): # maps a torch float dtype to its ninetoothed counterpart + import ninetoothed + + mapping = { + torch.float16: ninetoothed.float16, + torch.bfloat16: ninetoothed.bfloat16, + torch.float32: ninetoothed.float32, + torch.float64: ninetoothed.float64, + } + return mapping.get(torch_dtype) # None for any dtype not listed above diff --git a/src/ntops/torch/lcm.py b/src/ntops/torch/lcm.py new file mode 100644 index 0000000..d8d9b19 --- /dev/null +++ b/src/ntops/torch/lcm.py @@ -0,0 +1,69 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +# iter08: int64 dispatched to dynamic Euclidean kernel, which is fastest +# at (BLOCK=32, num_warps=1) -- 1 element per thread maximizes concurrent +# in-flight IDIVs across SMs. See kernels/lcm.py header comment. 
+# +# int8/int16/int32 still use Stein static unroll (no IDIV; (512, warps, 1) +# is best per iter05's explore_config.py scan). +_NUM_STAGES = 1 # num_stages value that lcm forwards to _cached_make + + +def _block_size_for(torch_dtype): # block size per dtype: 32 for int64, 512 otherwise + if torch_dtype == torch.int64: + return 32 + return 512 + + +def _num_warps_for(torch_dtype): # warp count per dtype: 1 for int64, 4 otherwise + if torch_dtype == torch.int64: + return 1 + return 4 + + +def lcm(input, other, *, out=None): # writes the result into out (allocated here when None) and returns it + if out is None: + out = torch.empty_like(input) # result takes the shape and dtype of input + + if ( + input.ndim != 1 + and input.is_contiguous() + and other.is_contiguous() + and out.is_contiguous() + ): + n = input.numel() # contiguous non-1D operands reach the kernel as flat 1D views + in_view = input.view([n]) + other_view = other.view([n]) + out_view = out.view([n]) + else: # already 1D, or some operand is not contiguous: pass the tensors through as they are + in_view = input + other_view = other + out_view = out + + kernel = _cached_make( + ntops.kernels.lcm.premake, + in_view.ndim, + dtype=_to_nt(input.dtype), + block_size=_block_size_for(input.dtype), + num_warps=_num_warps_for(input.dtype), + num_stages=_NUM_STAGES, + ) + + kernel(in_view, other_view, out_view) + + return out + + +def _to_nt(torch_dtype): # maps a torch integer dtype to its ninetoothed counterpart + import ninetoothed + mapping = { + torch.int8: ninetoothed.int8, + torch.int16: ninetoothed.int16, + torch.int32: ninetoothed.int32, + torch.int64: ninetoothed.int64, + } + return mapping.get(torch_dtype) # None for any dtype not listed above diff --git a/src/ntops/torch/lgamma.py b/src/ntops/torch/lgamma.py new file mode 100644 index 0000000..bebe35d --- /dev/null +++ b/src/ntops/torch/lgamma.py @@ -0,0 +1,44 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +_LARGE_NUMEL_THRESHOLD = 2_000_000 # element count from which lgamma passes an explicit launch configuration + + +def lgamma(input, *, out=None): # writes the result into out (allocated here when None) and returns it + if out is None: + out = torch.empty_like(input) + + if input.numel() >= _LARGE_NUMEL_THRESHOLD: + kernel = _cached_make( + ntops.kernels.lgamma.premake, + input.ndim, + dtype=_to_nt(input.dtype), + block_size=1024, + num_warps=4, + num_stages=5, + ) + else: # smaller inputs: no block_size, num_warps or num_stages is passed + kernel = _cached_make( + ntops.kernels.lgamma.premake, + input.ndim, + dtype=_to_nt(input.dtype), + ) + + kernel(input, out) + + return out + + +def _to_nt(torch_dtype): # maps a torch float dtype to its ninetoothed counterpart + import ninetoothed + 
+ mapping = { + torch.float16: ninetoothed.float16, + torch.bfloat16: ninetoothed.bfloat16, + torch.float32: ninetoothed.float32, + torch.float64: ninetoothed.float64, + } + return mapping.get(torch_dtype) # None for any dtype not listed above diff --git a/src/ntops/torch/nextafter.py b/src/ntops/torch/nextafter.py new file mode 100644 index 0000000..45151be --- /dev/null +++ b/src/ntops/torch/nextafter.py @@ -0,0 +1,49 @@ +import torch + +import ntops +from ntops.torch.utils import ( + _cached_make, + _is_corex_compat_device, + _torch_binary_fallback, +) + + +_BLOCK_SIZE = 1024 # launch settings that nextafter forwards to _cached_make +_NUM_WARPS = 4 +_NUM_STAGES = 2 + + +def nextafter(input, other, *, out=None): # writes the result into out (allocated here when None) and returns it + if out is None: + out = torch.empty_like(input) # result takes the shape and dtype of input + + if input.dtype in (torch.float16, torch.bfloat16) and _is_corex_compat_device( + input.device + ): + _torch_binary_fallback("nextafter", input, other, out) # 16-bit floats on a CoreX-compatible device use torch.nextafter + return out + + kernel = _cached_make( + ntops.kernels.nextafter.premake, + input.ndim, + dtype=_to_nt(input.dtype), + block_size=_BLOCK_SIZE, + num_warps=_NUM_WARPS, + num_stages=_NUM_STAGES, + ) + + kernel(input, other, out) + + return out + + +def _to_nt(torch_dtype): # maps a torch float dtype to its ninetoothed counterpart + import ninetoothed + + mapping = { + torch.float16: ninetoothed.float16, + torch.bfloat16: ninetoothed.bfloat16, + torch.float32: ninetoothed.float32, + torch.float64: ninetoothed.float64, + } + return mapping.get(torch_dtype) # None for any dtype not listed above diff --git a/src/ntops/torch/rad2deg.py b/src/ntops/torch/rad2deg.py new file mode 100644 index 0000000..470c26e --- /dev/null +++ b/src/ntops/torch/rad2deg.py @@ -0,0 +1,34 @@ +import torch + +import ntops +from ntops.torch.utils import _cached_make + + +_BLOCK_SIZE = 2048 # launch settings that rad2deg forwards to _cached_make +_NUM_WARPS = 4 +_NUM_STAGES = 1 + + +def rad2deg(input, *, out=None): # writes the result into out (allocated here when None) and returns it + if out is None: + out = torch.empty_like(input) + + if input.ndim != 1 and input.is_contiguous() and out.is_contiguous(): + n = input.numel() # contiguous non-1D operands reach the kernel as flat 1D views + in_view = input.view([n]) + out_view = out.view([n]) + else: # already 1D, or some operand is not contiguous: pass the tensors through as they are + in_view = input + out_view = out + + kernel = _cached_make( + ntops.kernels.rad2deg.premake, + 
in_view.ndim, + block_size=_BLOCK_SIZE, + num_warps=_NUM_WARPS, + num_stages=_NUM_STAGES, + ) + + kernel(in_view, out_view) + + return out diff --git a/src/ntops/torch/utils.py b/src/ntops/torch/utils.py index e9b2dde..289d4a8 100644 --- a/src/ntops/torch/utils.py +++ b/src/ntops/torch/utils.py @@ -1,4 +1,5 @@ import functools +import os import ninetoothed import torch @@ -42,6 +43,100 @@ def set_default_max_num_configs(max_num_configs): _cached_make_default_config.max_num_configs = max_num_configs +def _is_corex_compat_device(device=None): # True when the backend override or the CUDA device name indicates CoreX/Iluvatar + backend = os.getenv("NTOPS_BACKEND", "").strip().lower() # a recognised NTOPS_BACKEND value decides the answer outright + if backend in {"corex", "iluvatar", "tian", "mr-v100"}: + return True + if backend in {"cuda", "nvidia"}: + return False + + if not torch.cuda.is_available(): # no override: fall back to inspecting the CUDA device name + return False + + if device is not None and getattr(device, "type", None) != "cuda": + return False + + index = getattr(device, "index", None) + if index is None: + index = torch.cuda.current_device() # no explicit index: use the current device + + try: + name = torch.cuda.get_device_name(index).lower() + except Exception: # a failed name lookup is reported as not CoreX-compatible + return False + + return "iluvatar" in name or "mr-v100" in name or "corex" in name + + +def _torch_binary_fallback(op_name, input, other, out): # runs the torch function named op_name in place of an ntops kernel + if not _is_infinicore_tensor(input): # input is not an infinicore tensor: call the torch op directly with out= + return getattr(torch, op_name)(input, other, out=out) + + input_torch = _infinicore_to_torch(input) + other_torch = _infinicore_to_torch(other) + result = getattr(torch, op_name)(input_torch, other_torch) + _copy_torch_to_infinicore(result, out) + return out + + +def _is_infinicore_tensor(value): # duck-typed check: the object has both _underlying and copy_ + return hasattr(value, "_underlying") and hasattr(value, "copy_") + + +def _infinicore_to_torch(value): # returns a torch tensor holding the data of value; non-infinicore values pass through + if not _is_infinicore_tensor(value): + return value + + result = torch.empty_strided( + tuple(value.shape), + tuple(value.stride()), + dtype=_to_torch_dtype(value.dtype), + device=str(value.device), + ) + _infinicore_from_torch(result).copy_(value) # presumably fills result through an infinicore wrapper over its memory (TODO confirm) + return result + + +def _copy_torch_to_infinicore(value, out): # copies value into out, wrapping value first when out is an infinicore tensor + if _is_infinicore_tensor(out): + 
out.copy_(_infinicore_from_torch(value)) + else: + out.copy_(value) + + +def _infinicore_from_torch(value): # builds an infinicore tensor from the data pointer, shape and strides of a torch tensor + infinicore = __import__("infinicore") # imported at call time, not at module import + infini_device = infinicore.device(value.device.type, value.device.index or 0) # a missing device index is passed as 0 + kwargs = {"dtype": _to_infinicore_dtype(value.dtype), "device": infini_device} + if value.is_contiguous(): + return infinicore.from_blob(value.data_ptr(), list(value.shape), **kwargs) + return infinicore.strided_from_blob( + value.data_ptr(), list(value.shape), list(value.stride()), **kwargs + ) + + +def _to_torch_dtype(dtype): # infinicore float dtype to torch dtype + infinicore = __import__("infinicore") + mapping = { + infinicore.float16: torch.float16, + infinicore.bfloat16: torch.bfloat16, + infinicore.float32: torch.float32, + infinicore.float64: torch.float64, + } + return mapping.get(dtype, dtype) # dtypes missing from the table are returned unchanged + + +def _to_infinicore_dtype(dtype): # torch float dtype to infinicore dtype + infinicore = __import__("infinicore") + mapping = { + torch.float16: infinicore.float16, + torch.bfloat16: infinicore.bfloat16, + torch.float32: infinicore.float32, + torch.float64: infinicore.float64, + } + return mapping[dtype] # raises KeyError for a dtype missing from the table + + +@functools.cache def _cached_make( premake, *args, num_warps=None, num_stages=None, max_num_configs=None, **keywords diff --git a/tests/perf_utils.py b/tests/perf_utils.py new file mode 100644 index 0000000..2faf53e --- /dev/null +++ b/tests/perf_utils.py @@ -0,0 +1,90 @@ +import os +import statistics + +import pytest +import torch +import triton.testing as tt + + +_RUN_PERF_ENV = "NTOPS_RUN_PERF" # environment variable that enables the perf benchmarks + + +def skip_unless_perf_enabled(): # skips the calling module unless the variable above equals "1" + if os.environ.get(_RUN_PERF_ENV, "0") != "1": + pytest.skip( + f"perf benchmark; set {_RUN_PERF_ENV}=1 to run", + allow_module_level=True, + ) + + +SHAPES = [ + (13, 4), + (8, 16), + (2, 3, 4), + (16, 5632), + (256, 5632), + (1024, 5632), +] + + +FLOAT_DTYPES = [torch.float16, torch.bfloat16, torch.float32] + + +INT_DTYPES = [torch.int8, torch.int16, torch.int32, torch.int64] + + +_DTYPE_NAMES = { + torch.float16: "fp16", + torch.bfloat16: "bf16", + torch.float32: "fp32", + 
torch.int8: "int8", + torch.int16: "int16", + torch.int32: "int32", + torch.int64: "int64", +} + + +def dtype_name(dtype): # short label for a dtype; str(dtype) when it has no entry in the table + return _DTYPE_NAMES.get(dtype, str(dtype)) + + +MIN_RATIO = 0.5 # lowest accepted value of torch_us / ntops_us + + +def bench_us(fn, *, warmup=50, rep=200, repeat=3): # median of repeat do_bench measurements, in microseconds + runs = [tt.do_bench(fn, warmup=warmup, rep=rep) * 1000 for _ in range(repeat)] # do_bench reports milliseconds + return statistics.median(runs) + + +def report_and_assert(op_name, shape, dtype, ntops_us, torch_us): # prints one result row, then fails when the ratio is below MIN_RATIO + ratio = torch_us / ntops_us if ntops_us > 0 else 0.0 # above 1 means ntops took less time; 0.0 when ntops_us is not positive + print( + f"\n {op_name:9s} shape={str(tuple(shape)):14s} dtype={dtype_name(dtype):5s} " + f"ntops={ntops_us:8.2f}us torch={torch_us:8.2f}us ratio={ratio:.3f}", + end="", + ) + assert ratio >= MIN_RATIO, ( + f"{op_name} perf regression: ratio {ratio:.3f} < {MIN_RATIO} " + f"(ntops={ntops_us:.2f}us, torch={torch_us:.2f}us, " + f"shape={tuple(shape)}, dtype={dtype_name(dtype)})" + ) + + +def warmup_pair(ntops_fn, torch_fn, n=50): # calls both callables n times before any timing is taken + for _ in range(n): + ntops_fn() + torch_fn() + torch.cuda.synchronize() # wait for the queued warm-up work to finish + + +def make_float_input(shape, dtype, *, op_name=None): # random CUDA tensor; lgamma gets a positive range + if op_name == "lgamma": + return torch.rand(shape, dtype=dtype, device="cuda") * 5.0 + 0.5 # uniform values in [0.5, 5.5) + return torch.randn(shape, dtype=dtype, device="cuda") + + +def make_int_input(shape, dtype): # random CUDA integer tensor limited to the int16 value range + info = torch.iinfo(dtype) + lo = max(info.min, -32768) + hi = min(info.max, 32767) + return torch.randint(lo, hi, shape, dtype=dtype, device="cuda") # the upper bound hi is exclusive diff --git a/tests/test_copysign.py b/tests/test_copysign.py new file mode 100644 index 0000000..43e2ecf --- /dev/null +++ b/tests/test_copysign.py @@ -0,0 +1,18 @@ +import pytest +import torch + +import ntops +from tests.skippers import skip_if_cuda_not_available +from tests.utils import generate_arguments + + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments()) +def test_copysign(shape, dtype, device, rtol, atol): + input = torch.randn(shape, dtype=dtype, device=device) + other = torch.randn(shape, dtype=dtype, device=device) + + ninetoothed_output = 
ntops.torch.copysign(input, other) + reference_output = torch.copysign(input, other) + + assert torch.allclose(ninetoothed_output, reference_output, rtol=rtol, atol=atol) diff --git a/tests/test_copysign_perf.py b/tests/test_copysign_perf.py new file mode 100644 index 0000000..9be5dda --- /dev/null +++ b/tests/test_copysign_perf.py @@ -0,0 +1,40 @@ +import itertools + +import pytest +import torch + +import ntops +from tests.perf_utils import ( + FLOAT_DTYPES, + SHAPES, + bench_us, + dtype_name, + make_float_input, + report_and_assert, + skip_unless_perf_enabled, + warmup_pair, +) +from tests.skippers import skip_if_cuda_not_available + + +skip_unless_perf_enabled() + + +_PARAMS = list(itertools.product(SHAPES, FLOAT_DTYPES)) +_IDS = [f"{tuple(s)}-{dtype_name(d)}" for s, d in _PARAMS] + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("shape, dtype", _PARAMS, ids=_IDS) +def test_copysign_perf(shape, dtype): + a = make_float_input(shape, dtype) + b = make_float_input(shape, dtype) + out = torch.empty_like(a) + + ntops_fn = lambda: ntops.torch.copysign(a, b, out=out) + torch_fn = lambda: torch.copysign(a, b, out=out) + warmup_pair(ntops_fn, torch_fn) + + ntops_us = bench_us(ntops_fn) + torch_us = bench_us(torch_fn) + report_and_assert("copysign", shape, dtype, ntops_us, torch_us) diff --git a/tests/test_lcm.py b/tests/test_lcm.py new file mode 100644 index 0000000..c4d343e --- /dev/null +++ b/tests/test_lcm.py @@ -0,0 +1,26 @@ +import pytest +import torch + +import ntops +from tests.skippers import skip_if_cuda_not_available +from tests.utils import generate_arguments + + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments(False)) +def test_lcm(shape, dtype, device, rtol, atol): + if dtype == torch.bool: + pytest.skip("torch.lcm does not support bool dtype") + + upper_bound = 100 + input = torch.randint( + -upper_bound, upper_bound, size=shape, dtype=dtype, device=device + ) + other = torch.randint( + -upper_bound, upper_bound, 
size=shape, dtype=dtype, device=device + ) + + ninetoothed_output = ntops.torch.lcm(input, other) + reference_output = torch.lcm(input, other) + + assert torch.equal(ninetoothed_output, reference_output) diff --git a/tests/test_lcm_perf.py b/tests/test_lcm_perf.py new file mode 100644 index 0000000..36c70fc --- /dev/null +++ b/tests/test_lcm_perf.py @@ -0,0 +1,40 @@ +import itertools + +import pytest +import torch + +import ntops +from tests.perf_utils import ( + INT_DTYPES, + SHAPES, + bench_us, + dtype_name, + make_int_input, + report_and_assert, + skip_unless_perf_enabled, + warmup_pair, +) +from tests.skippers import skip_if_cuda_not_available + + +skip_unless_perf_enabled() + + +_PARAMS = list(itertools.product(SHAPES, INT_DTYPES)) +_IDS = [f"{tuple(s)}-{dtype_name(d)}" for s, d in _PARAMS] + + +@skip_if_cuda_not_available +@pytest.mark.parametrize("shape, dtype", _PARAMS, ids=_IDS) +def test_lcm_perf(shape, dtype): + a = make_int_input(shape, dtype) + b = make_int_input(shape, dtype) + out = torch.empty_like(a) + + ntops_fn = lambda: ntops.torch.lcm(a, b, out=out) + torch_fn = lambda: torch.lcm(a, b, out=out) + warmup_pair(ntops_fn, torch_fn) + + ntops_us = bench_us(ntops_fn) + torch_us = bench_us(torch_fn) + report_and_assert("lcm", shape, dtype, ntops_us, torch_us) diff --git a/tests/test_lgamma.py b/tests/test_lgamma.py new file mode 100644 index 0000000..f387f7e --- /dev/null +++ b/tests/test_lgamma.py @@ -0,0 +1,19 @@ +import pytest +import torch + +import ntops +from tests.skippers import skip_if_cuda_not_available +from tests.utils import generate_arguments + + +@skip_if_cuda_not_available +@pytest.mark.parametrize(*generate_arguments()) +def test_lgamma(shape, dtype, device, rtol, atol): + input = torch.randn(shape, dtype=dtype, device=device).abs() + 0.1 + + ninetoothed_output = ntops.torch.lgamma(input) + reference_output = torch.lgamma(input) + + assert torch.allclose( + ninetoothed_output, reference_output, rtol=rtol, atol=atol, equal_nan=True + ) 
# ---- tests/test_lgamma_perf.py ----
import itertools

import pytest
import torch

import ntops
from tests.perf_utils import (
    FLOAT_DTYPES,
    SHAPES,
    bench_us,
    dtype_name,
    make_float_input,
    report_and_assert,
    skip_unless_perf_enabled,
    warmup_pair,
)
from tests.skippers import skip_if_cuda_not_available


# Gate imported from tests.perf_utils and invoked at import time; judging by
# its name it skips this module unless perf runs are enabled (TODO confirm).
skip_unless_perf_enabled()


# One benchmark case per (shape, dtype) pair, each with a readable pytest id.
_PARAMS = list(itertools.product(SHAPES, FLOAT_DTYPES))
_IDS = [f"{tuple(s)}-{dtype_name(d)}" for s, d in _PARAMS]


@skip_if_cuda_not_available
@pytest.mark.parametrize("shape, dtype", _PARAMS, ids=_IDS)
def test_lgamma_perf(shape, dtype):
    """Time ``ntops.torch.lgamma`` against ``torch.lgamma`` for one case.

    Both callables write into the same preallocated ``out`` tensor. After a
    shared warm-up, each is timed with ``bench_us`` and the two timings are
    handed to ``report_and_assert``.
    """
    a = make_float_input(shape, dtype, op_name="lgamma")
    out = torch.empty_like(a)

    # Named nested functions instead of lambdas bound to names (PEP 8 E731).
    def ntops_fn():
        return ntops.torch.lgamma(a, out=out)

    def torch_fn():
        return torch.lgamma(a, out=out)

    warmup_pair(ntops_fn, torch_fn)

    ntops_us = bench_us(ntops_fn)
    torch_us = bench_us(torch_fn)
    report_and_assert("lgamma", shape, dtype, ntops_us, torch_us)


# ---- tests/test_nextafter.py ----
import pytest
import torch

import ntops
from tests.skippers import skip_if_cuda_not_available
from tests.utils import generate_arguments


@skip_if_cuda_not_available
@pytest.mark.parametrize(*generate_arguments())
def test_nextafter(shape, dtype, device, rtol, atol):
    """Check ``ntops.torch.nextafter`` against ``torch.nextafter`` exactly.

    ``nextafter`` moves each element by a single representable step, so a
    comparison with tolerances wider than one ULP would also accept an
    implementation that returns ``input`` unchanged. The outputs are
    therefore required to match exactly (NaNs compare equal).

    ``rtol`` and ``atol`` stay in the signature because the shared
    parametrization supplies them; they are intentionally unused here.
    """
    start = torch.randn(shape, dtype=dtype, device=device)
    toward = torch.randn(shape, dtype=dtype, device=device)

    ninetoothed_output = ntops.torch.nextafter(start, toward)
    reference_output = torch.nextafter(start, toward)

    # Zero tolerances turn assert_close into an exact value comparison.
    torch.testing.assert_close(
        ninetoothed_output, reference_output, rtol=0, atol=0, equal_nan=True
    )
# ---- tests/test_nextafter_perf.py ----
import itertools

import pytest
import torch

import ntops
from tests.perf_utils import (
    FLOAT_DTYPES,
    SHAPES,
    bench_us,
    dtype_name,
    make_float_input,
    report_and_assert,
    skip_unless_perf_enabled,
    warmup_pair,
)
from tests.skippers import skip_if_cuda_not_available


# Gate from tests.perf_utils, called at import time; by its name it skips the
# module unless perf runs are enabled (TODO confirm in perf_utils).
skip_unless_perf_enabled()


# Every (shape, dtype) combination, plus a matching human-readable case id.
_PARAMS = list(itertools.product(SHAPES, FLOAT_DTYPES))
_IDS = [
    f"{tuple(case_shape)}-{dtype_name(case_dtype)}"
    for case_shape, case_dtype in _PARAMS
]


@skip_if_cuda_not_available
@pytest.mark.parametrize("shape, dtype", _PARAMS, ids=_IDS)
def test_nextafter_perf(shape, dtype):
    """Time ``ntops.torch.nextafter`` against ``torch.nextafter`` for one case.

    Two inputs are built with ``make_float_input``; both implementations
    write into one shared preallocated result tensor. The timings from
    ``bench_us`` are passed to ``report_and_assert``.
    """
    start = make_float_input(shape, dtype)
    toward = make_float_input(shape, dtype)
    result = torch.empty_like(start)

    def run_ntops():
        return ntops.torch.nextafter(start, toward, out=result)

    def run_torch():
        return torch.nextafter(start, toward, out=result)

    warmup_pair(run_ntops, run_torch)

    elapsed_ntops = bench_us(run_ntops)
    elapsed_torch = bench_us(run_torch)
    report_and_assert("nextafter", shape, dtype, elapsed_ntops, elapsed_torch)


# ---- tests/test_rad2deg.py ----
import pytest
import torch

import ntops
from tests.skippers import skip_if_cuda_not_available
from tests.utils import generate_arguments


@skip_if_cuda_not_available
@pytest.mark.parametrize(*generate_arguments())
def test_rad2deg(shape, dtype, device, rtol, atol):
    """Compare ``ntops.torch.rad2deg`` with ``torch.rad2deg`` on random input.

    The input is drawn with ``torch.randn``; the two outputs must agree
    within the parametrized ``rtol`` / ``atol``.
    """
    radians = torch.randn(shape, dtype=dtype, device=device)

    actual = ntops.torch.rad2deg(radians)
    expected = torch.rad2deg(radians)

    assert torch.allclose(actual, expected, rtol=rtol, atol=atol)


# ---- tests/test_rad2deg_perf.py ----
import itertools

import pytest
import torch

import ntops
from tests.perf_utils import (
    FLOAT_DTYPES,
    SHAPES,
    bench_us,
    dtype_name,
    make_float_input,
    report_and_assert,
    skip_unless_perf_enabled,
    warmup_pair,
)
from tests.skippers import skip_if_cuda_not_available


# Same import-time gate as in the other perf modules (see tests.perf_utils).
skip_unless_perf_enabled()


# Every (shape, dtype) combination, plus a matching human-readable case id.
_PARAMS = list(itertools.product(SHAPES, FLOAT_DTYPES))
_IDS = [
    f"{tuple(case_shape)}-{dtype_name(case_dtype)}"
    for case_shape, case_dtype in _PARAMS
]


@skip_if_cuda_not_available
@pytest.mark.parametrize("shape, dtype", _PARAMS, ids=_IDS)
def test_rad2deg_perf(shape, dtype):
    """Time ``ntops.torch.rad2deg`` against ``torch.rad2deg`` for one case.

    Both implementations write into one shared preallocated result tensor;
    the timings from ``bench_us`` are passed to ``report_and_assert``.
    """
    source = make_float_input(shape, dtype)
    result = torch.empty_like(source)

    def run_ntops():
        return ntops.torch.rad2deg(source, out=result)

    def run_torch():
        return torch.rad2deg(source, out=result)

    warmup_pair(run_ntops, run_torch)

    elapsed_ntops = bench_us(run_ntops)
    elapsed_torch = bench_us(run_torch)
    report_and_assert("rad2deg", shape, dtype, elapsed_ntops, elapsed_torch)