InfiniTensor · mygitljf · May 18, 2026 · May 18, 2026 · May 18, 2026 · May 19, 2026
diff --git a/HONOR_CODE.md b/HONOR_CODE.md
@@ -0,0 +1,72 @@
+# 2026 春季启元人工智能大赛诚信守则（Honor Code）
+
+
+本人作为 2026 春季启元人工智能大赛（以下简称“比赛”）的参赛选手，郑重承诺严格遵守比赛规则及本诚信守则，秉持诚信、公正、廉洁的参赛原则，自觉维护比赛的公平性与严肃性。本人充分理解并认可，违反本准则将导致参赛资格被取消、比赛成绩作废等相应后果，且愿意承担由此产生的一切责任。
+
+## 一、参赛诚信承诺
+
+1. 本人保证所提交的赛题PR（Pull Request）中包含的算子实现代码及相关文档，均为本人（及参赛团队，如为团队参赛）在比赛期间独立完成或在明确标注参考来源的基础上进行开发，不存在任何欺诈、抄袭、作弊行为。
+
+2. 本人承诺主动、全面、真实地披露赛题实现过程中所有参考的外部资源，尤其是开源代码资源，不隐瞒任何可能影响比赛公平性的信息。
+
+3. 本人保证不采用任何不正当手段获取比赛优势，包括但不限于窃取其他参赛选手的代码成果、利用非比赛允许的工具或技术、与他人串通作弊等。
+
+## 二、参考资源说明
+
+本人确认已按比赛要求，将本次赛题实现过程中涉及的参考资源信息单独撰写至`REFERENCE.md`文件中，该文件将与本诚信守则一同作为PR附件提交。`REFERENCE.md`需根据实际参考情况，按以下要求完整填写，信息不完整或虚假填写将视为违反本准则：
+
+**情况1：无参考外部开源代码及核心实现思路**
+
+`REFERENCE.md`中需明确声明：“本次赛题提交的算子代码、核心算法逻辑及实现方案均为本人（及参赛团队）独立设计与开发，未参考任何外部开源项目、技术文档中的核心代码片段或实现思路，未接受任何第三方的技术指导或代码支持。”
+
+**情况2：有参考外部开源代码及相关资源**
+
+对每个参考资源提供以下信息陈述： 
+1. 参考开源项目/资源名称
+
+2. 参考资源链接（GitHub/Gitee/论文/技术文档等）
+
+3. 参考的具体内容（请明确说明参考的代码片段、算法逻辑、实现思路等，需标注对应资源的具体位置，如文件路径、代码行数等）
+
+4. 本人对参考内容的修改与优化说明：（请详细说明在参考基础上，本人所做的独立开发、修改、优化工作，体现自身技术贡献）
+
+5. 若是开源项目，提供参考资源的开源协议类型：（如MIT、Apache 2.0、GPL等）
+
+6. 其他需要补充说明的信息
+
+
+## 三、禁止行为确认
+
+本人明确知晓并承诺避免以下违反比赛公平性的行为，若存在以下任一情况，自愿接受比赛组委会的相应处罚：
+
+1. 未经授权复制、抄袭他人（包括其他参赛选手、开源项目、商业代码）的代码、算法或技术方案，且未进行明确标注；
+
+2. 隐瞒或虚假披露参考资源信息，包括遗漏重要参考来源、伪造参考内容说明等；
+
+3. 与其他参赛选手或第三方串通，进行代码共享、成果交换等违规协作；
+
+4. 利用比赛平台漏洞、技术缺陷或非比赛允许的工具获取不正当利益；
+
+5. 伪造比赛相关证明材料、提交虚假信息；
+
+6. 其他违反比赛规则及公序良俗的不诚信行为。
+
+
+## 四、责任与确认
+
+1. 本人充分理解，比赛组委会将对所有提交的PR进行代码溯源、参考信息核查等公平性审查，若发现本人存在违反本准则的行为，有权随时取消本人的参赛资格、作废比赛成绩，情节严重的将在比赛相关平台进行公示。
+
+2. 若因本人违反本准则导致比赛争议或第三方权益受损（如开源协议侵权等），本人将独立承担全部法律责任及相关损失，与比赛组委会无关。
+
+3. 本人确认已仔细阅读并完全理解本诚信守则的全部内容，自愿签署本准则，接受比赛组委会的监督与审查。
+
+## 五、签署信息
+
+参赛选手姓名（团队参赛需填写所有成员姓名）
+
+    练锦烽
+
+签署日期
+
+___2026___年__5__月__18__日
+
diff --git a/src/ntops/kernels/__init__.py b/src/ntops/kernels/__init__.py
@@ -9,6 +9,7 @@
     bmm,
     clamp,
     conv2d,
+    copysign,
     cos,
     div,
     dropout,
@@ -20,14 +21,18 @@
     isinf,
     isnan,
     layer_norm,
+    lcm,
     le,
+    lgamma,
     lt,
     max_pool2d,
     mm,
     mul,
     ne,
     neg,
+    nextafter,
     pow,
+    rad2deg,
     relu,
     rms_norm,
     rotary_position_embedding,
@@ -52,6 +57,7 @@
     "bmm",
     "clamp",
     "conv2d",
+    "copysign",
     "cos",
     "div",
     "dropout",
@@ -63,14 +69,18 @@
     "isinf",
     "isnan",
     "layer_norm",
+    "lcm",
     "le",
+    "lgamma",
     "lt",
     "max_pool2d",
     "mm",
     "mul",
     "ne",
     "neg",
+    "nextafter",
     "pow",
+    "rad2deg",
     "relu",
     "rms_norm",
     "rotary_position_embedding",

diff --git a/src/ntops/kernels/copysign.py b/src/ntops/kernels/copysign.py
@@ -0,0 +1,68 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+
+from ntops.kernels.element_wise import arrangement
+
+
+def application_int16(input, other, output):
+    # Element-wise copysign for 16-bit element types: the result keeps the
+    # magnitude bits of `input` and takes the sign bit of `other`.
+    #
+    # Pure bit manipulation: take magnitude bits of input, sign bit of other.
+    # Avoids the fp16/bf16 -> fp32 -> fp16/bf16 round-trip required by
+    # libdevice.copysign, which doesn't support narrow floats.
+    #
+    # NOTE(review): the result is delivered by rebinding `output` (hence the
+    # `noqa: F841`); presumably ninetoothed turns that into the store --
+    # confirm.
+    dtype = output.dtype
+    int_dtype = ntl.int16
+
+    # Reinterpret the raw bits of both operands as 16-bit integers.
+    input_bits = ntl.cast(input, int_dtype, bitcast=True)
+    other_bits = ntl.cast(other, int_dtype, bitcast=True)
+    # Bit 15 is the sign bit of a 16-bit value.
+    sign_bit = ntl.cast(1, int_dtype) << 15
+    # Intended to be the low 15 bits (everything except the sign bit).
+    # NOTE(review): relies on two's-complement wraparound of the signed
+    # subtraction -- confirm for the target backend.
+    magn_mask = sign_bit - ntl.cast(1, int_dtype)
+    # Combine magnitude and sign, then reinterpret the bits as `dtype` again.
+    output = ntl.cast(  # noqa: F841
+        (input_bits & magn_mask) | (other_bits & sign_bit), dtype, bitcast=True
+    )
+
+
+def application_int32(input, other, output):
+    # Element-wise copysign for 32-bit element types, done purely on the bit
+    # pattern: magnitude bits come from `input`, the sign bit from `other`.
+    #
+    # NOTE(review): the result is delivered by rebinding `output` (hence the
+    # `noqa: F841`); presumably ninetoothed turns that into the store --
+    # confirm.
+    dtype = output.dtype
+    int_dtype = ntl.int32
+
+    # Reinterpret the raw bits of both operands as 32-bit integers.
+    input_bits = ntl.cast(input, int_dtype, bitcast=True)
+    other_bits = ntl.cast(other, int_dtype, bitcast=True)
+    # Bit 31 is the sign bit of a 32-bit value.
+    sign_bit = ntl.cast(1, int_dtype) << 31
+    # Intended to be the low 31 bits (everything except the sign bit).
+    # NOTE(review): relies on two's-complement wraparound of the signed
+    # subtraction -- confirm for the target backend.
+    magn_mask = sign_bit - ntl.cast(1, int_dtype)
+    # Combine magnitude and sign, then reinterpret the bits as `dtype` again.
+    output = ntl.cast(  # noqa: F841
+        (input_bits & magn_mask) | (other_bits & sign_bit), dtype, bitcast=True
+    )
+
+
+def application_int64(input, other, output):
+    # Element-wise copysign for 64-bit element types, done purely on the bit
+    # pattern: magnitude bits come from `input`, the sign bit from `other`.
+    #
+    # NOTE(review): the result is delivered by rebinding `output` (hence the
+    # `noqa: F841`); presumably ninetoothed turns that into the store --
+    # confirm.
+    dtype = output.dtype
+    int_dtype = ntl.int64
+
+    # Reinterpret the raw bits of both operands as 64-bit integers.
+    input_bits = ntl.cast(input, int_dtype, bitcast=True)
+    other_bits = ntl.cast(other, int_dtype, bitcast=True)
+    # Bit 63 is the sign bit of a 64-bit value.
+    sign_bit = ntl.cast(1, int_dtype) << 63
+    # Intended to be the low 63 bits (everything except the sign bit).
+    # NOTE(review): relies on two's-complement wraparound of the signed
+    # subtraction -- confirm for the target backend.
+    magn_mask = sign_bit - ntl.cast(1, int_dtype)
+    # Combine magnitude and sign, then reinterpret the bits as `dtype` again.
+    output = ntl.cast(  # noqa: F841
+        (input_bits & magn_mask) | (other_bits & sign_bit), dtype, bitcast=True
+    )
+
+
+def premake(ndim, dtype=None, block_size=None):
+    arrangement_ = functools.partial(arrangement, block_size=block_size)
+
+    if dtype in (ninetoothed.float16, ninetoothed.bfloat16):
+        application = application_int16
+    elif dtype == ninetoothed.float32:
+        application = application_int32
+    else:
+        application = application_int64
+
+    tensors = (
+        Tensor(ndim, dtype=dtype),
+        Tensor(ndim, dtype=dtype),
+        Tensor(ndim, dtype=dtype),
+    )
+
+    return arrangement_, application, tensors
diff --git a/src/ntops/kernels/lcm.py b/src/ntops/kernels/lcm.py
@@ -0,0 +1,184 @@
+import functools
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor
+from ninetoothed.language import libdevice
+
+from ntops.kernels.element_wise import arrangement
+
+
+# T1-1-1 lcm: dtype-dispatched algorithm.
+#
+# int8/int16/int32 -> Stein binary GCD (no IDIV, cheap per-iter).
+# int64            -> Dynamic Euclidean with grouped block-level early stop.
+#
+# Why two algorithms:
+#   Stein per-iter on A100: ~14 us at BLOCK=512/warps=8 (no IDIV; just
+#       ffs+shift+sub+min+abs + 1 where).
+#   Euclidean per-iter on A100: ~14 us at BLOCK=32/warps=1 (1 IDIV +
+#       2 wheres; the int64 IDIV ~30 cycles, but BLOCK=32/warps=1 means
+#       1 element per thread which maximizes the number of concurrent
+#       in-flight IDIVs across SMs).
+#       At BLOCK=512/warps=8 (= 2 elements per thread), Euclidean is ~43
+#       us per iter because each thread's two dependent IDIV chains block
+#       each other -> 3x slower. The (32, 1) config is critical.
+#   For int8/16/32, Stein static unroll is unbeatable (no IDIV, no dynamic
+#       check overhead). For int64 with v2-style small inputs (values <=
+#       ~2^20), Euclidean dynamic averages ~14 outer iters vs Stein's
+#       fixed 60, giving ~4x speedup.
+#
+# Stein unroll counts (worst-case empirically validated):
+#   int8  (value range <= 127):                            max  5 -> use  8
+#   int16 (value range <= 32767):                          max 13 -> use 16
+#   int32 (value range <= 2^31 - 1):                       max 31 -> use 36
+#
+# Euclidean (int64) uses grouped dynamic stop:
+#   outer cap = 12, inner unroll = 8 -> max 96 Euclidean iters.
+#   Block-level `ntl.max(b) != 0` check every 8 inner iters.
+#   N=96 covers Fibonacci adversarial worst case (~91 iters) for full
+#   int64 range.
+#
+# Sentinel-merge (iter05): one `where` per Stein iter using `a` (always
+# odd, always >= 1) as the `b == 0` sentinel.
+#
+# History:
+#   iter05: sentinel-merge + flat 1D + explicit (512, warps_per_dt, 1)
+#   iter06: int64 Stein 72 -> 64
+#   iter07: int64 Stein 64 -> 60 (1M+ sample empirical worst case = 57)
+#   iter08: int64 switch Stein -> dynamic Euclidean at (BLOCK=32, warps=1).
+#           Trade-off: int64 v2-style small-input launches ~2-4x faster
+#           than Stein; full-range int64 launches ~1.9x slower. Other
+#           dtypes unchanged.
+def application_8(input, other, output):
+    # Element-wise lcm(input, other) using a binary (Stein) GCD with a fixed
+    # 8-iteration loop. `premake` in this file selects this variant for every
+    # dtype that is not int64, int32 or int16.
+    #
+    # NOTE(review): the result is delivered by rebinding `output` (hence the
+    # `noqa: F841`); presumably ninetoothed turns that into the store --
+    # confirm.
+    dtype = output.dtype
+    # All arithmetic is carried out in int32; the result is cast back to
+    # `dtype` at the end.
+    compute_dtype = ntl.int32
+    abs_a = ntl.abs(ntl.cast(input, compute_dtype))
+    abs_b = ntl.abs(ntl.cast(other, compute_dtype))
+    # Zero only when both inputs are zero; used for the lcm(0, 0) case below.
+    or_ab = abs_a | abs_b
+    # Keep at least one bit set so the lowest-set-bit lookup is well defined.
+    safe_or = ntl.where(or_ab != 0, or_ab, 1)
+    # k = number of trailing zero bits shared by both operands, i.e. the
+    # power of two common to them (ffs is used as "index of lowest set bit
+    # plus one").
+    k = ntl.cast(libdevice.ffs(safe_or) - 1, compute_dtype)
+    # Strip the common factor 2**k from both operands.
+    a0 = abs_a >> k
+    b0 = abs_b >> k
+    nonzero_a0 = a0 != 0
+    safe_a0 = ntl.where(nonzero_a0, a0, 1)
+    ctz_a0 = ntl.cast(libdevice.ffs(safe_a0) - 1, compute_dtype)
+    # a: a0 with its remaining trailing zeros removed, or b0 when a0 is zero.
+    # b: b0, or 0 when a0 is zero (nothing left to reduce in that case).
+    a = ntl.where(nonzero_a0, a0 >> ctz_a0, b0)
+    b = ntl.where(nonzero_a0, b0, ntl.cast(0, compute_dtype))
+    # Both inputs zero: force a to 1 so it can still serve as the sentinel
+    # in the loop below.
+    a = ntl.where(a == 0, ntl.cast(1, compute_dtype), a)
+    for _ in range(8):
+        # Once b has reached 0, substitute a for it; the subtraction below
+        # then yields 0 and a is left unchanged, so the step is a no-op.
+        b_for_calc = ntl.where(b != 0, b, a)
+        ctz_b = ntl.cast(libdevice.ffs(b_for_calc) - 1, compute_dtype)
+        # Remove the trailing zero bits of b.
+        b_odd = b_for_calc >> ctz_b
+        diff = b_odd - a
+        # Keep the smaller value in a and the absolute difference in b.
+        a = ntl.minimum(a, b_odd)
+        b = ntl.abs(diff)
+    # Restore the common power of two removed at the start.
+    gcd = a << k
+    # Guard the division below against a zero divisor.
+    safe_gcd = ntl.where(gcd == 0, 1, gcd)
+    # lcm = |(|a| // gcd) * |b||, with 0 when both inputs are zero.
+    output = ntl.cast(  # noqa: F841
+        ntl.where(or_ab == 0, 0, ntl.abs((abs_a // safe_gcd) * abs_b)), dtype
+    )
+
+
+def application_16(input, other, output):
+    # Element-wise lcm(input, other) using a binary (Stein) GCD with a fixed
+    # 16-iteration loop. `premake` in this file selects this variant for
+    # int16.
+    #
+    # NOTE(review): the result is delivered by rebinding `output` (hence the
+    # `noqa: F841`); presumably ninetoothed turns that into the store --
+    # confirm.
+    dtype = output.dtype
+    # All arithmetic is carried out in int32; the result is cast back to
+    # `dtype` at the end.
+    compute_dtype = ntl.int32
+    abs_a = ntl.abs(ntl.cast(input, compute_dtype))
+    abs_b = ntl.abs(ntl.cast(other, compute_dtype))
+    # Zero only when both inputs are zero; used for the lcm(0, 0) case below.
+    or_ab = abs_a | abs_b
+    # Keep at least one bit set so the lowest-set-bit lookup is well defined.
+    safe_or = ntl.where(or_ab != 0, or_ab, 1)
+    # k = number of trailing zero bits shared by both operands, i.e. the
+    # power of two common to them (ffs is used as "index of lowest set bit
+    # plus one").
+    k = ntl.cast(libdevice.ffs(safe_or) - 1, compute_dtype)
+    # Strip the common factor 2**k from both operands.
+    a0 = abs_a >> k
+    b0 = abs_b >> k
+    nonzero_a0 = a0 != 0
+    safe_a0 = ntl.where(nonzero_a0, a0, 1)
+    ctz_a0 = ntl.cast(libdevice.ffs(safe_a0) - 1, compute_dtype)
+    # a: a0 with its remaining trailing zeros removed, or b0 when a0 is zero.
+    # b: b0, or 0 when a0 is zero (nothing left to reduce in that case).
+    a = ntl.where(nonzero_a0, a0 >> ctz_a0, b0)
+    b = ntl.where(nonzero_a0, b0, ntl.cast(0, compute_dtype))
+    # Both inputs zero: force a to 1 so it can still serve as the sentinel
+    # in the loop below.
+    a = ntl.where(a == 0, ntl.cast(1, compute_dtype), a)
+    for _ in range(16):
+        # Once b has reached 0, substitute a for it; the subtraction below
+        # then yields 0 and a is left unchanged, so the step is a no-op.
+        b_for_calc = ntl.where(b != 0, b, a)
+        ctz_b = ntl.cast(libdevice.ffs(b_for_calc) - 1, compute_dtype)
+        # Remove the trailing zero bits of b.
+        b_odd = b_for_calc >> ctz_b
+        diff = b_odd - a
+        # Keep the smaller value in a and the absolute difference in b.
+        a = ntl.minimum(a, b_odd)
+        b = ntl.abs(diff)
+    # Restore the common power of two removed at the start.
+    gcd = a << k
+    # Guard the division below against a zero divisor.
+    safe_gcd = ntl.where(gcd == 0, 1, gcd)
+    # lcm = |(|a| // gcd) * |b||, with 0 when both inputs are zero.
+    output = ntl.cast(  # noqa: F841
+        ntl.where(or_ab == 0, 0, ntl.abs((abs_a // safe_gcd) * abs_b)), dtype
+    )
+
+
+def application_36(input, other, output):
+    # Element-wise lcm(input, other) using a binary (Stein) GCD with a fixed
+    # 36-iteration loop. `premake` in this file selects this variant for
+    # int32.
+    #
+    # NOTE(review): the result is delivered by rebinding `output` (hence the
+    # `noqa: F841`); presumably ninetoothed turns that into the store --
+    # confirm.
+    dtype = output.dtype
+    # All arithmetic is carried out in int32; the result is cast back to
+    # `dtype` at the end.
+    compute_dtype = ntl.int32
+    abs_a = ntl.abs(ntl.cast(input, compute_dtype))
+    abs_b = ntl.abs(ntl.cast(other, compute_dtype))
+    # Zero only when both inputs are zero; used for the lcm(0, 0) case below.
+    or_ab = abs_a | abs_b
+    # Keep at least one bit set so the lowest-set-bit lookup is well defined.
+    safe_or = ntl.where(or_ab != 0, or_ab, 1)
+    # k = number of trailing zero bits shared by both operands, i.e. the
+    # power of two common to them (ffs is used as "index of lowest set bit
+    # plus one").
+    k = ntl.cast(libdevice.ffs(safe_or) - 1, compute_dtype)
+    # Strip the common factor 2**k from both operands.
+    a0 = abs_a >> k
+    b0 = abs_b >> k
+    nonzero_a0 = a0 != 0
+    safe_a0 = ntl.where(nonzero_a0, a0, 1)
+    ctz_a0 = ntl.cast(libdevice.ffs(safe_a0) - 1, compute_dtype)
+    # a: a0 with its remaining trailing zeros removed, or b0 when a0 is zero.
+    # b: b0, or 0 when a0 is zero (nothing left to reduce in that case).
+    a = ntl.where(nonzero_a0, a0 >> ctz_a0, b0)
+    b = ntl.where(nonzero_a0, b0, ntl.cast(0, compute_dtype))
+    # Both inputs zero: force a to 1 so it can still serve as the sentinel
+    # in the loop below.
+    a = ntl.where(a == 0, ntl.cast(1, compute_dtype), a)
+    for _ in range(36):
+        # Once b has reached 0, substitute a for it; the subtraction below
+        # then yields 0 and a is left unchanged, so the step is a no-op.
+        b_for_calc = ntl.where(b != 0, b, a)
+        ctz_b = ntl.cast(libdevice.ffs(b_for_calc) - 1, compute_dtype)
+        # Remove the trailing zero bits of b.
+        b_odd = b_for_calc >> ctz_b
+        diff = b_odd - a
+        # Keep the smaller value in a and the absolute difference in b.
+        a = ntl.minimum(a, b_odd)
+        b = ntl.abs(diff)
+    # Restore the common power of two removed at the start.
+    gcd = a << k
+    # Guard the division below against a zero divisor.
+    safe_gcd = ntl.where(gcd == 0, 1, gcd)
+    # lcm = |(|a| // gcd) * |b||, with 0 when both inputs are zero.
+    output = ntl.cast(  # noqa: F841
+        ntl.where(or_ab == 0, 0, ntl.abs((abs_a // safe_gcd) * abs_b)), dtype
+    )
+
+
+def application_euclidean_dyn(input, other, output):
+    # Element-wise lcm(input, other) using the Euclidean remainder algorithm
+    # with an early exit; `premake` in this file selects it for int64.
+    #
+    # Dynamic Euclidean for int64.
+    # Block-level early stop every 8 inner iters; outer cap 12 -> N=96.
+    # Convergence for random uniform full-range int64 averages ~36 iters;
+    # for v2-style range (<=2^20) averages ~14 iters.
+    #
+    # NOTE(review): the result is delivered by rebinding `output` (hence the
+    # `noqa: F841`); presumably ninetoothed turns that into the store --
+    # confirm.
+    dtype = output.dtype
+    # Magnitudes are taken directly in the input dtype (no widening cast).
+    abs_a = ntl.abs(input)
+    abs_b = ntl.abs(other)
+    # Zero only when both inputs are zero; used for the lcm(0, 0) case below.
+    or_ab = abs_a | abs_b
+    # Order the pair so that a holds the larger magnitude and b the smaller.
+    a = ntl.where(abs_a >= abs_b, abs_a, abs_b)
+    b = ntl.where(abs_a >= abs_b, abs_b, abs_a)
+    outer = 0
+    # Continue while the maximum of b is still nonzero, for at most 12 outer
+    # rounds of 8 remainder steps each.
+    while ntl.max(b) != 0 and outer < 12:
+        for _ in range(8):
+            # Avoid a modulo by zero for elements that have already finished.
+            b_safe = ntl.where(b != 0, b, ntl.cast(1, dtype))
+            r = a % b_safe
+            # Finished elements (b == 0) keep their a; since a % 1 == 0 their
+            # b also stays 0.
+            a = ntl.where(b != 0, b, a)
+            b = r
+        outer += 1
+    gcd = a
+    # Guard the division below against a zero divisor.
+    safe_gcd = ntl.where(gcd == 0, 1, gcd)
+    # lcm = |(|a| // gcd) * |b||, with 0 when both inputs are zero.
+    output = ntl.cast(  # noqa: F841
+        ntl.where(or_ab == 0, 0, ntl.abs((abs_a // safe_gcd) * abs_b)), dtype
+    )
+
+
+def premake(ndim, dtype=None, block_size=None):
+    arrangement_ = functools.partial(arrangement, block_size=block_size)
+    if dtype == ninetoothed.int64:
+        application = application_euclidean_dyn
+    elif dtype == ninetoothed.int32:
+        application = application_36
+    elif dtype == ninetoothed.int16:
+        application = application_16
+    else:
+        application = application_8
+    tensors = (
+        Tensor(ndim, dtype=dtype),
+        Tensor(ndim, dtype=dtype),
+        Tensor(ndim, dtype=dtype),
+    )
+    return arrangement_, application, tensors