Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions HONOR_CODE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# 2026 春季启元人工智能大赛诚信守则(Honor Code)


本人作为 2026 春季启元人工智能大赛(以下简称“比赛”)的参赛选手,郑重承诺严格遵守比赛规则及本诚信守则,秉持诚信、公正、廉洁的参赛原则,自觉维护比赛的公平性与严肃性。本人充分理解并认可,违反本守则将导致参赛资格被取消、比赛成绩作废等相应后果,且愿意承担由此产生的一切责任。

## 一、参赛诚信承诺

1. 本人保证所提交的赛题PR(Pull Request)中包含的算子实现代码及相关文档,均为本人(及参赛团队,如为团队参赛)在比赛期间独立完成或在明确标注参考来源的基础上进行开发,不存在任何欺诈、抄袭、作弊行为。

2. 本人承诺主动、全面、真实地披露赛题实现过程中所有参考的外部资源,尤其是开源代码资源,不隐瞒任何可能影响比赛公平性的信息。

3. 本人保证不采用任何不正当手段获取比赛优势,包括但不限于窃取其他参赛选手的代码成果、利用非比赛允许的工具或技术、与他人串通作弊等。

## 二、参考资源说明

本人确认已按比赛要求,将本次赛题实现过程中涉及的参考资源信息单独撰写至`REFERENCE.md`文件中,该文件将与本诚信守则一同作为PR附件提交。`REFERENCE.md`需根据实际参考情况,按以下要求完整填写,信息不完整或虚假填写将视为违反本守则:

**情况1:无参考外部开源代码及核心实现思路**

`REFERENCE.md`中需明确声明:“本次赛题提交的算子代码、核心算法逻辑及实现方案均为本人(及参赛团队)独立设计与开发,未参考任何外部开源项目、技术文档中的核心代码片段或实现思路,未接受任何第三方的技术指导或代码支持。”

**情况2:有参考外部开源代码及相关资源**

对每个参考资源提供以下信息陈述:
1. 参考开源项目/资源名称

2. 参考资源链接(GitHub/Gitee/论文/技术文档等)

3. 参考的具体内容(请明确说明参考的代码片段、算法逻辑、实现思路等,需标注对应资源的具体位置,如文件路径、代码行数等)

4. 本人对参考内容的修改与优化说明:(请详细说明在参考基础上,本人所做的独立开发、修改、优化工作,体现自身技术贡献)

5. 若是开源项目,提供参考资源的开源协议类型:(如MIT、Apache 2.0、GPL等)

6. 其他需要补充说明的信息


## 三、禁止行为确认

本人明确知晓并承诺避免以下违反比赛公平性的行为,若存在以下任一情况,自愿接受比赛组委会的相应处罚:

1. 未经授权复制、抄袭他人(包括其他参赛选手、开源项目、商业代码)的代码、算法或技术方案,且未进行明确标注;

2. 隐瞒或虚假披露参考资源信息,包括遗漏重要参考来源、伪造参考内容说明等;

3. 与其他参赛选手或第三方串通,进行代码共享、成果交换等违规协作;

4. 利用比赛平台漏洞、技术缺陷或非比赛允许的工具获取不正当利益;

5. 伪造比赛相关证明材料、提交虚假信息;

6. 其他违反比赛规则及公序良俗的不诚信行为。


## 四、责任与确认

1. 本人充分理解,比赛组委会将对所有提交的PR进行代码溯源、参考信息核查等公平性审查,若发现本人存在违反本守则的行为,有权随时取消本人的参赛资格、作废比赛成绩,情节严重的将在比赛相关平台进行公示。

2. 若因本人违反本守则导致比赛争议或第三方权益受损(如开源协议侵权等),本人将独立承担全部法律责任及相关损失,与比赛组委会无关。

3. 本人确认已仔细阅读并完全理解本诚信守则的全部内容,自愿签署本守则,接受比赛组委会的监督与审查。

## 五、签署信息

参赛选手姓名(团队参赛需填写所有成员姓名)

练锦烽

签署日期

___2026___年__5__月__18__日

10 changes: 10 additions & 0 deletions src/ntops/kernels/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
bmm,
clamp,
conv2d,
copysign,
cos,
div,
dropout,
Expand All @@ -20,14 +21,18 @@
isinf,
isnan,
layer_norm,
lcm,
le,
lgamma,
lt,
max_pool2d,
mm,
mul,
ne,
neg,
nextafter,
pow,
rad2deg,
relu,
rms_norm,
rotary_position_embedding,
Expand All @@ -52,6 +57,7 @@
"bmm",
"clamp",
"conv2d",
"copysign",
"cos",
"div",
"dropout",
Expand All @@ -63,14 +69,18 @@
"isinf",
"isnan",
"layer_norm",
"lcm",
"le",
"lgamma",
"lt",
"max_pool2d",
"mm",
"mul",
"ne",
"neg",
"nextafter",
"pow",
"rad2deg",
"relu",
"rms_norm",
"rotary_position_embedding",
Expand Down
68 changes: 68 additions & 0 deletions src/ntops/kernels/copysign.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import functools

import ninetoothed
import ninetoothed.language as ntl
from ninetoothed import Tensor

from ntops.kernels.element_wise import arrangement


def application_int16(input, other, output):
    # copysign through raw bit patterns: keep every bit below bit 15 from
    # `input`, take bit 15 from `other`, and reinterpret the merged pattern
    # as the dtype of `output`.
    # Working on the bits avoids the fp16/bf16 -> fp32 -> fp16/bf16
    # round-trip required by libdevice.copysign, which doesn't support
    # narrow floats.
    out_dtype = output.dtype
    bits_dtype = ntl.int16

    # Build the two masks first: the top bit, and everything beneath it.
    sign_mask = ntl.cast(1, bits_dtype) << 15
    magnitude_mask = sign_mask - ntl.cast(1, bits_dtype)

    magnitude = ntl.cast(input, bits_dtype, bitcast=True) & magnitude_mask
    sign = ntl.cast(other, bits_dtype, bitcast=True) & sign_mask

    output = ntl.cast(magnitude | sign, out_dtype, bitcast=True)  # noqa: F841


def application_int32(input, other, output):
    # copysign through raw bit patterns, 32-bit variant: keep every bit
    # below bit 31 from `input`, take bit 31 from `other`, and reinterpret
    # the merged pattern as the dtype of `output`.
    out_dtype = output.dtype
    bits_dtype = ntl.int32

    # Build the two masks first: the top bit, and everything beneath it.
    sign_mask = ntl.cast(1, bits_dtype) << 31
    magnitude_mask = sign_mask - ntl.cast(1, bits_dtype)

    magnitude = ntl.cast(input, bits_dtype, bitcast=True) & magnitude_mask
    sign = ntl.cast(other, bits_dtype, bitcast=True) & sign_mask

    output = ntl.cast(magnitude | sign, out_dtype, bitcast=True)  # noqa: F841


def application_int64(input, other, output):
    # copysign through raw bit patterns, 64-bit variant: keep every bit
    # below bit 63 from `input`, take bit 63 from `other`, and reinterpret
    # the merged pattern as the dtype of `output`.
    out_dtype = output.dtype
    bits_dtype = ntl.int64

    # Build the two masks first: the top bit, and everything beneath it.
    sign_mask = ntl.cast(1, bits_dtype) << 63
    magnitude_mask = sign_mask - ntl.cast(1, bits_dtype)

    magnitude = ntl.cast(input, bits_dtype, bitcast=True) & magnitude_mask
    sign = ntl.cast(other, bits_dtype, bitcast=True) & sign_mask

    output = ntl.cast(magnitude | sign, out_dtype, bitcast=True)  # noqa: F841


def premake(ndim, dtype=None, block_size=None):
    """Return the ``(arrangement, application, tensors)`` triple for copysign.

    The application is chosen from ``dtype``: float16 and bfloat16 use the
    16-bit bit-mask variant, float32 the 32-bit one, and every other value
    (including the default ``None``) the 64-bit one. ``tensors`` holds three
    ``Tensor(ndim, dtype=dtype)`` descriptors, and the arrangement is the
    element-wise ``arrangement`` with ``block_size`` bound.
    """
    # Start from the fallback and override it for the narrower dtypes.
    application = application_int64
    if dtype in (ninetoothed.float16, ninetoothed.bfloat16):
        application = application_int16
    elif dtype == ninetoothed.float32:
        application = application_int32

    # One descriptor per application parameter: input, other, output.
    tensors = tuple(Tensor(ndim, dtype=dtype) for _ in range(3))

    return functools.partial(arrangement, block_size=block_size), application, tensors
184 changes: 184 additions & 0 deletions src/ntops/kernels/lcm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import functools

import ninetoothed
import ninetoothed.language as ntl
from ninetoothed import Tensor
from ninetoothed.language import libdevice

from ntops.kernels.element_wise import arrangement


# T1-1-1 lcm: dtype-dispatched algorithm.
#
# int8/int16/int32 -> Stein binary GCD (no IDIV, cheap per-iter).
# int64 -> Dynamic Euclidean with grouped block-level early stop.
#
# Why two algorithms:
# Stein per-iter on A100: ~14 us at BLOCK=512/warps=8 (no IDIV; just
# ffs+shift+sub+min+abs + 1 where).
# Euclidean per-iter on A100: ~14 us at BLOCK=32/warps=1 (1 IDIV +
# 2 wheres; the int64 IDIV ~30 cycles, but BLOCK=32/warps=1 means
# 1 element per thread which maximizes the number of concurrent
# in-flight IDIVs across SMs).
# At BLOCK=512/warps=8 (= 2 elements per thread), Euclidean is ~43
# us per iter because each thread's two dependent IDIV chains block
# each other -> 3x slower. The (32, 1) config is critical.
# For int8/16/32, Stein static unroll is unbeatable (no IDIV, no dynamic
# check overhead). For int64 with v2-style small inputs (values <=
# ~2^20), Euclidean dynamic averages ~14 outer iters vs Stein's
# fixed 60, giving ~4x speedup.
#
# Stein unroll counts (worst-case empirically validated):
# int8 (value range <= 127): max 5 -> use 8
# int16 (value range <= 32767): max 13 -> use 16
# int32 (value range <= 2^31): max 31 -> use 36
#
# Euclidean (int64) uses grouped dynamic stop:
# outer cap = 12, inner unroll = 8 -> max 96 Euclidean iters.
# Block-level `ntl.max(b) != 0` check every 8 inner iters.
# N=96 covers Fibonacci adversarial worst case (~91 iters) for full
# int64 range.
#
# Sentinel-merge (iter05): one `where` per Stein iter using `a` (always
# odd, always >= 1) as the `b == 0` sentinel.
#
# History:
# iter05: sentinel-merge + flat 1D + explicit (512, warps_per_dt, 1)
# iter06: int64 Stein 72 -> 64
# iter07: int64 Stein 64 -> 60 (1M+ sample empirical worst case = 57)
# iter08: int64 switch Stein -> dynamic Euclidean at (BLOCK=32, warps=1).
# Trade-off: int64 v2-style small-input launches ~2-4x faster
# than Stein; full-range int64 launches ~1.9x slower. Other
# dtypes unchanged.
def application_8(input, other, output):
    # lcm built on a binary (Stein) GCD with a fixed 8-step loop. All
    # arithmetic runs in int32; the result is cast to the dtype of `output`.
    out_dtype = output.dtype
    wide = ntl.int32

    mag_a = ntl.abs(ntl.cast(input, wide))
    mag_b = ntl.abs(ntl.cast(other, wide))
    either = mag_a | mag_b

    # `ffs(x) - 1` is used as the index of the lowest set bit, so the operand
    # is replaced by 1 wherever it would be zero. Applied to the OR of both
    # magnitudes it yields the shift both inputs have in common.
    shared_shift = ntl.cast(
        libdevice.ffs(ntl.where(either == 0, 1, either)) - 1, wide
    )
    reduced_a = mag_a >> shared_shift
    reduced_b = mag_b >> shared_shift

    # Seed the loop: `odd` carries the running candidate with its trailing
    # zero bits removed, `rest` the other operand. When the first input is
    # zero the second takes its place and `rest` starts empty.
    a_present = reduced_a != 0
    a_trailing = ntl.cast(
        libdevice.ffs(ntl.where(a_present, reduced_a, 1)) - 1, wide
    )
    odd = ntl.where(a_present, reduced_a >> a_trailing, reduced_b)
    rest = ntl.where(a_present, reduced_b, ntl.cast(0, wide))
    odd = ntl.where(odd == 0, ntl.cast(1, wide), odd)

    for _ in range(8):
        # Once `rest` reaches zero, feed `odd` back in as a sentinel so the
        # step leaves `odd` unchanged and `rest` at zero.
        operand = ntl.where(rest != 0, rest, odd)
        operand_odd = operand >> ntl.cast(libdevice.ffs(operand) - 1, wide)
        gap = operand_odd - odd
        odd = ntl.minimum(odd, operand_odd)
        rest = ntl.abs(gap)

    # Restore the shared shift, then lcm = |a| // gcd * |b| (0 when both are 0).
    gcd = odd << shared_shift
    divisor = ntl.where(gcd == 0, 1, gcd)
    product = ntl.abs((mag_a // divisor) * mag_b)
    output = ntl.cast(ntl.where(either == 0, 0, product), out_dtype)  # noqa: F841


def application_16(input, other, output):
    # lcm built on a binary (Stein) GCD with a fixed 16-step loop. All
    # arithmetic runs in int32; the result is cast to the dtype of `output`.
    out_dtype = output.dtype
    wide = ntl.int32

    mag_a = ntl.abs(ntl.cast(input, wide))
    mag_b = ntl.abs(ntl.cast(other, wide))
    either = mag_a | mag_b

    # `ffs(x) - 1` is used as the index of the lowest set bit, so the operand
    # is replaced by 1 wherever it would be zero. Applied to the OR of both
    # magnitudes it yields the shift both inputs have in common.
    shared_shift = ntl.cast(
        libdevice.ffs(ntl.where(either == 0, 1, either)) - 1, wide
    )
    reduced_a = mag_a >> shared_shift
    reduced_b = mag_b >> shared_shift

    # Seed the loop: `odd` carries the running candidate with its trailing
    # zero bits removed, `rest` the other operand. When the first input is
    # zero the second takes its place and `rest` starts empty.
    a_present = reduced_a != 0
    a_trailing = ntl.cast(
        libdevice.ffs(ntl.where(a_present, reduced_a, 1)) - 1, wide
    )
    odd = ntl.where(a_present, reduced_a >> a_trailing, reduced_b)
    rest = ntl.where(a_present, reduced_b, ntl.cast(0, wide))
    odd = ntl.where(odd == 0, ntl.cast(1, wide), odd)

    for _ in range(16):
        # Once `rest` reaches zero, feed `odd` back in as a sentinel so the
        # step leaves `odd` unchanged and `rest` at zero.
        operand = ntl.where(rest != 0, rest, odd)
        operand_odd = operand >> ntl.cast(libdevice.ffs(operand) - 1, wide)
        gap = operand_odd - odd
        odd = ntl.minimum(odd, operand_odd)
        rest = ntl.abs(gap)

    # Restore the shared shift, then lcm = |a| // gcd * |b| (0 when both are 0).
    gcd = odd << shared_shift
    divisor = ntl.where(gcd == 0, 1, gcd)
    product = ntl.abs((mag_a // divisor) * mag_b)
    output = ntl.cast(ntl.where(either == 0, 0, product), out_dtype)  # noqa: F841


def application_36(input, other, output):
    # lcm built on a binary (Stein) GCD with a fixed 36-step loop. All
    # arithmetic runs in int32; the result is cast to the dtype of `output`.
    out_dtype = output.dtype
    wide = ntl.int32

    mag_a = ntl.abs(ntl.cast(input, wide))
    mag_b = ntl.abs(ntl.cast(other, wide))
    either = mag_a | mag_b

    # `ffs(x) - 1` is used as the index of the lowest set bit, so the operand
    # is replaced by 1 wherever it would be zero. Applied to the OR of both
    # magnitudes it yields the shift both inputs have in common.
    shared_shift = ntl.cast(
        libdevice.ffs(ntl.where(either == 0, 1, either)) - 1, wide
    )
    reduced_a = mag_a >> shared_shift
    reduced_b = mag_b >> shared_shift

    # Seed the loop: `odd` carries the running candidate with its trailing
    # zero bits removed, `rest` the other operand. When the first input is
    # zero the second takes its place and `rest` starts empty.
    a_present = reduced_a != 0
    a_trailing = ntl.cast(
        libdevice.ffs(ntl.where(a_present, reduced_a, 1)) - 1, wide
    )
    odd = ntl.where(a_present, reduced_a >> a_trailing, reduced_b)
    rest = ntl.where(a_present, reduced_b, ntl.cast(0, wide))
    odd = ntl.where(odd == 0, ntl.cast(1, wide), odd)

    for _ in range(36):
        # Once `rest` reaches zero, feed `odd` back in as a sentinel so the
        # step leaves `odd` unchanged and `rest` at zero.
        operand = ntl.where(rest != 0, rest, odd)
        operand_odd = operand >> ntl.cast(libdevice.ffs(operand) - 1, wide)
        gap = operand_odd - odd
        odd = ntl.minimum(odd, operand_odd)
        rest = ntl.abs(gap)

    # Restore the shared shift, then lcm = |a| // gcd * |b| (0 when both are 0).
    gcd = odd << shared_shift
    divisor = ntl.where(gcd == 0, 1, gcd)
    product = ntl.abs((mag_a // divisor) * mag_b)
    output = ntl.cast(ntl.where(either == 0, 0, product), out_dtype)  # noqa: F841


def application_euclidean_dyn(input, other, output):
    # lcm built on a Euclidean GCD with a dynamic early exit.
    # Remainder steps run in groups of 8. Before each group the loop tests
    # `ntl.max` of the pending remainders and stops once it is zero; at most
    # 12 groups (96 steps) are executed.
    out_dtype = output.dtype
    mag_a = ntl.abs(input)
    mag_b = ntl.abs(other)
    either = mag_a | mag_b

    # Order the pair so the larger magnitude is the first dividend.
    a_is_larger = mag_a >= mag_b
    dividend = ntl.where(a_is_larger, mag_a, mag_b)
    pending = ntl.where(a_is_larger, mag_b, mag_a)

    groups_done = 0
    while ntl.max(pending) != 0 and groups_done < 12:
        for _ in range(8):
            live = pending != 0
            # Finished elements divide by 1 (remainder 0) and keep their
            # current `dividend`, so extra steps do not disturb them.
            remainder = dividend % ntl.where(live, pending, ntl.cast(1, out_dtype))
            dividend = ntl.where(live, pending, dividend)
            pending = remainder
        groups_done += 1

    # `dividend` now holds the gcd; lcm = |a| // gcd * |b| (0 when both are 0).
    divisor = ntl.where(dividend == 0, 1, dividend)
    product = ntl.abs((mag_a // divisor) * mag_b)
    output = ntl.cast(ntl.where(either == 0, 0, product), out_dtype)  # noqa: F841


def premake(ndim, dtype=None, block_size=None):
    """Return the ``(arrangement, application, tensors)`` triple for lcm.

    The application is chosen from ``dtype``: int64 uses the dynamic
    Euclidean variant, int32 the 36-step Stein variant, int16 the 16-step
    one, and every other value (including the default ``None``) the 8-step
    one. ``tensors`` holds three ``Tensor(ndim, dtype=dtype)`` descriptors,
    and the arrangement is the element-wise ``arrangement`` with
    ``block_size`` bound.
    """
    # Start from the fallback and override it for the wider dtypes.
    application = application_8
    if dtype == ninetoothed.int64:
        application = application_euclidean_dyn
    elif dtype == ninetoothed.int32:
        application = application_36
    elif dtype == ninetoothed.int16:
        application = application_16

    # One descriptor per application parameter: input, other, output.
    tensors = tuple(Tensor(ndim, dtype=dtype) for _ in range(3))

    return functools.partial(arrangement, block_size=block_size), application, tensors
Loading