diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 8c9adc64c..0500fa389 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -76,6 +76,7 @@
 from infinicore.ops.broadcast_to import broadcast_to
 from infinicore.ops.cat import cat
 from infinicore.ops.cdist import cdist
+from infinicore.ops.copysign import copysign
 from infinicore.ops.cross_entropy import cross_entropy
 from infinicore.ops.diff import diff
 from infinicore.ops.digamma import digamma
@@ -95,7 +96,9 @@
 from infinicore.ops.kthvalue import kthvalue
 from infinicore.ops.kv_caching import kv_caching
 from infinicore.ops.ldexp import ldexp
+from infinicore.ops.lcm import lcm
 from infinicore.ops.lerp import lerp
+from infinicore.ops.lgamma import lgamma
 from infinicore.ops.logaddexp import logaddexp
 from infinicore.ops.logaddexp2 import logaddexp2
 from infinicore.ops.logcumsumexp import logcumsumexp
@@ -108,10 +111,12 @@
 from infinicore.ops.mha_varlen import mha_varlen
 from infinicore.ops.mul import mul
 from infinicore.ops.narrow import narrow
+from infinicore.ops.nextafter import nextafter
 from infinicore.ops.nrm2 import nrm2
 from infinicore.ops.paged_attention import paged_attention
 from infinicore.ops.paged_attention_prefill import paged_attention_prefill
 from infinicore.ops.paged_caching import paged_caching
+from infinicore.ops.rad2deg import rad2deg
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.reciprocal import reciprocal
 from infinicore.ops.rot import rot
@@ -217,6 +222,7 @@
     "bilinear",
     "fmod",
     "cat",
+    "copysign",
     "inner",
     "masked_select",
     "logaddexp",
@@ -231,7 +237,9 @@
     "narrow",
     "nrm2",
     "ldexp",
+    "lcm",
     "lerp",
+    "lgamma",
     "kthvalue",
     "squeeze",
     "unsqueeze",
@@ -246,6 +254,7 @@
     "from_torch",
     "mha_kvcache",
     "mha_varlen",
+    "nextafter",
     "fmin",
     "floor_divide",
     "float_power",
@@ -261,6 +270,7 @@
     "logical_and",
     "vander",
     "paged_caching",
+    "rad2deg",
     "paged_attention",
     "paged_attention_prefill",
     "hypot",
diff --git a/python/infinicore/ops/copysign.py b/python/infinicore/ops/copysign.py
new file mode 100644
index 000000000..093a92c47
--- /dev/null
+++ b/python/infinicore/ops/copysign.py
@@ -0,0 +1,9 @@
+import infinicore
+from infinicore.tensor import Tensor
+
+
+def copysign(input: Tensor, other: Tensor, *, out=None) -> Tensor:
+    """Dispatch copysign to ntops when enabled and ``input`` is on cuda/musa."""
+    if not (infinicore.use_ntops and input.device.type in ("cuda", "musa")):
+        raise NotImplementedError("copysign is only implemented through the ntops GPU path")
+    return infinicore.ntops.torch.copysign(input, other, out=out)
diff --git a/python/infinicore/ops/lcm.py b/python/infinicore/ops/lcm.py
new file mode 100644
index 000000000..b477eb617
--- /dev/null
+++ b/python/infinicore/ops/lcm.py
@@ -0,0 +1,9 @@
+import infinicore
+from infinicore.tensor import Tensor
+
+
+def lcm(input: Tensor, other: Tensor, *, out=None) -> Tensor:
+    """Dispatch lcm to ntops when enabled and ``input`` is on cuda/musa."""
+    if not (infinicore.use_ntops and input.device.type in ("cuda", "musa")):
+        raise NotImplementedError("lcm is only implemented through the ntops GPU path")
+    return infinicore.ntops.torch.lcm(input, other, out=out)
diff --git a/python/infinicore/ops/lgamma.py b/python/infinicore/ops/lgamma.py
new file mode 100644
index 000000000..28eae1bbf
--- /dev/null
+++ b/python/infinicore/ops/lgamma.py
@@ -0,0 +1,9 @@
+import infinicore
+from infinicore.tensor import Tensor
+
+
+def lgamma(input: Tensor, *, out=None) -> Tensor:
+    """Dispatch lgamma to ntops when enabled and ``input`` is on cuda/musa."""
+    if not (infinicore.use_ntops and input.device.type in ("cuda", "musa")):
+        raise NotImplementedError("lgamma is only implemented through the ntops GPU path")
+    return infinicore.ntops.torch.lgamma(input, out=out)
diff --git a/python/infinicore/ops/nextafter.py b/python/infinicore/ops/nextafter.py
new file mode 100644
index 000000000..808fd86d1
--- /dev/null
+++ b/python/infinicore/ops/nextafter.py
@@ -0,0 +1,9 @@
+import infinicore
+from infinicore.tensor import Tensor
+
+
+def nextafter(input: Tensor, other: Tensor, *, out=None) -> Tensor:
+    """Dispatch nextafter to ntops when enabled and ``input`` is on cuda/musa."""
+    if not (infinicore.use_ntops and input.device.type in ("cuda", "musa")):
+        raise NotImplementedError("nextafter is only implemented through the ntops GPU path")
+    return infinicore.ntops.torch.nextafter(input, other, out=out)
diff --git a/python/infinicore/ops/rad2deg.py b/python/infinicore/ops/rad2deg.py
new file mode 100644
index 000000000..8f4bdac96
--- /dev/null
+++ b/python/infinicore/ops/rad2deg.py
@@ -0,0 +1,9 @@
+import infinicore
+from infinicore.tensor import Tensor
+
+
+def rad2deg(input: Tensor, *, out=None) -> Tensor:
+    """Dispatch rad2deg to ntops when enabled and ``input`` is on cuda/musa."""
+    if not (infinicore.use_ntops and input.device.type in ("cuda", "musa")):
+        raise NotImplementedError("rad2deg is only implemented through the ntops GPU path")
+    return infinicore.ntops.torch.rad2deg(input, out=out)
diff --git a/test/infinicore/ops/copysign.py b/test/infinicore/ops/copysign.py
new file mode 100644
index 000000000..5c626f990
--- /dev/null
+++ b/test/infinicore/ops/copysign.py
@@ -0,0 +1,76 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import infinicore
+import torch
+from framework import BaseOperatorTest, GenericTestRunner, TensorSpec, TestCase
+
+_TEST_CASES_DATA = [  # (shape, strides) pairs unpacked by parse_test_cases
+    ((13, 4), None),
+    ((2, 3, 4), None),
+    ((16, 5632), None),
+]
+
+_TOLERANCE_MAP = {  # looked up by dtype; the value is passed as TestCase(tolerance=...)
+    infinicore.float32: {"atol": 0, "rtol": 0},  # both tolerances are zero
+}
+
+_TENSOR_DTYPES = [infinicore.float32]  # dtypes iterated by parse_test_cases
+
+
+def parse_test_cases():
+    """Build the copysign test matrix.
+
+    Every shape/dtype pair yields an out-of-place case and an ``out=`` case.
+    """
+    cases = []
+    for dims, strides in _TEST_CASES_DATA:
+        for dtype in _TENSOR_DTYPES:
+            operands = [
+                TensorSpec.from_tensor(dims, strides, dtype, name="input"),
+                TensorSpec.from_tensor(dims, strides, dtype, name="other"),
+            ]
+            # The out spec passes None for strides regardless of the inputs.
+            out_spec = TensorSpec.from_tensor(dims, None, dtype, name="out")
+            variants = (
+                (None, None, "copysign - OUT_OF_PLACE"),
+                (out_spec, "out", "copysign - INPLACE(out)"),
+            )
+            for output_spec, target, label in variants:
+                cases.append(
+                    TestCase(
+                        inputs=list(operands),
+                        kwargs={},
+                        output_spec=output_spec,
+                        comparison_target=target,
+                        tolerance=_TOLERANCE_MAP[dtype],
+                        description=label,
+                    )
+                )
+
+    return cases
+
+
+class OpTest(BaseOperatorTest):  # pairs torch.copysign with infinicore.copysign
+    def __init__(self):
+        super().__init__("Copysign")  # string given to the base constructor; presumably a display name
+
+    def get_test_cases(self):
+        return parse_test_cases()  # the case list is rebuilt on every call
+
+    def torch_operator(self, *args, **kwargs):
+        return torch.copysign(*args, **kwargs)  # forwards all arguments unchanged
+
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.copysign(*args, **kwargs)  # forwards all arguments unchanged
+
+
+def main():
+    """Run OpTest through GenericTestRunner.run_and_exit()."""
+    GenericTestRunner(OpTest).run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/lcm.py b/test/infinicore/ops/lcm.py
new file mode 100644
index 000000000..26d2311e1
--- /dev/null
+++ b/test/infinicore/ops/lcm.py
@@ -0,0 +1,98 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import infinicore
+import torch
+from framework import (
+    BaseOperatorTest,
+    GenericTestRunner,
+    TensorInitializer,
+    TensorSpec,
+    TestCase,
+)
+
+_TEST_CASES_DATA = [  # (shape, strides) pairs unpacked by parse_test_cases
+    ((13, 4), None),
+    ((2, 3, 4), None),
+    ((16, 5632), None),
+]
+
+_TOLERANCE_MAP = {  # looked up by dtype; the value is passed as TestCase(tolerance=...)
+    infinicore.int32: {"atol": 0, "rtol": 0},  # both tolerances are zero
+}
+
+_TENSOR_DTYPES = [infinicore.int32]  # dtypes iterated by parse_test_cases
+
+
+def parse_test_cases():
+    """Build the lcm test matrix.
+
+    Every shape/dtype pair yields an out-of-place case and an ``out=`` case.
+    Both operands use RANDINT initialization with low=-100 and high=100.
+
+    Returns the list of generated TestCase objects.
+    """
+    cases = []
+    for dims, strides in _TEST_CASES_DATA:
+        for dtype in _TENSOR_DTYPES:
+            # Both operands share the same integer initialization settings.
+            operands = [
+                TensorSpec.from_tensor(
+                    dims,
+                    strides,
+                    dtype,
+                    init_mode=TensorInitializer.RANDINT,
+                    low=-100,
+                    high=100,
+                    name=operand_name,
+                )
+                for operand_name in ("input", "other")
+            ]
+
+            # The out spec passes None for strides regardless of the inputs.
+            out_spec = TensorSpec.from_tensor(dims, None, dtype, name="out")
+            tolerance = _TOLERANCE_MAP[dtype]
+
+            # (output_spec, comparison_target, description) per variant.
+            variants = (
+                (None, None, "lcm - OUT_OF_PLACE"),
+                (out_spec, "out", "lcm - INPLACE(out)"),
+            )
+            for output_spec, target, label in variants:
+                cases.append(
+                    TestCase(
+                        inputs=list(operands),
+                        kwargs={},
+                        output_spec=output_spec,
+                        comparison_target=target,
+                        tolerance=tolerance,
+                        description=label,
+                    )
+                )
+
+    return cases
+
+
+class OpTest(BaseOperatorTest):  # pairs torch.lcm with infinicore.lcm
+    def __init__(self):
+        super().__init__("Lcm")  # string given to the base constructor; presumably a display name
+
+    def get_test_cases(self):
+        return parse_test_cases()  # the case list is rebuilt on every call
+
+    def torch_operator(self, *args, **kwargs):
+        return torch.lcm(*args, **kwargs)  # forwards all arguments unchanged
+
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.lcm(*args, **kwargs)  # forwards all arguments unchanged
+
+
+def main():
+    """Run OpTest through GenericTestRunner.run_and_exit()."""
+    GenericTestRunner(OpTest).run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/lgamma.py b/test/infinicore/ops/lgamma.py
new file mode 100644
index 000000000..1424c4831
--- /dev/null
+++ b/test/infinicore/ops/lgamma.py
@@ -0,0 +1,75 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import infinicore
+import torch
+from framework import BaseOperatorTest, GenericTestRunner, TensorSpec, TestCase
+
+_TEST_CASES_DATA = [  # (shape, strides) pairs unpacked by parse_test_cases
+    ((13, 4), None),
+    ((2, 3, 4), None),
+    ((16, 5632), None),
+]
+
+_TOLERANCE_MAP = {  # looked up by dtype; the value is passed as TestCase(tolerance=...)
+    infinicore.float32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+_TENSOR_DTYPES = [infinicore.float32]  # dtypes iterated by parse_test_cases
+
+
+def parse_test_cases():
+    """Build the lgamma test matrix.
+
+    Every shape/dtype pair yields an out-of-place case and an ``out=`` case.
+    """
+    cases = []
+    for dims, strides in _TEST_CASES_DATA:
+        for dtype in _TENSOR_DTYPES:
+            input_spec = TensorSpec.from_tensor(dims, strides, dtype, name="input")
+            # The out spec passes None for strides regardless of the input.
+            out_spec = TensorSpec.from_tensor(dims, None, dtype, name="out")
+            tolerance = _TOLERANCE_MAP[dtype]
+
+            variants = (
+                (None, None, "lgamma - OUT_OF_PLACE"),
+                (out_spec, "out", "lgamma - INPLACE(out)"),
+            )
+            for output_spec, target, label in variants:
+                cases.append(
+                    TestCase(
+                        inputs=[input_spec],
+                        kwargs={},
+                        output_spec=output_spec,
+                        comparison_target=target,
+                        tolerance=tolerance,
+                        description=label,
+                    )
+                )
+
+    return cases
+
+
+class OpTest(BaseOperatorTest):  # pairs torch.lgamma with infinicore.lgamma
+    def __init__(self):
+        super().__init__("Lgamma")  # string given to the base constructor; presumably a display name
+
+    def get_test_cases(self):
+        return parse_test_cases()  # the case list is rebuilt on every call
+
+    def torch_operator(self, *args, **kwargs):
+        return torch.lgamma(*args, **kwargs)  # forwards all arguments unchanged
+
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.lgamma(*args, **kwargs)  # forwards all arguments unchanged
+
+
+def main():
+    """Run OpTest through GenericTestRunner.run_and_exit()."""
+    GenericTestRunner(OpTest).run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/nextafter.py b/test/infinicore/ops/nextafter.py
new file mode 100644
index 000000000..3c2237a53
--- /dev/null
+++ b/test/infinicore/ops/nextafter.py
@@ -0,0 +1,76 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import infinicore
+import torch
+from framework import BaseOperatorTest, GenericTestRunner, TensorSpec, TestCase
+
+_TEST_CASES_DATA = [  # (shape, strides) pairs unpacked by parse_test_cases
+    ((13, 4), None),
+    ((2, 3, 4), None),
+    ((16, 5632), None),
+]
+
+_TOLERANCE_MAP = {  # looked up by dtype; the value is passed as TestCase(tolerance=...)
+    infinicore.float32: {"atol": 0, "rtol": 0},  # both tolerances are zero
+}
+
+_TENSOR_DTYPES = [infinicore.float32]  # dtypes iterated by parse_test_cases
+
+
+def parse_test_cases():
+    """Build the nextafter test matrix.
+
+    Every shape/dtype pair yields an out-of-place case and an ``out=`` case.
+    """
+    cases = []
+    for dims, strides in _TEST_CASES_DATA:
+        for dtype in _TENSOR_DTYPES:
+            operands = [
+                TensorSpec.from_tensor(dims, strides, dtype, name="input"),
+                TensorSpec.from_tensor(dims, strides, dtype, name="other"),
+            ]
+            # The out spec passes None for strides regardless of the inputs.
+            out_spec = TensorSpec.from_tensor(dims, None, dtype, name="out")
+            variants = (
+                (None, None, "nextafter - OUT_OF_PLACE"),
+                (out_spec, "out", "nextafter - INPLACE(out)"),
+            )
+            for output_spec, target, label in variants:
+                cases.append(
+                    TestCase(
+                        inputs=list(operands),
+                        kwargs={},
+                        output_spec=output_spec,
+                        comparison_target=target,
+                        tolerance=_TOLERANCE_MAP[dtype],
+                        description=label,
+                    )
+                )
+
+    return cases
+
+
+class OpTest(BaseOperatorTest):  # pairs torch.nextafter with infinicore.nextafter
+    def __init__(self):
+        super().__init__("Nextafter")  # string given to the base constructor; presumably a display name
+
+    def get_test_cases(self):
+        return parse_test_cases()  # the case list is rebuilt on every call
+
+    def torch_operator(self, *args, **kwargs):
+        return torch.nextafter(*args, **kwargs)  # forwards all arguments unchanged
+
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nextafter(*args, **kwargs)  # forwards all arguments unchanged
+
+
+def main():
+    """Run OpTest through GenericTestRunner.run_and_exit()."""
+    GenericTestRunner(OpTest).run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/infinicore/ops/rad2deg.py b/test/infinicore/ops/rad2deg.py
new file mode 100644
index 000000000..6b8f9186f
--- /dev/null
+++ b/test/infinicore/ops/rad2deg.py
@@ -0,0 +1,75 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+import infinicore
+import torch
+from framework import BaseOperatorTest, GenericTestRunner, TensorSpec, TestCase
+
+_TEST_CASES_DATA = [  # (shape, strides) pairs unpacked by parse_test_cases
+    ((13, 4), None),
+    ((2, 3, 4), None),
+    ((16, 5632), None),
+]
+
+_TOLERANCE_MAP = {  # looked up by dtype; the value is passed as TestCase(tolerance=...)
+    infinicore.float32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+_TENSOR_DTYPES = [infinicore.float32]  # dtypes iterated by parse_test_cases
+
+
+def parse_test_cases():
+    """Build the rad2deg test matrix.
+
+    Every shape/dtype pair yields an out-of-place case and an ``out=`` case.
+    """
+    cases = []
+    for dims, strides in _TEST_CASES_DATA:
+        for dtype in _TENSOR_DTYPES:
+            input_spec = TensorSpec.from_tensor(dims, strides, dtype, name="input")
+            # The out spec passes None for strides regardless of the input.
+            out_spec = TensorSpec.from_tensor(dims, None, dtype, name="out")
+            tolerance = _TOLERANCE_MAP[dtype]
+
+            variants = (
+                (None, None, "rad2deg - OUT_OF_PLACE"),
+                (out_spec, "out", "rad2deg - INPLACE(out)"),
+            )
+            for output_spec, target, label in variants:
+                cases.append(
+                    TestCase(
+                        inputs=[input_spec],
+                        kwargs={},
+                        output_spec=output_spec,
+                        comparison_target=target,
+                        tolerance=tolerance,
+                        description=label,
+                    )
+                )
+
+    return cases
+
+
+class OpTest(BaseOperatorTest):  # pairs torch.rad2deg with infinicore.rad2deg
+    def __init__(self):
+        super().__init__("Rad2deg")  # string given to the base constructor; presumably a display name
+
+    def get_test_cases(self):
+        return parse_test_cases()  # the case list is rebuilt on every call
+
+    def torch_operator(self, *args, **kwargs):
+        return torch.rad2deg(*args, **kwargs)  # forwards all arguments unchanged
+
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.rad2deg(*args, **kwargs)  # forwards all arguments unchanged
+
+
+def main():
+    """Run OpTest through GenericTestRunner.run_and_exit()."""
+    GenericTestRunner(OpTest).run_and_exit()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/xmake.lua b/xmake.lua
index ccae79cd2..ab2fe4a7c 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -1,5 +1,4 @@
 add_rules("mode.debug", "mode.release")
-add_requires("boost", {configs = {stacktrace = true}})
 add_requires("pybind11")
 
 -- Define color codes
@@ -70,7 +69,7 @@ end
 option("cuda_arch")
     set_showmenu(true)
     set_description("Set CUDA GPU architecture (e.g. sm_90)")
-    set_values("sm_50", "sm_60", "sm_70", "sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_90a")
+    set_values("sm_50", "sm_60", "sm_70", "sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_90a", "sm_120")
     set_category("option")
 option_end()
 
diff --git a/xmake/iluvatar.lua b/xmake/iluvatar.lua
index 34a913a11..d4f5b94ac 100644
--- a/xmake/iluvatar.lua
+++ b/xmake/iluvatar.lua
@@ -1,4 +1,10 @@
 local iluvatar_arch = get_config("iluvatar_arch") or "ivcore20"
+local iluvatar_warning_flags = { -- passed to add_cuflags by the infiniop/infinirt/infiniccl iluvatar targets
+    "-Wno-error=unused-private-field",
+    "-Wno-error=unused-variable",
+    "-Wno-unused-variable",
+    "-Wno-error=pass-failed",
+}
 
 toolchain("iluvatar.toolchain")
     set_toolset("cc"  , "clang"  )
@@ -44,12 +50,12 @@ target("infiniop-iluvatar")
     add_links("cudart", "cublas", "cudnn")
 
     set_warnings("all", "error")
-    add_cuflags("-Wno-error=unused-private-field", "-Wno-error=unused-variable", "-Wno-unused-variable")
+    add_cuflags(iluvatar_warning_flags)
     add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
     add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true})
     add_culdflags("-fPIC")
-    add_cxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable")
-    add_cxxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable")
+    add_cxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable", "-Wno-error=pass-failed")
+    add_cxxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable", "-Wno-error=pass-failed")
 
     -- set_languages("cxx17") 天数似乎不能用这个配置
     add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
@@ -76,6 +82,7 @@ target("infinirt-iluvatar")
     add_links("cudart")
 
     set_warnings("all", "error")
+    add_cuflags(iluvatar_warning_flags)
     add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
     add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true})
     add_culdflags("-fPIC")
@@ -99,6 +106,7 @@ target("infiniccl-iluvatar")
         add_links("cudart")
 
         set_warnings("all", "error")
+        add_cuflags(iluvatar_warning_flags)
         add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true})
         add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true})
         add_culdflags("-fPIC")
diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua
index f0d273d77..331e5fb89 100644
--- a/xmake/nvidia.lua
+++ b/xmake/nvidia.lua
@@ -51,6 +51,8 @@ target("infiniop-nvidia")
                     -- H100 (sm_90a): use sm_90a for cutlass 3.x
                     if sm == 90 then
                         target:add("cuflags", "-gencode=arch=compute_90a,code=sm_90a")
+                    elseif sm == 120 then
+                        table.insert(archs, "sm_120")
                     elseif sm > 90 then
                         table.insert(archs, "sm_90")
                     end