diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index 8c9adc64c..0500fa389 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -76,6 +76,7 @@ from infinicore.ops.broadcast_to import broadcast_to from infinicore.ops.cat import cat from infinicore.ops.cdist import cdist +from infinicore.ops.copysign import copysign from infinicore.ops.cross_entropy import cross_entropy from infinicore.ops.diff import diff from infinicore.ops.digamma import digamma @@ -95,7 +96,9 @@ from infinicore.ops.kthvalue import kthvalue from infinicore.ops.kv_caching import kv_caching from infinicore.ops.ldexp import ldexp +from infinicore.ops.lcm import lcm from infinicore.ops.lerp import lerp +from infinicore.ops.lgamma import lgamma from infinicore.ops.logaddexp import logaddexp from infinicore.ops.logaddexp2 import logaddexp2 from infinicore.ops.logcumsumexp import logcumsumexp @@ -108,10 +111,12 @@ from infinicore.ops.mha_varlen import mha_varlen from infinicore.ops.mul import mul from infinicore.ops.narrow import narrow +from infinicore.ops.nextafter import nextafter from infinicore.ops.nrm2 import nrm2 from infinicore.ops.paged_attention import paged_attention from infinicore.ops.paged_attention_prefill import paged_attention_prefill from infinicore.ops.paged_caching import paged_caching +from infinicore.ops.rad2deg import rad2deg from infinicore.ops.rearrange import rearrange from infinicore.ops.reciprocal import reciprocal from infinicore.ops.rot import rot @@ -217,6 +222,7 @@ "bilinear", "fmod", "cat", + "copysign", "inner", "masked_select", "logaddexp", @@ -231,7 +237,9 @@ "narrow", "nrm2", "ldexp", + "lcm", "lerp", + "lgamma", "kthvalue", "squeeze", "unsqueeze", @@ -246,6 +254,7 @@ "from_torch", "mha_kvcache", "mha_varlen", + "nextafter", "fmin", "floor_divide", "float_power", @@ -261,6 +270,7 @@ "logical_and", "vander", "paged_caching", + "rad2deg", "paged_attention", "paged_attention_prefill", "hypot", diff --git a/python/infinicore/ops/copysign.py b/python/infinicore/ops/copysign.py new file mode 100644 index 000000000..093a92c47 --- /dev/null +++ b/python/infinicore/ops/copysign.py @@ -0,0 +1,9 @@ +import infinicore +from infinicore.tensor import Tensor + + +def copysign(input: Tensor, other: Tensor, *, out=None) -> Tensor: + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.copysign(input, other, out=out) + + raise NotImplementedError("copysign is only implemented through the ntops GPU path") diff --git a/python/infinicore/ops/lcm.py b/python/infinicore/ops/lcm.py new file mode 100644 index 000000000..b477eb617 --- /dev/null +++ b/python/infinicore/ops/lcm.py @@ -0,0 +1,9 @@ +import infinicore +from infinicore.tensor import Tensor + + +def lcm(input: Tensor, other: Tensor, *, out=None) -> Tensor: + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.lcm(input, other, out=out) + + raise NotImplementedError("lcm is only implemented through the ntops GPU path") diff --git a/python/infinicore/ops/lgamma.py b/python/infinicore/ops/lgamma.py new file mode 100644 index 000000000..28eae1bbf --- /dev/null +++ b/python/infinicore/ops/lgamma.py @@ -0,0 +1,9 @@ +import infinicore +from infinicore.tensor import Tensor + + +def lgamma(input: Tensor, *, out=None) -> Tensor: + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.lgamma(input, out=out) + + raise NotImplementedError("lgamma is only implemented through the ntops GPU path") diff --git a/python/infinicore/ops/nextafter.py b/python/infinicore/ops/nextafter.py new file mode 100644 index 000000000..808fd86d1 --- /dev/null +++ b/python/infinicore/ops/nextafter.py @@ -0,0 +1,9 @@ +import infinicore +from infinicore.tensor import Tensor + + +def nextafter(input: Tensor, other: Tensor, *, out=None) -> Tensor: + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.nextafter(input, other, out=out) + + raise NotImplementedError("nextafter is only implemented through the ntops GPU path") diff --git a/python/infinicore/ops/rad2deg.py b/python/infinicore/ops/rad2deg.py new file mode 100644 index 000000000..8f4bdac96 --- /dev/null +++ b/python/infinicore/ops/rad2deg.py @@ -0,0 +1,9 @@ +import infinicore +from infinicore.tensor import Tensor + + +def rad2deg(input: Tensor, *, out=None) -> Tensor: + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.rad2deg(input, out=out) + + raise NotImplementedError("rad2deg is only implemented through the ntops GPU path") diff --git a/test/infinicore/ops/copysign.py b/test/infinicore/ops/copysign.py new file mode 100644 index 000000000..5c626f990 --- /dev/null +++ b/test/infinicore/ops/copysign.py @@ -0,0 +1,76 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import infinicore +import torch +from framework import BaseOperatorTest, GenericTestRunner, TensorSpec, TestCase + +_TEST_CASES_DATA = [ + ((13, 4), None), + ((2, 3, 4), None), + ((16, 5632), None), +] + +_TOLERANCE_MAP = { + infinicore.float32: {"atol": 0, "rtol": 0}, +} + +_TENSOR_DTYPES = [infinicore.float32] + + +def parse_test_cases(): + test_cases = [] + for shape, strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + input_spec = TensorSpec.from_tensor(shape, strides, dtype, name="input") + other_spec = TensorSpec.from_tensor(shape, strides, dtype, name="other") + out_spec = TensorSpec.from_tensor(shape, None, dtype, name="out") + tolerance = _TOLERANCE_MAP[dtype] + + test_cases.append( + TestCase( + inputs=[input_spec, other_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tolerance, + description="copysign - OUT_OF_PLACE", + ) + ) + test_cases.append( + TestCase( + inputs=[input_spec, other_spec], + kwargs={}, + output_spec=out_spec, + comparison_target="out", + tolerance=tolerance, + description="copysign - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + def __init__(self): + super().__init__("Copysign") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch.copysign(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.copysign(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/lcm.py b/test/infinicore/ops/lcm.py new file mode 100644 index 000000000..26d2311e1 --- /dev/null +++ b/test/infinicore/ops/lcm.py @@ -0,0 +1,98 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import infinicore +import torch +from framework import ( + BaseOperatorTest, + GenericTestRunner, + TensorInitializer, + TensorSpec, + TestCase, +) + +_TEST_CASES_DATA = [ + ((13, 4), None), + ((2, 3, 4), None), + ((16, 5632), None), +] + +_TOLERANCE_MAP = { + infinicore.int32: {"atol": 0, "rtol": 0}, +} + +_TENSOR_DTYPES = [infinicore.int32] + + +def parse_test_cases(): + test_cases = [] + for shape, strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + input_spec = TensorSpec.from_tensor( + shape, + strides, + dtype, + init_mode=TensorInitializer.RANDINT, + low=-100, + high=100, + name="input", + ) + other_spec = TensorSpec.from_tensor( + shape, + strides, + dtype, + init_mode=TensorInitializer.RANDINT, + low=-100, + high=100, + name="other", + ) + out_spec = TensorSpec.from_tensor(shape, None, dtype, name="out") + tolerance = _TOLERANCE_MAP[dtype] + + test_cases.append( + TestCase( + inputs=[input_spec, other_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tolerance, + description="lcm - OUT_OF_PLACE", + ) + ) + test_cases.append( + TestCase( + inputs=[input_spec, other_spec], + kwargs={}, + output_spec=out_spec, + comparison_target="out", + tolerance=tolerance, + description="lcm - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + def __init__(self): + super().__init__("Lcm") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch.lcm(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.lcm(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/lgamma.py b/test/infinicore/ops/lgamma.py new file mode 100644 index 000000000..1424c4831 --- /dev/null +++ b/test/infinicore/ops/lgamma.py @@ -0,0 +1,75 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import infinicore +import torch +from framework import BaseOperatorTest, GenericTestRunner, TensorSpec, TestCase + +_TEST_CASES_DATA = [ + ((13, 4), None), + ((2, 3, 4), None), + ((16, 5632), None), +] + +_TOLERANCE_MAP = { + infinicore.float32: {"atol": 1e-4, "rtol": 1e-4}, +} + +_TENSOR_DTYPES = [infinicore.float32] + + +def parse_test_cases(): + test_cases = [] + for shape, strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + input_spec = TensorSpec.from_tensor(shape, strides, dtype, name="input") + out_spec = TensorSpec.from_tensor(shape, None, dtype, name="out") + tolerance = _TOLERANCE_MAP[dtype] + + test_cases.append( + TestCase( + inputs=[input_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tolerance, + description="lgamma - OUT_OF_PLACE", + ) + ) + test_cases.append( + TestCase( + inputs=[input_spec], + kwargs={}, + output_spec=out_spec, + comparison_target="out", + tolerance=tolerance, + description="lgamma - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + def __init__(self): + super().__init__("Lgamma") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch.lgamma(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.lgamma(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/nextafter.py b/test/infinicore/ops/nextafter.py new file mode 100644 index 000000000..3c2237a53 --- /dev/null +++ b/test/infinicore/ops/nextafter.py @@ -0,0 +1,76 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import infinicore +import torch +from framework import BaseOperatorTest, GenericTestRunner, TensorSpec, TestCase + +_TEST_CASES_DATA = [ + ((13, 4), None), + ((2, 3, 4), None), + ((16, 5632), None), +] + +_TOLERANCE_MAP = { + infinicore.float32: {"atol": 0, "rtol": 0}, +} + +_TENSOR_DTYPES = [infinicore.float32] + + +def parse_test_cases(): + test_cases = [] + for shape, strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + input_spec = TensorSpec.from_tensor(shape, strides, dtype, name="input") + other_spec = TensorSpec.from_tensor(shape, strides, dtype, name="other") + out_spec = TensorSpec.from_tensor(shape, None, dtype, name="out") + tolerance = _TOLERANCE_MAP[dtype] + + test_cases.append( + TestCase( + inputs=[input_spec, other_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tolerance, + description="nextafter - OUT_OF_PLACE", + ) + ) + test_cases.append( + TestCase( + inputs=[input_spec, other_spec], + kwargs={}, + output_spec=out_spec, + comparison_target="out", + tolerance=tolerance, + description="nextafter - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + def __init__(self): + super().__init__("Nextafter") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch.nextafter(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.nextafter(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/rad2deg.py b/test/infinicore/ops/rad2deg.py new file mode 100644 index 000000000..6b8f9186f --- /dev/null +++ b/test/infinicore/ops/rad2deg.py @@ -0,0 +1,75 @@ +import os +import sys + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import infinicore +import torch +from framework import BaseOperatorTest, GenericTestRunner, TensorSpec, TestCase + +_TEST_CASES_DATA = [ + ((13, 4), None), + ((2, 3, 4), None), + ((16, 5632), None), +] + +_TOLERANCE_MAP = { + infinicore.float32: {"atol": 1e-4, "rtol": 1e-4}, +} + +_TENSOR_DTYPES = [infinicore.float32] + + +def parse_test_cases(): + test_cases = [] + for shape, strides in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + input_spec = TensorSpec.from_tensor(shape, strides, dtype, name="input") + out_spec = TensorSpec.from_tensor(shape, None, dtype, name="out") + tolerance = _TOLERANCE_MAP[dtype] + + test_cases.append( + TestCase( + inputs=[input_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tolerance, + description="rad2deg - OUT_OF_PLACE", + ) + ) + test_cases.append( + TestCase( + inputs=[input_spec], + kwargs={}, + output_spec=out_spec, + comparison_target="out", + tolerance=tolerance, + description="rad2deg - INPLACE(out)", + ) + ) + + return test_cases + + +class OpTest(BaseOperatorTest): + def __init__(self): + super().__init__("Rad2deg") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + return torch.rad2deg(*args, **kwargs) + + def infinicore_operator(self, *args, **kwargs): + return infinicore.rad2deg(*args, **kwargs) + + +def main(): + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/xmake.lua b/xmake.lua index ccae79cd2..ab2fe4a7c 100644 --- a/xmake.lua +++ b/xmake.lua @@ -1,5 +1,4 @@ add_rules("mode.debug", "mode.release") -add_requires("boost", {configs = {stacktrace = true}}) add_requires("pybind11") -- Define color codes @@ -70,7 +69,7 @@ end option("cuda_arch") set_showmenu(true) set_description("Set CUDA GPU architecture (e.g. sm_90)") - set_values("sm_50", "sm_60", "sm_70", "sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_90a") + set_values("sm_50", "sm_60", "sm_70", "sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_90a", "sm_120") set_category("option") option_end() diff --git a/xmake/iluvatar.lua b/xmake/iluvatar.lua index 34a913a11..d4f5b94ac 100644 --- a/xmake/iluvatar.lua +++ b/xmake/iluvatar.lua @@ -1,4 +1,10 @@ local iluvatar_arch = get_config("iluvatar_arch") or "ivcore20" +local iluvatar_warning_flags = { + "-Wno-error=unused-private-field", + "-Wno-error=unused-variable", + "-Wno-unused-variable", + "-Wno-error=pass-failed", +} toolchain("iluvatar.toolchain") set_toolset("cc" , "clang" ) @@ -44,12 +50,12 @@ target("infiniop-iluvatar") add_links("cudart", "cublas", "cudnn") set_warnings("all", "error") - add_cuflags("-Wno-error=unused-private-field", "-Wno-error=unused-variable", "-Wno-unused-variable") + add_cuflags(iluvatar_warning_flags) add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true}) add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true}) add_culdflags("-fPIC") - add_cxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable") - add_cxxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable") + add_cxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable", "-Wno-error=pass-failed") + add_cxxflags("-fPIC", "-Wno-error=unused-variable", "-Wno-unused-variable", "-Wno-error=pass-failed") -- set_languages("cxx17") 天数似乎不能用这个配置 add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu") @@ -76,6 +82,7 @@ target("infinirt-iluvatar") add_links("cudart") set_warnings("all", "error") + add_cuflags(iluvatar_warning_flags) add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true}) add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true}) add_culdflags("-fPIC") @@ -99,6 +106,7 @@ target("infiniccl-iluvatar") add_links("cudart") set_warnings("all", "error") + add_cuflags(iluvatar_warning_flags) add_cuflags("-fPIC", "-x", "ivcore", "-std=c++17", {force = true}) add_cuflags("--cuda-gpu-arch=" .. iluvatar_arch, {force = true}) add_culdflags("-fPIC") diff --git a/xmake/nvidia.lua b/xmake/nvidia.lua index f0d273d77..331e5fb89 100644 --- a/xmake/nvidia.lua +++ b/xmake/nvidia.lua @@ -51,6 +51,8 @@ target("infiniop-nvidia") -- H100 (sm_90a): use sm_90a for cutlass 3.x if sm == 90 then target:add("cuflags", "-gencode=arch=compute_90a,code=sm_90a") + elseif sm == 120 then + table.insert(archs, "sm_120") elseif sm > 90 then table.insert(archs, "sm_90") end