diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 2d522f33e28..38083487e8d 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -243,6 +243,12 @@ install( if(BUILD_TESTING) include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake) + et_cxx_test( + test_cuda_allocator SOURCES runtime/test/test_cuda_allocator.cpp + EXTRA_LIBS aoti_cuda_backend + ) + target_compile_definitions(test_cuda_allocator PRIVATE CUDA_AVAILABLE=1) + et_cxx_test( test_cuda_mutable_state SOURCES runtime/test/test_cuda_mutable_state.cpp EXTRA_LIBS aoti_cuda_backend diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index 348f8a6ba78..122560e98ec 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -92,6 +92,7 @@ runtime.cxx_library( "//executorch/runtime/core:device_allocator", ], deps = [ + "//executorch/extension/cuda:caller_stream", "//executorch/runtime/platform:platform", ], nvcc_flags = get_nvcc_arch_args() + [ @@ -163,3 +164,20 @@ cpp_unittest( platform = "gpu-remote-execution", ), ) + +cpp_unittest( + name = "test_cuda_allocator", + srcs = ["test/test_cuda_allocator.cpp"], + deps = [ + ":cuda_allocator", + "//executorch/extension/cuda:caller_stream", + "//executorch/runtime/core:core", + "//executorch/runtime/platform:platform", + ], + external_deps = [("cuda", None, "cuda-lazy")], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], + keep_gpu_sections = True, + remote_execution = re_test_utils.remote_execution( + platform = "gpu-remote-execution", + ), +) diff --git a/backends/cuda/runtime/cuda_allocator.cpp b/backends/cuda/runtime/cuda_allocator.cpp index 94294b08fa0..e979f943ee5 100644 --- a/backends/cuda/runtime/cuda_allocator.cpp +++ b/backends/cuda/runtime/cuda_allocator.cpp @@ -10,6 +10,7 @@ #include +#include #include namespace executorch::backends::cuda { @@ -124,12 +125,30 @@ void CudaAllocator::deallocate(void* ptr, DeviceIndex index) { } } -// TODO(gasoonjia): Add support for async copy Error CudaAllocator::copy_host_to_device( void* dst, const void* src, size_t nbytes, DeviceIndex index) { + if (nbytes == 0) { + return Error::Ok; + } + ET_CHECK_OR_RETURN_ERROR( + dst != nullptr, + InvalidArgument, + "CudaAllocator::copy_host_to_device dst is null"); + ET_CHECK_OR_RETURN_ERROR( + src != nullptr, + InvalidArgument, + "CudaAllocator::copy_host_to_device src is null"); + // TODO: validate caller stream device matches index. + // For now assert single-GPU case. + ET_CHECK_OR_RETURN_ERROR( + index == -1 || index == 0, + InvalidArgument, + "CudaAllocator::copy_host_to_device only supports device 0, got %d", + static_cast(index)); + int prev_device = 0; cudaError_t prev_device_err = cudaSuccess; @@ -139,8 +158,14 @@ Error CudaAllocator::copy_host_to_device( cudaSetDevice(index); } } - - cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice); + cudaError_t err = cudaSuccess; + const auto caller_stream = executorch::extension::cuda::getCallerStream(); + if (caller_stream) { + err = cudaMemcpyAsync( + dst, src, nbytes, cudaMemcpyHostToDevice, *caller_stream); + } else { + err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice); + } if (index >= 0 && prev_device_err == cudaSuccess) { cudaSetDevice(prev_device); @@ -158,12 +183,30 @@ Error CudaAllocator::copy_host_to_device( return Error::Ok; } -// TODO(gasoonjia): Add support for async copy Error CudaAllocator::copy_device_to_host( void* dst, const void* src, size_t nbytes, DeviceIndex index) { + if (nbytes == 0) { + return Error::Ok; + } + ET_CHECK_OR_RETURN_ERROR( + dst != nullptr, + InvalidArgument, + "CudaAllocator::copy_device_to_host dst is null"); + ET_CHECK_OR_RETURN_ERROR( + src != nullptr, + InvalidArgument, + "CudaAllocator::copy_device_to_host src is null"); + // TODO: validate caller stream device matches index. + // For now assert single-GPU case. + ET_CHECK_OR_RETURN_ERROR( + index == -1 || index == 0, + InvalidArgument, + "CudaAllocator::copy_device_to_host only supports device 0, got %d", + static_cast(index)); + int prev_device = 0; cudaError_t prev_device_err = cudaSuccess; @@ -173,8 +216,17 @@ Error CudaAllocator::copy_device_to_host( cudaSetDevice(index); } } - - cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost); + cudaError_t err = cudaSuccess; + const auto caller_stream = executorch::extension::cuda::getCallerStream(); + if (caller_stream) { + err = cudaMemcpyAsync( + dst, src, nbytes, cudaMemcpyDeviceToHost, *caller_stream); + if (err == cudaSuccess) { + err = cudaStreamSynchronize(*caller_stream); + } + } else { + err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost); + } if (index >= 0 && prev_device_err == cudaSuccess) { cudaSetDevice(prev_device); diff --git a/backends/cuda/runtime/test/test_cuda_allocator.cpp b/backends/cuda/runtime/test/test_cuda_allocator.cpp new file mode 100644 index 00000000000..998430ce5bf --- /dev/null +++ b/backends/cuda/runtime/test/test_cuda_allocator.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include +#include + +#include +#include +#include +#include + +using executorch::backends::cuda::CudaAllocator; +using executorch::runtime::Error; + +namespace { +bool cuda_device_available() { + int device_count = 0; + const cudaError_t err = cudaGetDeviceCount(&device_count); + return err == cudaSuccess && device_count > 0; +} +} // namespace + +TEST(CudaAllocatorTest, CopyHostToDevice) { + if (!cuda_device_available()) { + GTEST_SKIP() << "CUDA device unavailable"; + } + et_pal_init(); + CudaAllocator& a = CudaAllocator::instance(); + constexpr size_t N = 1024; + auto res = a.allocate(N, 0); + ASSERT_TRUE(res.ok()); + void* dptr = res.get(); + + std::vector h_src(N, 42); + EXPECT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok); + + a.deallocate(dptr, 0); +} + +TEST(CudaAllocatorTest, CopyDeviceToHost) { + if (!cuda_device_available()) { + GTEST_SKIP() << "CUDA device unavailable"; + } + et_pal_init(); + CudaAllocator& a = CudaAllocator::instance(); + constexpr size_t N = 1024; + auto res = a.allocate(N, 0); + ASSERT_TRUE(res.ok()); + void* dptr = res.get(); + + std::vector h_src(N, 42), h_dst(N, 0); + ASSERT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok); + EXPECT_EQ(a.copy_device_to_host(h_dst.data(), dptr, N, 0), Error::Ok); + EXPECT_EQ(h_src, h_dst); + + a.deallocate(dptr, 0); +} + +TEST(CudaAllocatorTest, CopyHostToDeviceWithCallerStream) { + if (!cuda_device_available()) { + GTEST_SKIP() << "CUDA device unavailable"; + } + et_pal_init(); + int device = 0; + ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); + ASSERT_EQ(device, 0) << "test assumes single GPU device 0"; + // TODO: validate caller stream device matches index once CallerStreamGuard + // exposes device. For now assert single-GPU case. + cudaStream_t s; + ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess); + executorch::extension::cuda::CallerStreamGuard g(s); + + CudaAllocator& a = CudaAllocator::instance(); + auto res = a.allocate(256, 0); + ASSERT_TRUE(res.ok()); + void* d = res.get(); + std::vector h(256, 7); + // should take async branch internally, still return Ok + EXPECT_EQ(a.copy_host_to_device(d, h.data(), 256, 0), Error::Ok); + a.deallocate(d, 0); + cudaStreamDestroy(s); +} + +TEST(CudaAllocatorTest, CopyDeviceToHostWithCallerStream) { + if (!cuda_device_available()) { + GTEST_SKIP() << "CUDA device unavailable"; + } + et_pal_init(); + int device = 0; + ASSERT_EQ(cudaGetDevice(&device), cudaSuccess); + ASSERT_EQ(device, 0) << "test assumes single GPU device 0"; + // TODO: validate caller stream device matches index once CallerStreamGuard + // exposes device. For now assert single-GPU case. + cudaStream_t s; + ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess); + executorch::extension::cuda::CallerStreamGuard g(s); + + CudaAllocator& a = CudaAllocator::instance(); + auto res = a.allocate(256, 0); + ASSERT_TRUE(res.ok()); + void* d = res.get(); + std::vector h_src(256, 5), h_dst(256, 0); + ASSERT_EQ(a.copy_host_to_device(d, h_src.data(), 256, 0), Error::Ok); + EXPECT_EQ(a.copy_device_to_host(h_dst.data(), d, 256, 0), Error::Ok); + EXPECT_EQ(h_src, h_dst); + + a.deallocate(d, 0); + cudaStreamDestroy(s); +} + +TEST(CudaAllocatorTest, CopyHostToDeviceNullDstReturnsInvalidArgument) { + if (!cuda_device_available()) { + GTEST_SKIP() << "CUDA device unavailable"; + } + et_pal_init(); + CudaAllocator& a = CudaAllocator::instance(); + // null dst should fail gracefully not CHECK abort + std::vector h(8, 1); + Error e = a.copy_host_to_device(nullptr, h.data(), 8, 0); + EXPECT_EQ(e, Error::InvalidArgument) + << "expected InvalidArgument for null dst, got " + << static_cast(e); +} + +TEST(CudaAllocatorTest, CopyHostToDeviceNullSrcReturnsInvalidArgument) { + if (!cuda_device_available()) { + GTEST_SKIP() << "CUDA device unavailable"; + } + et_pal_init(); + CudaAllocator& a = CudaAllocator::instance(); + void* dummy_dst = reinterpret_cast(0x1); + Error e = a.copy_host_to_device(dummy_dst, nullptr, 8, 0); + EXPECT_EQ(e, Error::InvalidArgument) + << "expected InvalidArgument for null src, got " + << static_cast(e); +} + +TEST(CudaAllocatorTest, CopyDeviceToHostNullDstReturnsInvalidArgument) { + if (!cuda_device_available()) { + GTEST_SKIP() << "CUDA device unavailable"; + } + et_pal_init(); + CudaAllocator& a = CudaAllocator::instance(); + void* dummy_src = reinterpret_cast(0x1); + Error e = a.copy_device_to_host(nullptr, dummy_src, 8, 0); + EXPECT_EQ(e, Error::InvalidArgument) + << "expected InvalidArgument for null dst, got " + << static_cast(e); +} + +TEST(CudaAllocatorTest, CopyDeviceToHostNullSrcReturnsInvalidArgument) { + if (!cuda_device_available()) { + GTEST_SKIP() << "CUDA device unavailable"; + } + et_pal_init(); + CudaAllocator& a = CudaAllocator::instance(); + std::vector h(8, 1); + // null src should fail gracefully not CHECK abort + Error e = a.copy_device_to_host(h.data(), nullptr, 8, 0); + EXPECT_EQ(e, Error::InvalidArgument) + << "expected InvalidArgument for null src, got " + << static_cast(e); +}