Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions backends/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
Expand Down Expand Up @@ -243,6 +243,12 @@
if(BUILD_TESTING)
include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)

et_cxx_test(
test_cuda_allocator SOURCES runtime/test/test_cuda_allocator.cpp
EXTRA_LIBS aoti_cuda_backend
)
target_compile_definitions(test_cuda_allocator PRIVATE CUDA_AVAILABLE=1)

et_cxx_test(
test_cuda_mutable_state SOURCES runtime/test/test_cuda_mutable_state.cpp
EXTRA_LIBS aoti_cuda_backend
Expand Down
18 changes: 18 additions & 0 deletions backends/cuda/runtime/TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ runtime.cxx_library(
"//executorch/runtime/core:device_allocator",
],
deps = [
"//executorch/extension/cuda:caller_stream",
"//executorch/runtime/platform:platform",
],
nvcc_flags = get_nvcc_arch_args() + [
Expand Down Expand Up @@ -163,3 +164,20 @@ cpp_unittest(
platform = "gpu-remote-execution",
),
)

cpp_unittest(
name = "test_cuda_allocator",
srcs = ["test/test_cuda_allocator.cpp"],
deps = [
":cuda_allocator",
"//executorch/extension/cuda:caller_stream",
"//executorch/runtime/core:core",
"//executorch/runtime/platform:platform",
],
external_deps = [("cuda", None, "cuda-lazy")],
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
keep_gpu_sections = True,
remote_execution = re_test_utils.remote_execution(
platform = "gpu-remote-execution",
),
)
64 changes: 58 additions & 6 deletions backends/cuda/runtime/cuda_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <cuda_runtime.h>

#include <executorch/extension/cuda/caller_stream.h>
#include <executorch/runtime/platform/log.h>

namespace executorch::backends::cuda {
Expand Down Expand Up @@ -124,12 +125,30 @@ void CudaAllocator::deallocate(void* ptr, DeviceIndex index) {
}
}

// TODO(gasoonjia): Add support for async copy
Error CudaAllocator::copy_host_to_device(
void* dst,
const void* src,
size_t nbytes,
DeviceIndex index) {
if (nbytes == 0) {
return Error::Ok;
}
ET_CHECK_OR_RETURN_ERROR(
dst != nullptr,
InvalidArgument,
"CudaAllocator::copy_host_to_device dst is null");
ET_CHECK_OR_RETURN_ERROR(
src != nullptr,
InvalidArgument,
"CudaAllocator::copy_host_to_device src is null");
// TODO: validate caller stream device matches index.
// For now assert single-GPU case.
ET_CHECK_OR_RETURN_ERROR(
index == -1 || index == 0,
InvalidArgument,
"CudaAllocator::copy_host_to_device only supports device 0, got %d",
static_cast<int>(index));

int prev_device = 0;
cudaError_t prev_device_err = cudaSuccess;

Expand All @@ -139,8 +158,14 @@ Error CudaAllocator::copy_host_to_device(
cudaSetDevice(index);
}
}

cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
cudaError_t err = cudaSuccess;
const auto caller_stream = executorch::extension::cuda::getCallerStream();
if (caller_stream) {
err = cudaMemcpyAsync(
dst, src, nbytes, cudaMemcpyHostToDevice, *caller_stream);
} else {
err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice);
}

if (index >= 0 && prev_device_err == cudaSuccess) {
cudaSetDevice(prev_device);
Expand All @@ -158,12 +183,30 @@ Error CudaAllocator::copy_host_to_device(
return Error::Ok;
}

// TODO(gasoonjia): Add support for async copy
Error CudaAllocator::copy_device_to_host(
void* dst,
const void* src,
size_t nbytes,
DeviceIndex index) {
if (nbytes == 0) {
return Error::Ok;
}
ET_CHECK_OR_RETURN_ERROR(
dst != nullptr,
InvalidArgument,
"CudaAllocator::copy_device_to_host dst is null");
ET_CHECK_OR_RETURN_ERROR(
src != nullptr,
InvalidArgument,
"CudaAllocator::copy_device_to_host src is null");
// TODO: validate caller stream device matches index.
// For now assert single-GPU case.
ET_CHECK_OR_RETURN_ERROR(
index == -1 || index == 0,
InvalidArgument,
"CudaAllocator::copy_device_to_host only supports device 0, got %d",
static_cast<int>(index));

int prev_device = 0;
cudaError_t prev_device_err = cudaSuccess;

Expand All @@ -173,8 +216,17 @@ Error CudaAllocator::copy_device_to_host(
cudaSetDevice(index);
}
}

cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
cudaError_t err = cudaSuccess;
const auto caller_stream = executorch::extension::cuda::getCallerStream();
if (caller_stream) {
err = cudaMemcpyAsync(
dst, src, nbytes, cudaMemcpyDeviceToHost, *caller_stream);
if (err == cudaSuccess) {
err = cudaStreamSynchronize(*caller_stream);
}
} else {
err = cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost);
}

if (index >= 0 && prev_device_err == cudaSuccess) {
cudaSetDevice(prev_device);
Expand Down
172 changes: 172 additions & 0 deletions backends/cuda/runtime/test/test_cuda_allocator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <gtest/gtest.h>

#include <cuda_runtime.h>

#include <cstdint>
#include <vector>

#include <executorch/backends/cuda/runtime/cuda_allocator.h>
#include <executorch/extension/cuda/caller_stream.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/platform/platform.h>

using executorch::backends::cuda::CudaAllocator;
using executorch::runtime::Error;

namespace {
bool cuda_device_available() {
int device_count = 0;
const cudaError_t err = cudaGetDeviceCount(&device_count);
return err == cudaSuccess && device_count > 0;
}
} // namespace

TEST(CudaAllocatorTest, CopyHostToDevice) {
if (!cuda_device_available()) {
GTEST_SKIP() << "CUDA device unavailable";
}
et_pal_init();
CudaAllocator& a = CudaAllocator::instance();
constexpr size_t N = 1024;
auto res = a.allocate(N, 0);
ASSERT_TRUE(res.ok());
void* dptr = res.get();

std::vector<uint8_t> h_src(N, 42);
EXPECT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);

a.deallocate(dptr, 0);
}

TEST(CudaAllocatorTest, CopyDeviceToHost) {
if (!cuda_device_available()) {
GTEST_SKIP() << "CUDA device unavailable";
}
et_pal_init();
CudaAllocator& a = CudaAllocator::instance();
constexpr size_t N = 1024;
auto res = a.allocate(N, 0);
ASSERT_TRUE(res.ok());
void* dptr = res.get();

std::vector<uint8_t> h_src(N, 42), h_dst(N, 0);
ASSERT_EQ(a.copy_host_to_device(dptr, h_src.data(), N, 0), Error::Ok);
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), dptr, N, 0), Error::Ok);
EXPECT_EQ(h_src, h_dst);

a.deallocate(dptr, 0);
}

TEST(CudaAllocatorTest, CopyHostToDeviceWithCallerStream) {
if (!cuda_device_available()) {
GTEST_SKIP() << "CUDA device unavailable";
}
et_pal_init();
int device = 0;
ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
// TODO: validate caller stream device matches index once CallerStreamGuard
// exposes device. For now assert single-GPU case.
cudaStream_t s;
ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
executorch::extension::cuda::CallerStreamGuard g(s);

CudaAllocator& a = CudaAllocator::instance();
auto res = a.allocate(256, 0);
ASSERT_TRUE(res.ok());
void* d = res.get();
std::vector<uint8_t> h(256, 7);
// should take async branch internally, still return Ok
EXPECT_EQ(a.copy_host_to_device(d, h.data(), 256, 0), Error::Ok);
a.deallocate(d, 0);
cudaStreamDestroy(s);
}

TEST(CudaAllocatorTest, CopyDeviceToHostWithCallerStream) {
if (!cuda_device_available()) {
GTEST_SKIP() << "CUDA device unavailable";
}
et_pal_init();
int device = 0;
ASSERT_EQ(cudaGetDevice(&device), cudaSuccess);
ASSERT_EQ(device, 0) << "test assumes single GPU device 0";
// TODO: validate caller stream device matches index once CallerStreamGuard
// exposes device. For now assert single-GPU case.
cudaStream_t s;
ASSERT_EQ(cudaStreamCreate(&s), cudaSuccess);
executorch::extension::cuda::CallerStreamGuard g(s);

CudaAllocator& a = CudaAllocator::instance();
auto res = a.allocate(256, 0);
ASSERT_TRUE(res.ok());
void* d = res.get();
std::vector<uint8_t> h_src(256, 5), h_dst(256, 0);
ASSERT_EQ(a.copy_host_to_device(d, h_src.data(), 256, 0), Error::Ok);
EXPECT_EQ(a.copy_device_to_host(h_dst.data(), d, 256, 0), Error::Ok);
EXPECT_EQ(h_src, h_dst);

a.deallocate(d, 0);
cudaStreamDestroy(s);
}

TEST(CudaAllocatorTest, CopyHostToDeviceNullDstReturnsInvalidArgument) {
if (!cuda_device_available()) {
GTEST_SKIP() << "CUDA device unavailable";
}
et_pal_init();
CudaAllocator& a = CudaAllocator::instance();
// null dst should fail gracefully not CHECK abort
std::vector<uint8_t> h(8, 1);
Error e = a.copy_host_to_device(nullptr, h.data(), 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null dst, got "
<< static_cast<uint32_t>(e);
}

TEST(CudaAllocatorTest, CopyHostToDeviceNullSrcReturnsInvalidArgument) {
if (!cuda_device_available()) {
GTEST_SKIP() << "CUDA device unavailable";
}
et_pal_init();
CudaAllocator& a = CudaAllocator::instance();
void* dummy_dst = reinterpret_cast<void*>(0x1);
Error e = a.copy_host_to_device(dummy_dst, nullptr, 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null src, got "
<< static_cast<uint32_t>(e);
}

TEST(CudaAllocatorTest, CopyDeviceToHostNullDstReturnsInvalidArgument) {
if (!cuda_device_available()) {
GTEST_SKIP() << "CUDA device unavailable";
}
et_pal_init();
CudaAllocator& a = CudaAllocator::instance();
void* dummy_src = reinterpret_cast<void*>(0x1);
Error e = a.copy_device_to_host(nullptr, dummy_src, 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null dst, got "
<< static_cast<uint32_t>(e);
}

TEST(CudaAllocatorTest, CopyDeviceToHostNullSrcReturnsInvalidArgument) {
if (!cuda_device_available()) {
GTEST_SKIP() << "CUDA device unavailable";
}
et_pal_init();
CudaAllocator& a = CudaAllocator::instance();
std::vector<uint8_t> h(8, 1);
// null src should fail gracefully not CHECK abort
Error e = a.copy_device_to_host(h.data(), nullptr, 8, 0);
EXPECT_EQ(e, Error::InvalidArgument)
<< "expected InvalidArgument for null src, got "
<< static_cast<uint32_t>(e);
}
Loading