diff --git a/vortex-cuda/cub/src/lib.rs b/vortex-cuda/cub/src/lib.rs
index d7666a46f55..706373255aa 100644
--- a/vortex-cuda/cub/src/lib.rs
+++ b/vortex-cuda/cub/src/lib.rs
@@ -17,8 +17,6 @@
 use std::path::PathBuf;
 use std::sync::OnceLock;
 
-use vortex_cuda_macros::cuda_tests;
-
 /// Raw FFI type definitions and dynamically-loaded function pointers from bindgen.
 #[allow(
     non_upper_case_globals,
@@ -60,11 +58,11 @@ pub fn cub_library() -> Result<&'static sys::CubLibrary, CubError> {
         .map_err(|e| CubError::LibraryLoadError(e.clone()))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use crate::filter;
 
-    #[test]
+    #[vortex_cuda_macros::test]
     fn test_filter_temp_size_u64() -> Result<(), crate::CubError> {
         let temp_bytes = filter::filter_get_temp_size_u64(1000)?;
         // CUB requires some temporary storage
@@ -72,14 +70,14 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[vortex_cuda_macros::test]
     fn test_filter_temp_size_f64() -> Result<(), crate::CubError> {
         let temp_bytes = filter::filter_get_temp_size_f64(10000)?;
         assert!(temp_bytes > 0);
         Ok(())
     }
 
-    #[test]
+    #[vortex_cuda_macros::test]
     fn test_filter_temp_size_zero_items() -> Result<(), crate::CubError> {
         // Just verify the call doesn't fail with zero items
         let _temp_bytes = filter::filter_get_temp_size_u8(0)?;
diff --git a/vortex-cuda/macros/src/lib.rs b/vortex-cuda/macros/src/lib.rs
index afe26bfc99e..722664db915 100644
--- a/vortex-cuda/macros/src/lib.rs
+++ b/vortex-cuda/macros/src/lib.rs
@@ -18,10 +18,10 @@
 //! #[cuda_not_available]
 //! fn fallback_function() { /* ... */ }
 //!
-//! // Only compiled in test builds when CUDA is available
-//! #[cuda_test]
-//! mod tests {
-//!     // ...
+//! // Ignore tests when CUDA is not available
+//! #[crate::test]
+//! async fn my_test() {
+//! ...
 //! }
 //! ```
 
@@ -30,7 +30,6 @@ use std::sync::LazyLock;
 
 use proc_macro::TokenStream;
 use quote::quote;
-use syn::Item;
 use syn::parse_macro_input;
 
 /// Cached result of nvcc availability check.
@@ -61,17 +60,39 @@ pub fn cuda_not_available(_attr: TokenStream, item: TokenStream) -> TokenStream
     }
 }
 
-/// Conditionally compiles the annotated item only in test builds when CUDA is available.
+/// Test attribute to ignore tests if CUDA isn't available. Supports both sync and async tests (using tokio).
+///
+/// Must be named `test` to work with frameworks like `rstest`.
 #[proc_macro_attribute]
-pub fn cuda_tests(_attr: TokenStream, item: TokenStream) -> TokenStream {
+pub fn test(_attr: TokenStream, item: TokenStream) -> TokenStream {
+    let item = parse_macro_input!(item as syn::ItemFn);
     if *NVCC_AVAILABLE {
-        let item = parse_macro_input!(item as Item);
-        quote! {
-            #[cfg(test)]
-            #item
+        if item.sig.asyncness.is_some() {
+            quote! {
+                #[tokio::test]
+                #item
+            }
+        } else {
+            quote! {
+                #[test]
+                #item
+            }
         }
         .into()
     } else {
-        TokenStream::new()
+        if item.sig.asyncness.is_some() {
+            quote! {
+                #[tokio::test]
+                #[ignore]
+                #item
+            }
+        } else {
+            quote! {
+                #[test]
+                #[ignore]
+                #item
+            }
+        }
+        .into()
     }
 }
diff --git a/vortex-cuda/nvcomp/src/lib.rs b/vortex-cuda/nvcomp/src/lib.rs
index 650f06069ff..2f9fd00deb9 100644
--- a/vortex-cuda/nvcomp/src/lib.rs
+++ b/vortex-cuda/nvcomp/src/lib.rs
@@ -35,7 +35,6 @@ mod error;
 pub mod zstd;
 
 pub use error::NvcompError;
-use vortex_cuda_macros::cuda_tests;
 
 /// The loaded nvcomp library instance.
 static NVCOMP_LIB: OnceLock<Result<sys::NvcompLibrary, String>> = OnceLock::new();
@@ -66,13 +65,12 @@ pub fn nvcomp_library() -> Result<&'static sys::NvcompLibrary, NvcompError> {
         .as_ref()
         .map_err(|e| NvcompError::LibraryLoadError(e.clone()))
 }
-
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use crate::zstd;
 
     /// Test that we can call nvcompBatchedZstdDecompressGetTempSizeAsync.
-    #[test]
+    #[vortex_cuda_macros::test]
     fn test_get_decompress_temp_size() {
         let num_chunks = 10;
         let max_uncompressed_chunk_bytes = 65536; // 64KB recommended chunk size
diff --git a/vortex-cuda/src/arrow/canonical.rs b/vortex-cuda/src/arrow/canonical.rs
index 3a6d0b3d666..2dc35b1f360 100644
--- a/vortex-cuda/src/arrow/canonical.rs
+++ b/vortex-cuda/src/arrow/canonical.rs
@@ -18,7 +18,6 @@ use vortex::error::VortexResult;
 use vortex::error::vortex_bail;
 use vortex::error::vortex_ensure;
 use vortex::extension::datetime::AnyTemporal;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaExecutionCtx;
 use crate::arrow::ArrowArray;
@@ -275,8 +274,7 @@ unsafe extern "C" fn release_array(array: *mut ArrowArray) {
     }
 }
 
-#[cuda_tests]
-#[allow(clippy::unwrap_used)]
+#[cfg(test)]
 mod tests {
     use rstest::rstest;
     use vortex::array::IntoArray;
@@ -308,7 +306,7 @@ mod tests {
     #[case::i64(PrimitiveArray::from_iter(0i64..10).into_array(), 10)]
     #[case::f32(PrimitiveArray::from_iter([1.0f32, 2.0, 3.0]).into_array(), 3)]
     #[case::f64(PrimitiveArray::from_iter([1.0f64, 2.0, 3.0]).into_array(), 3)]
-    #[tokio::test]
+    #[crate::test]
     async fn test_export_primitive(
         #[case] array: vortex::array::ArrayRef,
         #[case] expected_len: i64,
@@ -330,7 +328,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_export_null() -> VortexResult<()> {
         let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -346,7 +344,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_export_decimal() -> VortexResult<()> {
         let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -365,7 +363,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_export_temporal() -> VortexResult<()> {
         let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -388,7 +386,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_export_varbinview() -> VortexResult<()> {
         let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -413,7 +411,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_export_struct() -> VortexResult<()> {
         let mut ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
diff --git a/vortex-cuda/src/dynamic_dispatch/mod.rs b/vortex-cuda/src/dynamic_dispatch/mod.rs
index 25de9da0473..a2a431f1490 100644
--- a/vortex-cuda/src/dynamic_dispatch/mod.rs
+++ b/vortex-cuda/src/dynamic_dispatch/mod.rs
@@ -19,8 +19,6 @@
 #![allow(non_snake_case)]
 #![allow(clippy::cast_possible_truncation)]
 
-use vortex_cuda_macros::cuda_tests;
-
 mod plan_builder;
 pub use plan_builder::build_plan;
 
@@ -187,8 +185,7 @@ impl DynamicDispatchPlan {
     }
 }
 
-#[cuda_tests]
-#[allow(clippy::cast_possible_truncation)]
+#[cfg(test)]
 mod tests {
     use std::sync::Arc;
 
@@ -232,11 +229,11 @@ mod tests {
             .map(|i| ((i as u64) % (max_val + 1)) as u32)
             .collect();
         let primitive = PrimitiveArray::new(Buffer::from(values), NonNullable);
-        BitPackedArray::encode(&primitive.to_array(), bit_width)
+        BitPackedArray::encode(&primitive.into_array(), bit_width)
             .vortex_expect("failed to create BitPacked array")
     }
 
-    #[test]
+    #[crate::test]
     fn test_max_scalar_ops() -> VortexResult<()> {
         let bit_width: u8 = 6;
         let len = 2050;
@@ -273,7 +270,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_plan_structure() {
         // Stage 0: input dict values (BP→FoR) into smem[0..256)
         // Stage 1: output codes (BP→FoR→DICT) into smem[256..2304), gather from smem[0]
@@ -321,7 +318,7 @@ mod tests {
         Ok((ptr, device_buf))
     }
 
-    #[test]
+    #[crate::test]
     fn test_load_for_zigzag_alp() -> VortexResult<()> {
         // Max scalar ops depth with LOAD source: LOAD → FoR → ZigZag → ALP
         // (Exercises all four scalar op types without DICT)
@@ -422,7 +419,7 @@ mod tests {
         Ok(unsafe { std::mem::transmute::<Vec<u32>, Vec<f32>>(actual) })
     }
 
-    #[test]
+    #[crate::test]
     fn test_bitpacked() -> VortexResult<()> {
         let bit_width: u8 = 10;
         let len = 3000;
@@ -433,7 +430,7 @@ mod tests {
 
         let bp = make_bitpacked_array_u32(bit_width, len);
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let (plan, _bufs) = build_plan(&bp.to_array(), &cuda_ctx)?;
+        let (plan, _bufs) = build_plan(&bp.into_array(), &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan)?;
         assert_eq!(actual, expected);
@@ -441,7 +438,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_for_bitpacked() -> VortexResult<()> {
         let bit_width: u8 = 6;
         let len = 3000;
@@ -457,7 +454,7 @@ mod tests {
         let for_arr = FoRArray::try_new(bp.into_array(), Scalar::from(reference))?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let (plan, _bufs) = build_plan(&for_arr.to_array(), &cuda_ctx)?;
+        let (plan, _bufs) = build_plan(&for_arr.into_array(), &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan)?;
         assert_eq!(actual, expected);
@@ -465,7 +462,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_runend() -> VortexResult<()> {
         let ends: Vec<u32> = vec![1000, 2000, 3000];
         let values: Vec<u32> = vec![10, 20, 30];
@@ -482,7 +479,7 @@ mod tests {
         let re = RunEndArray::new(ends_arr, values_arr);
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let (plan, _bufs) = build_plan(&re.to_array(), &cuda_ctx)?;
+        let (plan, _bufs) = build_plan(&re.into_array(), &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan)?;
         assert_eq!(actual, expected);
@@ -490,7 +487,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_dict_for_bp_values_bp_codes() -> VortexResult<()> {
         // Dict where both codes and values are BitPacked+FoR.
         let dict_reference = 1_000_000u32;
@@ -504,17 +501,17 @@ mod tests {
 
         // BitPack+FoR the dict values
         let dict_prim = PrimitiveArray::new(Buffer::from(dict_residuals), NonNullable);
-        let dict_bp = BitPackedArray::encode(&dict_prim.to_array(), 6)?;
+        let dict_bp = BitPackedArray::encode(&dict_prim.into_array(), 6)?;
         let dict_for = FoRArray::try_new(dict_bp.into_array(), Scalar::from(dict_reference))?;
 
         // BitPack the codes
         let codes_prim = PrimitiveArray::new(Buffer::from(codes), NonNullable);
-        let codes_bp = BitPackedArray::encode(&codes_prim.to_array(), 6)?;
+        let codes_bp = BitPackedArray::encode(&codes_prim.into_array(), 6)?;
 
         let dict = DictArray::try_new(codes_bp.into_array(), dict_for.into_array())?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let (plan, _bufs) = build_plan(&dict.to_array(), &cuda_ctx)?;
+        let (plan, _bufs) = build_plan(&dict.into_array(), &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan)?;
         assert_eq!(actual, expected);
@@ -522,7 +519,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_alp_for_bitpacked() -> VortexResult<()> {
         // ALP(FoR(BitPacked)): encode each layer, then reassemble the tree
         // bottom-up because encode() methods produce flat outputs.
@@ -545,7 +542,7 @@ mod tests {
         );
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let (plan, _bufs) = build_plan(&tree.to_array(), &cuda_ctx)?;
+        let (plan, _bufs) = build_plan(&tree.into_array(), &cuda_ctx)?;
 
         let actual = run_dispatch_plan_f32(&cuda_ctx, len, &plan)?;
         assert_eq!(actual, floats);
@@ -553,7 +550,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_zigzag_bitpacked() -> VortexResult<()> {
         // ZigZag(BitPacked): unpack then zigzag-decode.
         let bit_width: u8 = 4;
@@ -569,11 +566,11 @@ mod tests {
             .collect();
 
         let prim = PrimitiveArray::new(Buffer::from(raw), NonNullable);
-        let bp = BitPackedArray::encode(&prim.to_array(), bit_width)?;
+        let bp = BitPackedArray::encode(&prim.into_array(), bit_width)?;
         let zz = ZigZagArray::try_new(bp.into_array())?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let (plan, _bufs) = build_plan(&zz.to_array(), &cuda_ctx)?;
+        let (plan, _bufs) = build_plan(&zz.into_array(), &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan)?;
         assert_eq!(actual, expected);
@@ -581,7 +578,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_for_runend() -> VortexResult<()> {
         // FoR(RunEnd): expand runs then add constant.
         let ends: Vec<u32> = vec![500, 1000, 1500, 2000, 2500, 3000];
@@ -601,7 +598,7 @@ mod tests {
         let for_arr = FoRArray::try_new(re.into_array(), Scalar::from(reference))?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let (plan, _bufs) = build_plan(&for_arr.to_array(), &cuda_ctx)?;
+        let (plan, _bufs) = build_plan(&for_arr.into_array(), &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan)?;
         assert_eq!(actual, expected);
@@ -609,7 +606,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_for_dict() -> VortexResult<()> {
         // FoR(Dict(codes=Primitive, values=Primitive)): gather then add constant.
         let dict_values: Vec<u32> = vec![100, 200, 300, 400];
@@ -629,7 +626,7 @@ mod tests {
         let for_arr = FoRArray::try_new(dict.into_array(), Scalar::from(reference))?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let (plan, _bufs) = build_plan(&for_arr.to_array(), &cuda_ctx)?;
+        let (plan, _bufs) = build_plan(&for_arr.into_array(), &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan)?;
         assert_eq!(actual, expected);
@@ -637,7 +634,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_dict_for_bp_codes() -> VortexResult<()> {
         // Dict(codes=FoR(BitPacked), values=primitive)
         let dict_values: Vec<u32> = (0..8).map(|i| i * 1000 + 7).collect();
@@ -649,14 +646,14 @@ mod tests {
         // BitPack codes, then wrap in FoR (reference=0 so values unchanged)
         let bit_width: u8 = 3;
         let codes_prim = PrimitiveArray::new(Buffer::from(codes), NonNullable);
-        let codes_bp = BitPackedArray::encode(&codes_prim.to_array(), bit_width)?;
+        let codes_bp = BitPackedArray::encode(&codes_prim.into_array(), bit_width)?;
         let codes_for = FoRArray::try_new(codes_bp.into_array(), Scalar::from(0u32))?;
 
         let values_prim = PrimitiveArray::new(Buffer::from(dict_values), NonNullable);
         let dict = DictArray::try_new(codes_for.into_array(), values_prim.into_array())?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let (plan, _bufs) = build_plan(&dict.to_array(), &cuda_ctx)?;
+        let (plan, _bufs) = build_plan(&dict.into_array(), &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan)?;
         assert_eq!(actual, expected);
@@ -664,7 +661,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_dict_primitive_values_bp_codes() -> VortexResult<()> {
         let dict_values: Vec<u32> = vec![100, 200, 300, 400];
         let dict_size = dict_values.len();
@@ -674,13 +671,13 @@ mod tests {
 
         let bit_width: u8 = 2;
         let codes_prim = PrimitiveArray::new(Buffer::from(codes), NonNullable);
-        let codes_bp = BitPackedArray::encode(&codes_prim.to_array(), bit_width)?;
+        let codes_bp = BitPackedArray::encode(&codes_prim.into_array(), bit_width)?;
         let values_prim = PrimitiveArray::new(Buffer::from(dict_values), NonNullable);
 
         let dict = DictArray::try_new(codes_bp.into_array(), values_prim.into_array())?;
 
         let cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())?;
-        let (plan, _bufs) = build_plan(&dict.to_array(), &cuda_ctx)?;
+        let (plan, _bufs) = build_plan(&dict.into_array(), &cuda_ctx)?;
 
         let actual = run_dynamic_dispatch_plan(&cuda_ctx, len, &plan)?;
         assert_eq!(actual, expected);
diff --git a/vortex-cuda/src/kernel/arrays/constant.rs b/vortex-cuda/src/kernel/arrays/constant.rs
index 9fcab51a654..bebb7eae662 100644
--- a/vortex-cuda/src/kernel/arrays/constant.rs
+++ b/vortex-cuda/src/kernel/arrays/constant.rs
@@ -26,7 +26,6 @@ use vortex::dtype::NativePType;
 use vortex::error::VortexResult;
 use vortex::error::vortex_bail;
 use vortex::error::vortex_err;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaDeviceBuffer;
 use crate::executor::CudaExecute;
@@ -190,7 +189,7 @@ where
     )))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use rstest::rstest;
     use vortex::array::IntoArray;
@@ -221,7 +220,7 @@ mod tests {
     #[case::i64(make_constant_array(-1000000i64, 2050))]
     #[case::f32(make_constant_array(1.23f32, 2050))]
     #[case::f64(make_constant_array(4.56789f64, 2050))]
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_constant_materialization(
         #[case] constant_array: ConstantArray,
     ) -> VortexResult<()> {
@@ -231,7 +230,7 @@ mod tests {
         let cpu_result = constant_array.to_canonical()?;
 
         let gpu_result = ConstantNumericExecutor
-            .execute(constant_array.to_array(), &mut cuda_ctx)
+            .execute(constant_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU materialization failed")
             .into_host()
@@ -243,7 +242,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_constant_empty_array() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -252,7 +251,7 @@ mod tests {
         let cpu_result = constant_array.to_canonical()?;
 
         let gpu_result = ConstantNumericExecutor
-            .execute(constant_array.to_array(), &mut cuda_ctx)
+            .execute(constant_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU materialization failed")
             .into_host()
@@ -264,7 +263,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_constant_small_array() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -274,7 +273,7 @@ mod tests {
         let cpu_result = constant_array.to_canonical()?;
 
         let gpu_result = ConstantNumericExecutor
-            .execute(constant_array.to_array(), &mut cuda_ctx)
+            .execute(constant_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU materialization failed")
             .into_host()
diff --git a/vortex-cuda/src/kernel/arrays/dict.rs b/vortex-cuda/src/kernel/arrays/dict.rs
index b4abd7e9691..334ca9caf03 100644
--- a/vortex-cuda/src/kernel/arrays/dict.rs
+++ b/vortex-cuda/src/kernel/arrays/dict.rs
@@ -29,7 +29,6 @@ use vortex::dtype::NativePType;
 use vortex::error::VortexExpect;
 use vortex::error::VortexResult;
 use vortex::error::vortex_bail;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaBufferExt;
 use crate::CudaDeviceBuffer;
@@ -301,7 +300,7 @@ async fn execute_dict_varbinview(
     }))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use vortex::array::IntoArray;
     use vortex::array::arrays::DecimalArray;
@@ -329,7 +328,7 @@ mod tests {
         ))
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_u32_values_u8_codes() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -349,7 +348,7 @@ mod tests {
 
         // Execute on CUDA
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
@@ -361,7 +360,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_u64_values_u16_codes() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -384,7 +383,7 @@ mod tests {
 
         // Execute on CUDA
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
@@ -396,7 +395,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_i32_values_u32_codes() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -416,7 +415,7 @@ mod tests {
 
         // Execute on CUDA
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
@@ -427,7 +426,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_large_array() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -447,7 +446,7 @@ mod tests {
 
         // Execute on CUDA
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
@@ -459,7 +458,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_values_with_validity() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -491,7 +490,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_codes_with_validity() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -517,7 +516,7 @@ mod tests {
 
         // Execute on CUDA
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
@@ -528,7 +527,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_both_with_validity() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -561,7 +560,7 @@ mod tests {
 
         // Execute on CUDA
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
@@ -572,7 +571,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_i64_values_with_validity() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -606,7 +605,7 @@ mod tests {
 
         // Execute on CUDA
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
@@ -617,7 +616,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_all_valid_matches_baseline() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -639,7 +638,7 @@ mod tests {
 
         // Execute on CUDA
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_primitive();
@@ -660,7 +659,7 @@ mod tests {
         ))
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_decimal_i8_values() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -678,7 +677,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
@@ -688,7 +687,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_decimal_i16_values() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -706,7 +705,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
@@ -716,7 +715,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_decimal_i32_values() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -734,7 +733,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
@@ -744,7 +743,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_decimal_i64_values() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -765,7 +764,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
@@ -775,7 +774,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_decimal_i128_values() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -801,7 +800,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
@@ -819,7 +818,7 @@ mod tests {
             .into_varbinview())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_string_values_u8_codes() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -834,7 +833,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_varbinview();
@@ -844,7 +843,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_string_values_u16_codes() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -859,7 +858,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_varbinview();
@@ -869,7 +868,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_string_max_inlined_12_bytes() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -886,7 +885,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_varbinview();
@@ -896,7 +895,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_string_outlined_views() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -916,7 +915,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_varbinview();
@@ -926,7 +925,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_string_empty_strings() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -941,7 +940,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_varbinview();
@@ -951,7 +950,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_string_values_with_validity() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -967,7 +966,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_varbinview();
@@ -977,7 +976,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_string_outlined_with_validity() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -1000,7 +999,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_varbinview();
@@ -1010,7 +1009,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_dict_decimal_i256_values() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -1036,7 +1035,7 @@ mod tests {
         let baseline = dict_array.to_canonical()?;
 
         let cuda_result = DictExecutor
-            .execute(dict_array.to_array(), &mut cuda_ctx)
+            .execute(dict_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_decimal();
diff --git a/vortex-cuda/src/kernel/encodings/alp.rs b/vortex-cuda/src/kernel/encodings/alp.rs
index 9aabdea8658..a47cb24c42d 100644
--- a/vortex-cuda/src/kernel/encodings/alp.rs
+++ b/vortex-cuda/src/kernel/encodings/alp.rs
@@ -23,7 +23,6 @@ use vortex::encodings::alp::match_each_alp_float_ptype;
 use vortex::error::VortexResult;
 use vortex::error::vortex_ensure;
 use vortex::error::vortex_err;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaBufferExt;
 use crate::CudaDeviceBuffer;
@@ -116,7 +115,7 @@ where
     )))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use vortex::array::IntoArray;
     use vortex::array::arrays::PrimitiveArray;
@@ -134,7 +133,7 @@ mod tests {
     use crate::CanonicalCudaExt;
     use crate::session::CudaSession;
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_alp_decompression_f32() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -166,7 +165,7 @@ mod tests {
         let cpu_result = alp_array.to_canonical()?.into_array();
 
         let gpu_result = ALPExecutor
-            .execute(alp_array.to_array(), &mut cuda_ctx)
+            .execute(alp_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_host()
diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs
index 54d1ef65fe0..ff078411ea1 100644
--- a/vortex-cuda/src/kernel/encodings/bitpacked.rs
+++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs
@@ -23,7 +23,6 @@ use vortex::encodings::fastlanes::unpack_iter::BitPacked;
 use vortex::error::VortexResult;
 use vortex::error::vortex_ensure;
 use vortex::error::vortex_err;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaBufferExt;
 use crate::CudaDeviceBuffer;
@@ -166,7 +165,7 @@ where
     )))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use futures::executor::block_on;
     use rstest::rstest;
@@ -191,6 +190,7 @@ mod tests {
     #[case::u32((0u16..128u16).cycle().take(2048), 6)]
     #[case::u16((0u32..128u32).cycle().take(2048), 6)]
     #[case::u16((0u64..128u64).cycle().take(2048), 6)]
+    #[crate::test]
     fn test_patched<T: NativePType>(
         #[case] iter: impl Iterator<Item = T>,
         #[case] bw: u8,
@@ -201,14 +201,14 @@ mod tests {
         let array = PrimitiveArray::new(iter.collect::<Buffer<_>>(), NonNullable);
 
         // Last two items should be patched
-        let bp_with_patches = BitPackedArray::encode(&array.to_array(), bw)?;
+        let bp_with_patches = BitPackedArray::encode(&array.into_array(), bw)?;
         assert!(bp_with_patches.patches().is_some());
 
         let cpu_result = bp_with_patches.to_canonical()?.into_array();
 
         let gpu_result = block_on(async {
             BitPackedExecutor
-                .execute(bp_with_patches.to_array(), &mut cuda_ctx)
+                .execute(bp_with_patches.into_array(), &mut cuda_ctx)
                 .await
                 .vortex_expect("GPU decompression failed")
                 .into_host()
@@ -221,7 +221,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_patches() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -232,14 +232,14 @@ mod tests {
         );
 
         // Last two items should be patched
-        let bp_with_patches = BitPackedArray::encode(&array.to_array(), 9)?;
+        let bp_with_patches = BitPackedArray::encode(&array.into_array(), 9)?;
         assert!(bp_with_patches.patches().is_some());
 
         let cpu_result = bp_with_patches.to_canonical()?.into_array();
 
         let gpu_result = block_on(async {
             BitPackedExecutor
-                .execute(bp_with_patches.to_array(), &mut cuda_ctx)
+                .execute(bp_with_patches.into_array(), &mut cuda_ctx)
                 .await
                 .vortex_expect("GPU decompression failed")
                 .into_host()
@@ -260,6 +260,7 @@ mod tests {
     #[case::bw_5(5)]
     #[case::bw_6(6)]
     #[case::bw_7(7)]
+    #[crate::test]
     fn test_cuda_bitunpack_u8(#[case] bit_width: u8) -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -273,13 +274,13 @@ mod tests {
             NonNullable,
         );
 
-        let bitpacked_array = BitPackedArray::encode(&primitive_array.to_array(), bit_width)
+        let bitpacked_array = BitPackedArray::encode(&primitive_array.into_array(), bit_width)
             .vortex_expect("operation should succeed in test");
         let cpu_result = bitpacked_array.to_canonical()?;
 
         let gpu_result = block_on(async {
             BitPackedExecutor
-                .execute(bitpacked_array.to_array(), &mut cuda_ctx)
+                .execute(bitpacked_array.into_array(), &mut cuda_ctx)
                 .await
                 .vortex_expect("GPU decompression failed")
                 .into_host()
@@ -308,6 +309,7 @@ mod tests {
     #[case::bw_13(13)]
     #[case::bw_14(14)]
     #[case::bw_15(15)]
+    #[crate::test]
     fn test_cuda_bitunpack_u16(#[case] bit_width: u8) -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -321,13 +323,13 @@ mod tests {
             NonNullable,
         );
 
-        let bitpacked_array = BitPackedArray::encode(&primitive_array.to_array(), bit_width)
+        let bitpacked_array = BitPackedArray::encode(&primitive_array.into_array(), bit_width)
             .vortex_expect("operation should succeed in test");
         let cpu_result = bitpacked_array.to_canonical()?;
 
         let gpu_result = block_on(async {
             BitPackedExecutor
-                .execute(bitpacked_array.to_array(), &mut cuda_ctx)
+                .execute(bitpacked_array.into_array(), &mut cuda_ctx)
                 .await
                 .vortex_expect("GPU decompression failed")
                 .into_host()
@@ -372,6 +374,7 @@ mod tests {
     #[case::bw_29(29)]
     #[case::bw_30(30)]
     #[case::bw_31(31)]
+    #[crate::test]
     fn test_cuda_bitunpack_u32(#[case] bit_width: u8) -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -385,13 +388,13 @@ mod tests {
             NonNullable,
         );
 
-        let bitpacked_array = BitPackedArray::encode(&primitive_array.to_array(), bit_width)
+        let bitpacked_array = BitPackedArray::encode(&primitive_array.into_array(), bit_width)
             .vortex_expect("operation should succeed in test");
         let cpu_result = bitpacked_array.to_canonical()?;
 
         let gpu_result = block_on(async {
             BitPackedExecutor
-                .execute(bitpacked_array.to_array(), &mut cuda_ctx)
+                .execute(bitpacked_array.into_array(), &mut cuda_ctx)
                 .await
                 .vortex_expect("GPU decompression failed")
                 .into_host()
@@ -468,6 +471,7 @@ mod tests {
     #[case::bw_61(61)]
     #[case::bw_62(62)]
     #[case::bw_63(63)]
+    #[crate::test]
     fn test_cuda_bitunpack_u64(#[case] bit_width: u8) -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -481,12 +485,12 @@ mod tests {
             NonNullable,
         );
 
-        let bitpacked_array = BitPackedArray::encode(&primitive_array.to_array(), bit_width)
+        let bitpacked_array = BitPackedArray::encode(&primitive_array.into_array(), bit_width)
             .vortex_expect("operation should succeed in test");
         let cpu_result = bitpacked_array.to_canonical()?;
         let gpu_result = block_on(async {
             BitPackedExecutor
-                .execute(bitpacked_array.to_array(), &mut cuda_ctx)
+                .execute(bitpacked_array.into_array(), &mut cuda_ctx)
                 .await
                 .vortex_expect("GPU decompression failed")
                 .into_host()
@@ -499,7 +503,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn test_cuda_bitunpack_sliced() -> VortexResult<()> {
         let bit_width = 32;
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
@@ -514,7 +518,7 @@ mod tests {
             NonNullable,
         );
 
-        let bitpacked_array = BitPackedArray::encode(&primitive_array.to_array(), bit_width)
+        let bitpacked_array = BitPackedArray::encode(&primitive_array.into_array(), bit_width)
             .vortex_expect("operation should succeed in test");
         let slice_ref = bitpacked_array.clone().into_array().slice(67..3969)?;
         let mut exec_ctx = ExecutionCtx::new(VortexSession::empty().with::<ArraySession>());
diff --git a/vortex-cuda/src/kernel/encodings/date_time_parts.rs b/vortex-cuda/src/kernel/encodings/date_time_parts.rs
index e68e8f320fc..1f99ab341f3 100644
--- a/vortex-cuda/src/kernel/encodings/date_time_parts.rs
+++ b/vortex-cuda/src/kernel/encodings/date_time_parts.rs
@@ -28,7 +28,6 @@ use vortex::error::vortex_err;
 use vortex::extension::datetime::TimeUnit;
 use vortex::extension::datetime::Timestamp;
 use vortex::scalar::Scalar;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaBufferExt;
 use crate::CudaDeviceBuffer;
@@ -199,7 +198,7 @@ where
     ))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use rstest::rstest;
     use vortex::array::IntoArray;
@@ -272,7 +271,7 @@ mod tests {
         vec![123456789i64, 0, 0],
         TimeUnit::Nanoseconds
     )]
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_datetimeparts_decompression(
         #[case] days: Vec<i32>,
         #[case] seconds: Vec<i32>,
@@ -286,7 +285,7 @@ mod tests {
         let cpu_result = dtp_array.to_canonical()?;
 
         let gpu_result = DateTimePartsExecutor
-            .execute(dtp_array.to_array(), &mut cuda_ctx)
+            .execute(dtp_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_host()
@@ -298,7 +297,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_datetimeparts_large_array() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -312,7 +311,7 @@ mod tests {
         let cpu_result = dtp_array.to_canonical()?;
 
         let gpu_result = DateTimePartsExecutor
-            .execute(dtp_array.to_array(), &mut cuda_ctx)
+            .execute(dtp_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_host()
@@ -324,7 +323,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_datetimeparts_with_nulls() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -360,7 +359,7 @@ mod tests {
         let cpu_result = dtp_array.to_canonical()?;
 
         let gpu_result = DateTimePartsExecutor
-            .execute(dtp_array.to_array(), &mut cuda_ctx)
+            .execute(dtp_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_host()
diff --git a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs
index 2615f0a67d9..d7ba349dc97 100644
--- a/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs
+++ b/vortex-cuda/src/kernel/encodings/decimal_byte_parts.rs
@@ -13,7 +13,6 @@ use vortex::encodings::decimal_byte_parts::DecimalBytePartsArrayParts;
 use vortex::encodings::decimal_byte_parts::DecimalBytePartsVTable;
 use vortex::error::VortexResult;
 use vortex::error::vortex_bail;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaExecutionCtx;
 use crate::executor::CudaArrayExt;
@@ -52,7 +51,7 @@ impl CudaExecute for DecimalBytePartsExecutor {
     }
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use rstest::rstest;
     use vortex::array::IntoArray;
@@ -73,7 +72,7 @@ mod tests {
     #[case::i16_p10_s2(Buffer::from(vec![100i16, 200, 300, 400, 500]), 10, 2)]
     #[case::i32_p18_s4(Buffer::from(vec![100i32, 200, 300, 400, 500]), 18, 4)]
     #[case::i64_p38_s6(Buffer::from(vec![100i64, 200, 300, 400, 500]), 38, 6)]
-    #[tokio::test]
+    #[crate::test]
     async fn test_decimal_byte_parts_gpu_decode<T: vortex::dtype::NativePType>(
         #[case] encoded: Buffer<T>,
         #[case] precision: u8,
@@ -92,7 +91,7 @@ mod tests {
         let cpu_result = dbp_array.to_canonical().vortex_expect("CPU canonicalize");
 
         let gpu_result = DecimalBytePartsExecutor
-            .execute(dbp_array.to_array(), &mut cuda_ctx)
+            .execute(dbp_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decode");
 
diff --git a/vortex-cuda/src/kernel/encodings/for_.rs b/vortex-cuda/src/kernel/encodings/for_.rs
index aef5202e2ab..019b6fb4952 100644
--- a/vortex-cuda/src/kernel/encodings/for_.rs
+++ b/vortex-cuda/src/kernel/encodings/for_.rs
@@ -23,7 +23,6 @@ use vortex::error::VortexExpect;
 use vortex::error::VortexResult;
 use vortex::error::vortex_ensure;
 use vortex::error::vortex_err;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaBufferExt;
 use crate::executor::CudaArrayExt;
@@ -119,7 +118,7 @@ where
     )))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use rstest::rstest;
     use vortex::array::IntoArray;
@@ -151,7 +150,7 @@ mod tests {
     #[case::u16(make_for_array((0..2050).map(|i| (i % 2050) as u16).collect(), 1000u16))]
     #[case::u32(make_for_array((0..2050).map(|i| (i % 2050) as u32).collect(), 100000u32))]
     #[case::u64(make_for_array((0..2050).map(|i| (i % 2050) as u64).collect(), 1000000u64))]
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_for_decompression(#[case] for_array: FoRArray) -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -159,7 +158,7 @@ mod tests {
         let cpu_result = for_array.to_canonical()?;
 
         let gpu_result = FoRExecutor
-            .execute(for_array.to_array(), &mut cuda_ctx)
+            .execute(for_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_host()
@@ -171,7 +170,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_signed_ffor() {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -187,7 +186,7 @@ mod tests {
         let cpu_result = for_array.to_canonical().unwrap();
 
         let gpu_result = FoRExecutor
-            .execute(for_array.to_array(), &mut cuda_ctx)
+            .execute(for_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_host()
diff --git a/vortex-cuda/src/kernel/encodings/runend.rs b/vortex-cuda/src/kernel/encodings/runend.rs
index ad22f555c50..f7c8fc28088 100644
--- a/vortex-cuda/src/kernel/encodings/runend.rs
+++ b/vortex-cuda/src/kernel/encodings/runend.rs
@@ -26,7 +26,6 @@ use vortex::error::vortex_bail;
 use vortex::error::vortex_ensure;
 use vortex::error::vortex_err;
 use vortex::scalar::Scalar;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaBufferExt;
 use crate::CudaDeviceBuffer;
@@ -157,8 +156,7 @@ async fn decode_runend_typed<V: DeviceRepr + NativePType, E: DeviceRepr + Native
     )))
 }
 
-#[cuda_tests]
-#[allow(clippy::cast_possible_truncation)]
+#[cfg(test)]
 mod tests {
     use rstest::rstest;
     use vortex::array::IntoArray;
@@ -194,7 +192,7 @@ mod tests {
     #[case::u8_ends_i32_values(make_runend_array(vec![2u8, 5, 10], vec![1i32, 2, 3]))]
     #[case::u32_ends_i32_values(make_runend_array(vec![2u32, 5, 10], vec![1i32, 2, 3]))]
     #[case::u64_ends_i32_values(make_runend_array(vec![2u64, 5, 10], vec![1i32, 2, 3]))]
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_runend_types(#[case] runend_array: RunEndArray) -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -202,7 +200,7 @@ mod tests {
         let cpu_result = runend_array.to_canonical()?;
 
         let gpu_result = RunEndExecutor
-            .execute(runend_array.to_array(), &mut cuda_ctx)
+            .execute(runend_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_host()
@@ -214,7 +212,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_runend_large_array() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -224,7 +222,7 @@ mod tests {
         let total_len = num_runs * run_length;
 
         let ends: Vec<u64> = (1..=num_runs).map(|i| (i * run_length) as u64).collect();
-        let values: Vec<i32> = (0..num_runs).map(|i| i as i32).collect();
+        let values: Vec<i32> = (0..num_runs).map(|i| i32::try_from(i).unwrap()).collect();
 
         let runend_array = make_runend_array(ends, values);
         assert_eq!(runend_array.len(), total_len);
@@ -232,7 +230,7 @@ mod tests {
         let cpu_result = runend_array.to_canonical()?;
 
         let gpu_result = RunEndExecutor
-            .execute(runend_array.to_array(), &mut cuda_ctx)
+            .execute(runend_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_host()
@@ -244,7 +242,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_runend_single_run() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -254,7 +252,7 @@ mod tests {
         let cpu_result = runend_array.to_canonical()?;
 
         let gpu_result = RunEndExecutor
-            .execute(runend_array.to_array(), &mut cuda_ctx)
+            .execute(runend_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_host()
@@ -266,7 +264,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_runend_many_small_runs() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -281,7 +279,7 @@ mod tests {
         let cpu_result = runend_array.to_canonical()?;
 
         let gpu_result = RunEndExecutor
-            .execute(runend_array.to_array(), &mut cuda_ctx)
+            .execute(runend_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_host()
diff --git a/vortex-cuda/src/kernel/encodings/sequence.rs b/vortex-cuda/src/kernel/encodings/sequence.rs
index 312d7ef1475..abe53cdd18f 100644
--- a/vortex-cuda/src/kernel/encodings/sequence.rs
+++ b/vortex-cuda/src/kernel/encodings/sequence.rs
@@ -18,7 +18,6 @@ use vortex::encodings::sequence::SequenceArrayParts;
 use vortex::encodings::sequence::SequenceVTable;
 use vortex::error::VortexResult;
 use vortex::error::vortex_err;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaDeviceBuffer;
 use crate::CudaExecutionCtx;
@@ -82,7 +81,7 @@ async fn execute_typed<T: NativePType + DeviceRepr>(
     )))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use futures::executor::block_on;
     use rstest::rstest;
@@ -104,6 +103,7 @@ mod tests {
     #[case::u16(10u16, 2u16, 100)]
     #[case::u32(10u32, 2u32, 1000)]
     #[case::u64(100u64, 20u64, 500)]
+    #[crate::test]
     fn test_sequence<T: NativePType + Into<PValue>>(
         #[case] base: T,
         #[case] multiplier: T,
diff --git a/vortex-cuda/src/kernel/encodings/zigzag.rs b/vortex-cuda/src/kernel/encodings/zigzag.rs
index f3ca80c2aa4..f18be9d2b76 100644
--- a/vortex-cuda/src/kernel/encodings/zigzag.rs
+++ b/vortex-cuda/src/kernel/encodings/zigzag.rs
@@ -19,7 +19,6 @@ use vortex::encodings::zigzag::ZigZagVTable;
 use vortex::error::VortexResult;
 use vortex::error::vortex_ensure;
 use vortex::error::vortex_err;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaBufferExt;
 use crate::executor::CudaArrayExt;
@@ -96,7 +95,7 @@ where
     )))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use vortex::array::IntoArray;
     use vortex::array::arrays::PrimitiveArray;
@@ -111,7 +110,7 @@ mod tests {
     use crate::CanonicalCudaExt;
     use crate::session::CudaSession;
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_zigzag_decompression_u32() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -127,7 +126,7 @@ mod tests {
         let cpu_result = zigzag_array.to_canonical()?;
 
         let gpu_result = ZigZagExecutor
-            .execute(zigzag_array.to_array(), &mut cuda_ctx)
+            .execute(zigzag_array.into_array(), &mut cuda_ctx)
             .await
             .vortex_expect("GPU decompression failed")
             .into_host()
diff --git a/vortex-cuda/src/kernel/encodings/zstd.rs b/vortex-cuda/src/kernel/encodings/zstd.rs
index e70047f61b3..7c8caba89a7 100644
--- a/vortex-cuda/src/kernel/encodings/zstd.rs
+++ b/vortex-cuda/src/kernel/encodings/zstd.rs
@@ -31,7 +31,6 @@ use vortex::error::VortexExpect;
 use vortex::error::VortexResult;
 use vortex::error::vortex_err;
 use vortex::mask::AllOr;
-use vortex_cuda_macros::cuda_tests;
 use vortex_nvcomp::sys::nvcompStatus_t;
 use vortex_nvcomp::zstd as nvcomp_zstd;
 
@@ -341,7 +340,7 @@ async fn decode_zstd(array: ZstdArray, ctx: &mut CudaExecutionCtx) -> VortexResu
     }
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use vortex::array::IntoArray;
     use vortex::array::arrays::VarBinViewArray;
@@ -353,7 +352,7 @@ mod tests {
     use super::*;
     use crate::session::CudaSession;
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_zstd_decompression_utf8() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -378,7 +377,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_zstd_decompression_multiple_frames() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -413,7 +412,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_zstd_decompression_sliced() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
diff --git a/vortex-cuda/src/kernel/encodings/zstd_buffers.rs b/vortex-cuda/src/kernel/encodings/zstd_buffers.rs
index 6396f73681c..bc218c841c0 100644
--- a/vortex-cuda/src/kernel/encodings/zstd_buffers.rs
+++ b/vortex-cuda/src/kernel/encodings/zstd_buffers.rs
@@ -20,7 +20,6 @@ use vortex::encodings::zstd::ZstdBuffersArray;
 use vortex::encodings::zstd::ZstdBuffersVTable;
 use vortex::error::VortexResult;
 use vortex::error::vortex_err;
-use vortex_cuda_macros::cuda_tests;
 use vortex_nvcomp::sys;
 use vortex_nvcomp::sys::nvcompStatus_t;
 use vortex_nvcomp::zstd as nvcomp_zstd;
@@ -218,7 +217,7 @@ async fn validate_decompress_results(
     Ok(())
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use vortex::array::IntoArray;
     use vortex::array::arrays::PrimitiveArray;
@@ -233,7 +232,7 @@ mod tests {
     use crate::CanonicalCudaExt;
     use crate::session::CudaSession;
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_zstd_buffers_decompression_primitive() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
@@ -252,7 +251,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_cuda_zstd_buffers_decompression_varbinview() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create execution context");
diff --git a/vortex-cuda/src/kernel/filter/decimal.rs b/vortex-cuda/src/kernel/filter/decimal.rs
index a42d51b9f04..8da8ed17b12 100644
--- a/vortex-cuda/src/kernel/filter/decimal.rs
+++ b/vortex-cuda/src/kernel/filter/decimal.rs
@@ -9,7 +9,6 @@ use vortex::dtype::NativeDecimalType;
 use vortex::error::VortexResult;
 use vortex::mask::Mask;
 use vortex_cub::filter::CubFilterable;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaExecutionCtx;
 use crate::kernel::filter::filter_sized;
@@ -37,7 +36,7 @@ pub(super) async fn filter_decimal<D: NativeDecimalType + DeviceRepr + CubFilter
     )))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use rstest::rstest;
     use vortex::array::IntoArray;
@@ -85,7 +84,7 @@ mod tests {
         DecimalArray::from_iter([i256::from_i128(1), i256::from_i128(2), i256::from_i128(3), i256::from_i128(4), i256::from_i128(5)], DecimalDType::new(19, 5)),
         Mask::from_iter([false, true, false, true, false])
     )]
-    #[tokio::test]
+    #[crate::test]
     async fn test_gpu_filter_decimal(
         #[case] input: DecimalArray,
         #[case] mask: Mask,
@@ -110,7 +109,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_gpu_filter_decimal_large_array() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create CUDA execution context");
diff --git a/vortex-cuda/src/kernel/filter/primitive.rs b/vortex-cuda/src/kernel/filter/primitive.rs
index 5019ff9b770..32d6746de8b 100644
--- a/vortex-cuda/src/kernel/filter/primitive.rs
+++ b/vortex-cuda/src/kernel/filter/primitive.rs
@@ -9,7 +9,6 @@ use vortex::dtype::NativePType;
 use vortex::error::VortexResult;
 use vortex::mask::Mask;
 use vortex_cub::filter::CubFilterable;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaExecutionCtx;
 use crate::kernel::filter::filter_sized;
@@ -37,7 +36,7 @@ where
     )))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use rstest::rstest;
     use vortex::array::IntoArray;
@@ -79,7 +78,7 @@ mod tests {
         PrimitiveArray::from_iter([1u32, 2, 3, 4, 5]),
         Mask::from_iter([false, false, false, false, false])
     )]
-    #[tokio::test]
+    #[crate::test]
     async fn test_gpu_filter(
         #[case] input: PrimitiveArray,
         #[case] mask: Mask,
@@ -104,7 +103,7 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test]
+    #[crate::test]
     async fn test_gpu_filter_large_array() -> VortexResult<()> {
         let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
             .vortex_expect("failed to create CUDA execution context");
diff --git a/vortex-cuda/src/kernel/filter/varbinview.rs b/vortex-cuda/src/kernel/filter/varbinview.rs
index f5e205f07aa..e9994a375f1 100644
--- a/vortex-cuda/src/kernel/filter/varbinview.rs
+++ b/vortex-cuda/src/kernel/filter/varbinview.rs
@@ -6,7 +6,6 @@ use vortex::array::arrays::VarBinViewArray;
 use vortex::array::arrays::VarBinViewArrayParts;
 use vortex::error::VortexResult;
 use vortex::mask::Mask;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaExecutionCtx;
 use crate::kernel::filter::filter_sized;
@@ -37,7 +36,7 @@ pub(super) async fn filter_varbinview(
     )))
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use rstest::rstest;
     use vortex::array::IntoArray;
@@ -65,7 +64,7 @@ mod tests {
         ),
         Mask::from_iter([true, true, true, true, true, true, true, true, false])
     )]
-    #[tokio::test]
+    #[crate::test]
     async fn test_gpu_filter_strings(
         #[case] input: VarBinViewArray,
         #[case] mask: Mask,
diff --git a/vortex-cuda/src/kernel/mod.rs b/vortex-cuda/src/kernel/mod.rs
index 221307cdbc4..93ffd768df5 100644
--- a/vortex-cuda/src/kernel/mod.rs
+++ b/vortex-cuda/src/kernel/mod.rs
@@ -20,7 +20,6 @@ use tracing::trace;
 use vortex::error::VortexResult;
 use vortex::error::vortex_err;
 use vortex::utils::aliases::dash_map::DashMap;
-use vortex_cuda_macros::cuda_tests;
 
 mod arrays;
 mod encodings;
@@ -282,9 +281,8 @@ impl KernelLoader {
     }
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
-    #![allow(clippy::expect_used)]
 
     use cudarc::driver::CudaContext;
     use cudarc::driver::PushKernelArg;
@@ -297,7 +295,7 @@ mod tests {
     /// This test launches a special config_check kernel that reports the kernel-side
     /// constants, then verifies they match the Rust-side constants used in
     /// `launch_cuda_kernel_impl`.
-    #[test]
+    #[crate::test]
     fn test_kernel_config_matches_rust_config() {
         // These must match the constants in launch_cuda_kernel_impl
         const THREADS_PER_BLOCK: u32 = 64;
diff --git a/vortex-cuda/src/kernel/patches/mod.rs b/vortex-cuda/src/kernel/patches/mod.rs
index 79654907ce6..38b2248ef08 100644
--- a/vortex-cuda/src/kernel/patches/mod.rs
+++ b/vortex-cuda/src/kernel/patches/mod.rs
@@ -19,7 +19,6 @@ use vortex::array::vtable::ValidityHelper;
 use vortex::dtype::NativePType;
 use vortex::error::VortexResult;
 use vortex::error::vortex_ensure;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaBufferExt;
 use crate::CudaDeviceBuffer;
@@ -98,7 +97,7 @@ pub(crate) async fn execute_patches<
     Ok(target)
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use std::sync::Arc;
 
@@ -135,7 +134,7 @@ mod tests {
     #[case::i64(0_i64)]
     #[case::f32(0_f32)]
     #[case::f64(0_f64)]
-    #[tokio::test]
+    #[crate::test]
     async fn test_patches<Values: NativePType + DeviceRepr>(#[case] _v: Values) {
         tokio::join!(
             full_test_case::<Values, u8>(),
@@ -203,7 +202,7 @@ mod tests {
 
     fn force_cast<T: NativePType>(array: PrimitiveArray) -> PrimitiveArray {
         array
-            .to_array()
+            .into_array()
             .cast(DType::Primitive(T::PTYPE, Nullability::NonNullable))
             .unwrap()
             .to_primitive()
diff --git a/vortex-cuda/src/kernel/patches/types.rs b/vortex-cuda/src/kernel/patches/types.rs
index b8a2157ff3d..2e6d429bb23 100644
--- a/vortex-cuda/src/kernel/patches/types.rs
+++ b/vortex-cuda/src/kernel/patches/types.rs
@@ -235,7 +235,7 @@ mod tests {
 
     use crate::kernel::patches::types::transpose;
 
-    #[test]
+    #[crate::test]
     fn test_transpose_patches() {
         let patch_values = buffer![0u32, 10, 20, 30, 40, 50, 60, 70, 80];
 
diff --git a/vortex-cuda/src/lib.rs b/vortex-cuda/src/lib.rs
index 9f3d6df94f2..665b9e30c62 100644
--- a/vortex-cuda/src/lib.rs
+++ b/vortex-cuda/src/lib.rs
@@ -74,6 +74,8 @@ use vortex::encodings::zigzag::ZigZagVTable;
 #[cfg(feature = "unstable_encodings")]
 use vortex::encodings::zstd::ZstdBuffersVTable;
 use vortex::encodings::zstd::ZstdVTable;
+#[cfg(test)]
+use vortex_cuda_macros::test;
 pub use vortex_nvcomp as nvcomp;
 
 use crate::kernel::SequenceExecutor;
diff --git a/vortex-cuda/src/pinned.rs b/vortex-cuda/src/pinned.rs
index ae6c24cd40d..c48cf98624b 100644
--- a/vortex-cuda/src/pinned.rs
+++ b/vortex-cuda/src/pinned.rs
@@ -14,7 +14,6 @@ use vortex::error::VortexResult;
 use vortex::error::vortex_err;
 use vortex::error::vortex_panic;
 use vortex::utils::aliases::hash_map::HashMap;
-use vortex_cuda_macros::cuda_tests;
 
 use crate::CudaDeviceBuffer;
 use crate::stream::VortexCudaStream;
@@ -351,7 +350,7 @@ impl Drop for PooledPinnedBuffer {
     }
 }
 
-#[cuda_tests]
+#[cfg(test)]
 mod tests {
     use std::sync::Arc;
 
@@ -372,7 +371,7 @@ mod tests {
         Ok((pool, stream))
     }
 
-    #[test]
+    #[crate::test]
     fn transfer_to_device_round_trip() -> VortexResult<()> {
         let (pool, stream) = setup()?;
         let data: Vec<u8> = (0..=255u8).collect();
@@ -387,7 +386,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn transfer_puts_buffer_inflight() -> VortexResult<()> {
         let (pool, stream) = setup()?;
 
@@ -411,7 +410,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn pool_reclaims_after_transfer_completes() -> VortexResult<()> {
         let (pool, stream) = setup()?;
 
@@ -444,7 +443,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn drop_returns_buffer_to_pool() -> VortexResult<()> {
         let (pool, _stream) = setup()?;
 
@@ -466,7 +465,7 @@ mod tests {
         Ok(())
     }
 
-    #[test]
+    #[crate::test]
     fn transfer_consumes_inner_so_drop_is_noop() -> VortexResult<()> {
         let (pool, stream) = setup()?;