From a4eb9757de2ae4e9a1e203811ff727081a466ce7 Mon Sep 17 00:00:00 2001
From: Hyukjin Kwon
Date: Tue, 23 Dec 2025 15:04:00 +0900
Subject: [PATCH 1/2] [R] Add test coverage for joins with duplicate columns and type casting

---
 r/tests/testthat/test-dplyr-join.R | 37 ++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-join.R b/r/tests/testthat/test-dplyr-join.R
index 51ca528a644..b64f093cd95 100644
--- a/r/tests/testthat/test-dplyr-join.R
+++ b/r/tests/testthat/test-dplyr-join.R
@@ -188,8 +188,41 @@ test_that("Error handling for unsupported expressions in join_by", {
   )
 })
 
-# TODO: test duplicate col names
-# TODO: casting: int and float columns?
+test_that("joins with duplicate column names", {
+  # When column names are duplicated (not in by), suffixes are added
+  left_dup <- tibble::tibble(
+    x = 1:5,
+    y = 1:5,
+    z = letters[1:5]
+  )
+  right_dup <- tibble::tibble(
+    x = 1:5,
+    y = 6:10,
+    z = LETTERS[1:5]
+  )
+
+  compare_dplyr_binding(
+    .input |>
+      left_join(right_dup, by = "x") |>
+      collect(),
+    left_dup
+  )
+
+  compare_dplyr_binding(
+    .input |>
+      inner_join(right_dup, by = "x") |>
+      collect(),
+    left_dup
+  )
+
+  # Test with custom suffixes
+  compare_dplyr_binding(
+    .input |>
+      left_join(right_dup, by = "x", suffix = c("_left", "_right")) |>
+      collect(),
+    left_dup
+  )
+})
 
 test_that("right_join", {
   compare_dplyr_binding(

From ee3578ea66789afc5702a1c1e2f3a1243c150c41 Mon Sep 17 00:00:00 2001
From: Hyukjin Kwon
Date: Mon, 29 Dec 2025 10:25:36 +0900
Subject: [PATCH 2/2] Address a review comment

---
 r/tests/testthat/test-dplyr-join.R | 92 ++++++++++++++++++++----------
 1 file changed, 63 insertions(+), 29 deletions(-)

diff --git a/r/tests/testthat/test-dplyr-join.R b/r/tests/testthat/test-dplyr-join.R
index b64f093cd95..ce7ed2e63fc 100644
--- a/r/tests/testthat/test-dplyr-join.R
+++ b/r/tests/testthat/test-dplyr-join.R
@@ -191,26 +191,29 @@ test_that("Error handling for unsupported expressions in join_by", {
 test_that("joins with duplicate column names", {
   # When column names are duplicated (not in by), suffixes are added
   left_dup <- tibble::tibble(
-    x = 1:5,
-    y = 1:5,
-    z = letters[1:5]
+    key = 1:5,
+    shared = 1:5,
+    shared_float = c(1.1, 2.2, 3.3, 4.4, 5.5),
+    left_unique = letters[1:5]
   )
   right_dup <- tibble::tibble(
-    x = 1:5,
-    y = 6:10,
-    z = LETTERS[1:5]
+    key = 1:5,
+    shared = 6:10,
+    shared_float = c(6.1, 7.2, 8.3, 9.4, 10.5),
+    right_unique = LETTERS[1:5]
   )
 
+  # Test with default suffixes (.x and .y)
   compare_dplyr_binding(
     .input |>
-      left_join(right_dup, by = "x") |>
+      left_join(right_dup, by = "key") |>
       collect(),
     left_dup
   )
 
   compare_dplyr_binding(
     .input |>
-      inner_join(right_dup, by = "x") |>
+      inner_join(right_dup, by = "key") |>
       collect(),
     left_dup
   )
@@ -218,10 +221,61 @@ test_that("joins with duplicate column names", {
   # Test with custom suffixes
   compare_dplyr_binding(
     .input |>
-      left_join(right_dup, by = "x", suffix = c("_left", "_right")) |>
+      left_join(right_dup, by = "key", suffix = c("_left", "_right")) |>
+      collect(),
+    left_dup
+  )
+
+  compare_dplyr_binding(
+    .input |>
+      inner_join(right_dup, by = "key", suffix = c("_left", "_right")) |>
       collect(),
     left_dup
   )
+
+  # Test that column names are correctly suffixed
+  # Verify exact column names match expected pattern using the same fixture
+  result <- arrow_table(left_dup) |>
+    inner_join(
+      arrow_table(right_dup),
+      by = "key",
+      suffix = c("_left", "_right")
+    ) |>
+    collect()
+  res_col_names <- names(result)
+  # Column order: join key first, then left table columns (with suffixes),
+  # then right table columns (with suffixes)
+  expected_col_names <- c(
+    "key",
+    "shared_left",
+    "shared_float_left",
+    "left_unique",
+    "shared_right",
+    "shared_float_right",
+    "right_unique"
+  )
+  expect_equal(expected_col_names, res_col_names)
+})
+
+test_that("joins with incompatible types for join keys", {
+  # Test that joining on columns with incompatible types (int vs float) fails
+  # Arrow requires join keys to have compatible types - type casting is not
+  # automatically performed for join keys
+  left_int <- Table$create(
+    x = c(1L, 2L),
+    shared = c(10L, 20L)
+  )
+  right_float <- Table$create(
+    x = c(1.0, 2.0),
+    shared = c(10.1, 20.2)
+  )
+
+  expect_error(
+    left_int |>
+      left_join(right_float, by = "x") |>
+      collect(),
+    "Incompatible data types for corresponding join field keys"
+  )
 })
 
 test_that("right_join", {
@@ -350,26 +404,6 @@ test_that("arrow dplyr query correctly filters then joins", {
   )
 })
 
-test_that("suffix", {
-  left_suf <- Table$create(
-    key = c(1, 2),
-    left_unique = c(2.1, 3.1),
-    shared = c(10.1, 10.3)
-  )
-
-  right_suf <- Table$create(
-    key = c(1, 2, 3, 10, 20),
-    right_unique = c(1.1, 1.2, 3.1, 4.1, 4.3),
-    shared = c(20.1, 30, 40, 50, 60)
-  )
-
-  join_op <- inner_join(left_suf, right_suf, by = "key", suffix = c("_left", "_right"))
-  output <- collect(join_op)
-  res_col_names <- names(output)
-  expected_col_names <- c("key", "left_unique", "shared_left", "right_unique", "shared_right")
-  expect_equal(expected_col_names, res_col_names)
-})
-
 test_that("suffix and implicit schema", {
   left_suf <- Table$create(
     key = c(1, 2),