From a41c532ace08c0d56432f28e6e3df4480fa5fd48 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 19 Feb 2026 09:21:40 +0000 Subject: [PATCH 1/4] GH-35806: [R] Improve error message for null type inference with sparse CSV data When a CSV column contains only missing values in the first block of data, Arrow infers the type as null. If a non-null value appears later, the conversion fails with an unhelpful error suggesting `skip = 1`. This change adds a specific check for "conversion error to null" and provides a more helpful message explaining the cause (type inference from sparse data) and the solution (specify column types explicitly). Co-Authored-By: Claude Opus 4.5 --- r/R/util.R | 15 +++++++++++++++ r/tests/testthat/test-dataset-csv.R | 18 ++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/r/R/util.R b/r/R/util.R index c63e1ee5459..acbd39e2037 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -196,6 +196,21 @@ repeat_value_as_array <- function(object, n) { } handle_csv_read_error <- function(msg, call, schema) { + # Handle null type inference issue with sparse data + if (grepl("conversion error to null", msg)) { + msg <- c( + msg, + i = paste( + "Column type was inferred as null because the first block of data", + "(default 1MB, set via `block_size` in read options) contained only", + "missing values. Try specifying the column types explicitly using the", + "`col_types` or `schema` argument." 
+ ) ) abort(msg, call = call) } + + # Handle schema + header row issue if (grepl("conversion error", msg) && inherits(schema, "Schema")) { msg <- c( msg, diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index 749d1672ac5..145a376da97 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -711,3 +711,21 @@ test_that("open_dataset() with `decimal_point` argument", { tibble(x = 1.2, y = "c") ) }) + + +test_that("more informative error when column inferred as null due to sparse data (GH-35806)", { tf <- tempfile() on.exit(unlink(tf)) + + # Create a CSV where the second column has NAs in the first rows + # but a value later - this causes Arrow to infer null type + writeLines(c("x,y", paste0(1:100, ",")), tf) + write("101,foo", tf, append = TRUE) + + # Use small block_size to force type inference from only the first rows + expect_error( + open_dataset(tf, format = "csv", read_options = csv_read_options(block_size = 100L)) |> + collect(), + "inferred as null" + ) +}) From 2154bdc27d3b052a43da18506b28c99e5d1efc4c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Feb 2026 14:16:31 +0000 Subject: [PATCH 2/4] Only give 1 option as to what to do, remove redundant comments --- r/R/util.R | 5 ++--- r/tests/testthat/test-dataset-csv.R | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/r/R/util.R b/r/R/util.R index acbd39e2037..aa102b5005a 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -202,9 +202,8 @@ handle_csv_read_error <- function(msg, call, schema) { msg, i = paste( "Column type was inferred as null because the first block of data", - "(default 1MB, set via `block_size` in read options) contained only", - "missing values. Try specifying the column types explicitly using the", - "`col_types` or `schema` argument." + "contained only missing values. See `?csv_read_options` for how to", + "set a larger value." 
) ) abort(msg, call = call) diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index 145a376da97..5cf8e23b097 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -712,19 +712,19 @@ test_that("open_dataset() with `decimal_point` argument", { ) }) - test_that("more informative error when column inferred as null due to sparse data (GH-35806)", { tf <- tempfile() on.exit(unlink(tf)) - # Create a CSV where the second column has NAs in the first rows - # but a value later - this causes Arrow to infer null type writeLines(c("x,y", paste0(1:100, ",")), tf) write("101,foo", tf, append = TRUE) - # Use small block_size to force type inference from only the first rows expect_error( - open_dataset(tf, format = "csv", read_options = csv_read_options(block_size = 100L)) |> + open_dataset( + tf, + format = "csv", + read_options = csv_read_options(block_size = 100L) + ) |> collect(), "inferred as null" ) From 3538a2de1be231ba259b89fa7280fdd5a4f91ba9 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Feb 2026 14:19:09 +0000 Subject: [PATCH 3/4] Remove redundant comments --- r/R/util.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/R/util.R b/r/R/util.R index aa102b5005a..83b22f2d9cf 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -196,7 +196,6 @@ repeat_value_as_array <- function(object, n) { } handle_csv_read_error <- function(msg, call, schema) { - # Handle null type inference issue with sparse data if (grepl("conversion error to null", msg)) { msg <- c( msg, From 921701a7c989c6ece644416c97aae477bcefbe86 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Feb 2026 14:19:16 +0000 Subject: [PATCH 4/4] Remove redundant comments --- r/R/util.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/R/util.R b/r/R/util.R index 83b22f2d9cf..cb98358b432 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -208,7 +208,6 @@ handle_csv_read_error <- function(msg, call, schema) { abort(msg, call = call) } - # Handle 
schema + header row issue if (grepl("conversion error", msg) && inherits(schema, "Schema")) { msg <- c( msg,