From a41c532ace08c0d56432f28e6e3df4480fa5fd48 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Thu, 19 Feb 2026 09:21:40 +0000 Subject: [PATCH 1/4] GH-35806: [R] Improve error message for null type inference with sparse CSV data When a CSV column contains only missing values in the first block of data, Arrow infers the type as null. If a non-null value appears later, the conversion fails with an unhelpful error suggesting `skip = 1`. This change adds a specific check for "conversion error to null" and provides a more helpful message explaining the cause (type inference from sparse data) and the solution (specify column types explicitly). Co-Authored-By: Claude Opus 4.5 --- r/R/util.R | 15 +++++++++++++++ r/tests/testthat/test-dataset-csv.R | 18 ++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/r/R/util.R b/r/R/util.R index c63e1ee5459..acbd39e2037 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -196,6 +196,21 @@ repeat_value_as_array <- function(object, n) { } handle_csv_read_error <- function(msg, call, schema) { + # Handle null type inference issue with sparse data + if (grepl("conversion error to null", msg)) { + msg <- c( + msg, + i = paste( + "Column type was inferred as null because the first block of data", + "(default 1MB, set via `block_size` in read options) contained only", + "missing values. Try specifying the column types explicitly using the", + "`col_types` or `schema` argument." 
+ ) ) abort(msg, call = call) } + + # Handle schema + header row issue if (grepl("conversion error", msg) && inherits(schema, "Schema")) { msg <- c( msg, diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index 749d1672ac5..145a376da97 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -711,3 +711,21 @@ test_that("open_dataset() with `decimal_point` argument", { tibble(x = 1.2, y = "c") ) }) + + +test_that("more informative error when column inferred as null due to sparse data (GH-35806)", { tf <- tempfile() on.exit(unlink(tf)) + + # Create a CSV where the second column has NAs in the first rows + # but a value later - this causes Arrow to infer null type + writeLines(c("x,y", paste0(1:100, ",")), tf) + write("101,foo", tf, append = TRUE) + + # Use small block_size to force type inference from only the first rows + expect_error( + open_dataset(tf, format = "csv", read_options = csv_read_options(block_size = 100L)) |> + collect(), + "inferred as null" + ) +}) From 2154bdc27d3b052a43da18506b28c99e5d1efc4c Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Feb 2026 14:16:31 +0000 Subject: [PATCH 2/4] Only give 1 option as to what to do, remove redundant comments --- r/R/util.R | 5 ++--- r/tests/testthat/test-dataset-csv.R | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/r/R/util.R b/r/R/util.R index acbd39e2037..aa102b5005a 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -202,9 +202,8 @@ handle_csv_read_error <- function(msg, call, schema) { msg, i = paste( "Column type was inferred as null because the first block of data", - "(default 1MB, set via `block_size` in read options) contained only", - "missing values. Try specifying the column types explicitly using the", - "`col_types` or `schema` argument." + "contained only missing values. See `?csv_read_options` for how to", + "set a larger value." 
) ) abort(msg, call = call) diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R index 145a376da97..5cf8e23b097 100644 --- a/r/tests/testthat/test-dataset-csv.R +++ b/r/tests/testthat/test-dataset-csv.R @@ -712,19 +712,19 @@ test_that("open_dataset() with `decimal_point` argument", { ) }) - test_that("more informative error when column inferred as null due to sparse data (GH-35806)", { tf <- tempfile() on.exit(unlink(tf)) - # Create a CSV where the second column has NAs in the first rows - # but a value later - this causes Arrow to infer null type writeLines(c("x,y", paste0(1:100, ",")), tf) write("101,foo", tf, append = TRUE) - # Use small block_size to force type inference from only the first rows expect_error( - open_dataset(tf, format = "csv", read_options = csv_read_options(block_size = 100L)) |> + open_dataset( + tf, + format = "csv", + read_options = csv_read_options(block_size = 100L) + ) |> collect(), "inferred as null" ) From 3538a2de1be231ba259b89fa7280fdd5a4f91ba9 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Feb 2026 14:19:09 +0000 Subject: [PATCH 3/4] Remove redundant comments --- r/R/util.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/R/util.R b/r/R/util.R index aa102b5005a..83b22f2d9cf 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -196,7 +196,6 @@ repeat_value_as_array <- function(object, n) { } handle_csv_read_error <- function(msg, call, schema) { - # Handle null type inference issue with sparse data if (grepl("conversion error to null", msg)) { msg <- c( msg, From 921701a7c989c6ece644416c97aae477bcefbe86 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 23 Feb 2026 14:19:16 +0000 Subject: [PATCH 4/4] Remove redundant comments --- r/R/util.R | 1 - 1 file changed, 1 deletion(-) diff --git a/r/R/util.R b/r/R/util.R index 83b22f2d9cf..cb98358b432 100644 --- a/r/R/util.R +++ b/r/R/util.R @@ -208,7 +208,6 @@ handle_csv_read_error <- function(msg, call, schema) { abort(msg, call = call) } - # Handle 
schema + header row issue if (grepl("conversion error", msg) && inherits(schema, "Schema")) { msg <- c( msg,