From 388ef8c7663753bad09180e19805f2fb87f5ed2b Mon Sep 17 00:00:00 2001 From: coffeecookey Date: Mon, 29 Dec 2025 03:14:23 +0530 Subject: [PATCH 01/25] implementing NSE to cube --- R/groupingsets.R | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/R/groupingsets.R b/R/groupingsets.R index f5fc2101f1..f04c506fc7 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -29,6 +29,25 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { stopf("Argument 'id' must be a logical scalar.") if (missing(j)) stopf("Argument 'j' is required") + #implementing NSE in cube + jj = substitute(j) + usesSD = any(all.vars(jj) == ".SD") + if (usesSD) { + if (missing(.SDcols)) { + .SDcols = names(x)[vapply(x, is.numeric, logical(1L))] + } else { + sub.result = substitute(.SDcols) + if (is.call(sub.result)) { + #.SDcols = eval_with_cols(sub.result, names(x)) + check_var = eval_with_cols(sub.result, names(x)) + if (!is.null(check_var)) { + .SDcols = eval_with_cols(sub.result, names(x)) + } + } + } + } else { + .SDcols = NULL + } # generate grouping sets for cube - power set: http://stackoverflow.com/a/32187892/2490497 n = length(by) keepBool = sapply(2L^(seq_len(n)-1L), function(k) rep(c(FALSE, TRUE), times=k, each=((2L^n)/(2L*k)))) From 3e96dfc95a595139f6b6486a6a2924f8aeac6d0f Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Mon, 29 Dec 2025 03:41:39 +0530 Subject: [PATCH 02/25] implementing NSE in cube --- R/groupingsets.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/groupingsets.R b/R/groupingsets.R index f04c506fc7..54ddd26194 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -29,7 +29,7 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { stopf("Argument 'id' must be a logical scalar.") if (missing(j)) stopf("Argument 'j' is required") - #implementing NSE in cube + #implementing NSE in cube jj = substitute(j) usesSD = any(all.vars(jj) == ".SD") if (usesSD) { @@ -48,6 +48,7 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { } else { .SDcols = NULL } + # generate grouping sets for cube - power set: http://stackoverflow.com/a/32187892/2490497 n = length(by) keepBool = sapply(2L^(seq_len(n)-1L), function(k) rep(c(FALSE, TRUE), times=k, each=((2L^n)/(2L*k)))) From e1eb87a5f60d8292579c326dcdcfbd5b6193903f Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Mon, 29 Dec 2025 03:49:27 +0530 Subject: [PATCH 03/25] removed trailing whitespace --- R/groupingsets.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/groupingsets.R b/R/groupingsets.R index 54ddd26194..93e36899e1 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -29,7 +29,8 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { stopf("Argument 'id' must be a logical scalar.") if (missing(j)) stopf("Argument 'j' is required") - #implementing NSE in cube + + #implementing NSE in cube jj = substitute(j) usesSD = any(all.vars(jj) == ".SD") if (usesSD) { @@ -48,7 +49,7 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) 
{ } else { .SDcols = NULL } - + # generate grouping sets for cube - power set: http://stackoverflow.com/a/32187892/2490497 n = length(by) keepBool = sapply(2L^(seq_len(n)-1L), function(k) rep(c(FALSE, TRUE), times=k, each=((2L^n)/(2L*k)))) From b6adef94fabc78213edb4099594abd5b44fd195b Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Mon, 29 Dec 2025 03:53:21 +0530 Subject: [PATCH 04/25] removed trailing whitespace --- R/groupingsets.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/groupingsets.R b/R/groupingsets.R index 93e36899e1..21ec65bd42 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -29,7 +29,6 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { stopf("Argument 'id' must be a logical scalar.") if (missing(j)) stopf("Argument 'j' is required") - #implementing NSE in cube jj = substitute(j) usesSD = any(all.vars(jj) == ".SD") From 47bb2c390b7800a6f712b71a2e23988ba1fae3e1 Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Wed, 31 Dec 2025 15:13:26 +0530 Subject: [PATCH 05/25] revised implementation of NSE in cube --- ..Rcheck/00check.log | 13 +++++++++++++ Makevars | 3 +++ R/groupingsets.R | 42 +++++++++++++++++++++++++++++++++++------- 3 files changed, 51 insertions(+), 7 deletions(-) create mode 100644 ..Rcheck/00check.log create mode 100644 Makevars diff --git a/..Rcheck/00check.log b/..Rcheck/00check.log new file mode 100644 index 0000000000..2d743cd4d3 --- /dev/null +++ b/..Rcheck/00check.log @@ -0,0 +1,13 @@ +* using log directory ‘/Users/tanishaojha/Desktop/Code-folder/data.table/..Rcheck’ +* using R version 4.5.1 (2025-06-13) +* using platform: aarch64-apple-darwin20 +* R was compiled by + Apple clang version 16.0.0 (clang-1600.0.26.6) + GNU Fortran (GCC) 14.2.0 +* running under: macOS Sequoia 15.5 +* using session charset: UTF-8 +* checking for file ‘./DESCRIPTION’ ... ERROR +Required fields missing or empty: + ‘Author’ ‘Maintainer’ +* DONE +Status: 1 ERROR diff --git a/Makevars b/Makevars new file mode 100644 index 0000000000..dd8d74e972 --- /dev/null +++ b/Makevars @@ -0,0 +1,3 @@ +CPPFLAGS += -I/opt/homebrew/opt/gettext/include +LDFLAGS += -L/opt/homebrew/opt/gettext/lib + diff --git a/R/groupingsets.R b/R/groupingsets.R index 21ec65bd42..26e01e2a3c 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -29,26 +29,54 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) 
{ stopf("Argument 'id' must be a logical scalar.") if (missing(j)) stopf("Argument 'j' is required") - #implementing NSE in cube + + # Implementing NSE in cube jj = substitute(j) + bysub = substitute(by) + names_x = names(x) + + allbyvars = intersect(all.vars(bysub), names_x) usesSD = any(all.vars(jj) == ".SD") + if (usesSD) { if (missing(.SDcols)) { - .SDcols = names(x)[vapply(x, is.numeric, logical(1L))] + ansvars = sdvars = setdiff(unique(names_x), union(by, allbyvars)) + ansvals = match(ansvars, names_x) } else { sub.result = substitute(.SDcols) if (is.call(sub.result)) { - #.SDcols = eval_with_cols(sub.result, names(x)) - check_var = eval_with_cols(sub.result, names(x)) - if (!is.null(check_var)) { - .SDcols = eval_with_cols(sub.result, names(x)) + call_name = as.character(sub.result[[1L]]) + if (call_name %in% c("patterns", "is.numeric", "is.character", "is.factor")) { + .SDcols = eval_with_cols(sub.result, names_x) + } else { + .SDcols = eval(sub.result, parent.frame()) } + } else { + .SDcols = eval(sub.result, parent.frame()) + } + if (is.character(.SDcols)) { + if (!all(idx = .SDcols %chin% names_x)) + stopf("Some items of .SDcols are not column names: %s", + paste(.SDcols[!idx], collapse = ", ")) + ansvars = sdvars = .SDcols + ansvals = match(ansvars, names_x) + } else if (is.numeric(.SDcols)) { + ansvals = as.integer(.SDcols) + ansvars = sdvars = names_x[ansvals] + } else if (is.logical(.SDcols)) { + if (length(.SDcols) != length(names_x)) + stopf(".SDcols is a logical vector of length %d but there are %d columns", + length(.SDcols), length(names_x)) + ansvals = which(.SDcols) + ansvars = sdvars = names_x[ansvals] + } else { + stopf(".SDcols must be character, numeric, or logical") } } } else { .SDcols = NULL } - + # generate grouping sets for cube - power set: http://stackoverflow.com/a/32187892/2490497 n = length(by) keepBool = sapply(2L^(seq_len(n)-1L), function(k) rep(c(FALSE, TRUE), times=k, each=((2L^n)/(2L*k)))) From e9876cb8af7ad1f4df58607542263428efd3c5b4 Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Wed, 31 Dec 2025 15:31:52 +0530 Subject: [PATCH 06/25] cleaning up the code --- R/groupingsets.R | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/R/groupingsets.R b/R/groupingsets.R index 26e01e2a3c..4890bf5652 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -29,15 +29,12 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { stopf("Argument 'id' must be a logical scalar.") if (missing(j)) stopf("Argument 'j' is required") - # Implementing NSE in cube jj = substitute(j) bysub = substitute(by) names_x = names(x) - allbyvars = intersect(all.vars(bysub), names_x) usesSD = any(all.vars(jj) == ".SD") - if (usesSD) { if (missing(.SDcols)) { ansvars = sdvars = setdiff(unique(names_x), union(by, allbyvars)) @@ -55,9 +52,9 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { .SDcols = eval(sub.result, parent.frame()) } if (is.character(.SDcols)) { - if (!all(idx = .SDcols %chin% names_x)) - stopf("Some items of .SDcols are not column names: %s", - paste(.SDcols[!idx], collapse = ", ")) + idx = .SDcols %chin% names_x + if (any(!idx)) + stopf("Some items of .SDcols are not column names: %s", toString(.SDcols[!idx])) ansvars = sdvars = .SDcols ansvals = match(ansvars, names_x) } else if (is.numeric(.SDcols)) { @@ -65,8 +62,7 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) 
{ ansvars = sdvars = names_x[ansvals] } else if (is.logical(.SDcols)) { if (length(.SDcols) != length(names_x)) - stopf(".SDcols is a logical vector of length %d but there are %d columns", - length(.SDcols), length(names_x)) + stopf(".SDcols is a logical vector of length %d but there are %d columns", length(.SDcols), length(names_x)) ansvals = which(.SDcols) ansvars = sdvars = names_x[ansvals] } else { @@ -76,7 +72,6 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { } else { .SDcols = NULL } - # generate grouping sets for cube - power set: http://stackoverflow.com/a/32187892/2490497 n = length(by) keepBool = sapply(2L^(seq_len(n)-1L), function(k) rep(c(FALSE, TRUE), times=k, each=((2L^n)/(2L*k)))) From 2a15cb9d4cbbad25e27a240b4fe83c05da43502b Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Wed, 31 Dec 2025 15:37:35 +0530 Subject: [PATCH 07/25] more cleaning --- R/groupingsets.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/groupingsets.R b/R/groupingsets.R index 4890bf5652..da0e8fe3d0 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -53,7 +53,7 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { } if (is.character(.SDcols)) { idx = .SDcols %chin% names_x - if (any(!idx)) + if (!all(idx)) stopf("Some items of .SDcols are not column names: %s", toString(.SDcols[!idx])) ansvars = sdvars = .SDcols ansvals = match(ansvars, names_x) From 0ae97fabd9660b306bc5c2abe9c7a45cf07ac8e7 Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Wed, 31 Dec 2025 20:23:27 +0530 Subject: [PATCH 08/25] removed unnecessary changes to code --- R/groupingsets.R | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/R/groupingsets.R b/R/groupingsets.R index da0e8fe3d0..faeebc59fe 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -29,7 +29,7 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { stopf("Argument 'id' must be a logical scalar.") if (missing(j)) stopf("Argument 'j' is required") - # Implementing NSE in cube +# Implementing NSE in cube jj = substitute(j) bysub = substitute(by) names_x = names(x) @@ -41,13 +41,8 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) 
{ ansvals = match(ansvars, names_x) } else { sub.result = substitute(.SDcols) - if (is.call(sub.result)) { - call_name = as.character(sub.result[[1L]]) - if (call_name %in% c("patterns", "is.numeric", "is.character", "is.factor")) { - .SDcols = eval_with_cols(sub.result, names_x) - } else { - .SDcols = eval(sub.result, parent.frame()) - } + if (is.call(sub.result) && as.character(sub.result[[1L]]) == "patterns") { + .SDcols = eval_with_cols(sub.result, names_x) } else { .SDcols = eval(sub.result, parent.frame()) } From 7215a4c857c8f9953557af8ebfa1a5a3e227189c Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Thu, 1 Jan 2026 11:50:56 +0530 Subject: [PATCH 09/25] adding some tests to the code --- inst/tests/tests.Rraw | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8cad916e3b..12440c968e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11503,6 +11503,31 @@ test(1750.34, character(0)), id = TRUE) ) +test(1750.35, + cube(dt, j = lapply(.SD, sum), by = c("color","year","status"), id=TRUE, .SDcols=patterns("value")), + groupingsets(dt, j = lapply(.SD, sum), by = c("color","year","status"), .SDcols = "value", + sets = list(c("color","year","status"), + c("color","year"), + c("color","status"), + "color", + c("year","status"), + "year", + "status", + character(0)), + id = TRUE) +) +test(1750.36, + cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = c("value", "BADCOL")), + error = "Some items of \\.SDcols are not column names" +) +test(1750.37, + cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = c(TRUE, FALSE)), + error = "\\.SDcols is a logical vector of length" +) +test(1750.38, + cube(dt, j = sum(value), by = "year", .SDcols = "value", id = TRUE), + cube(dt, j = sum(value), by = "year", id = TRUE) +) # grouping sets with integer64 if (test_bit64) { set.seed(26) From 2b7b7ff796c632b75187d15202b5119de2c1676c Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Thu, 1 Jan 2026 12:21:08 +0530 Subject: [PATCH 10/25] more tests --- inst/tests/tests.Rraw | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 12440c968e..93ecf96d97 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11528,6 +11528,18 @@ test(1750.38, cube(dt, j = sum(value), by = "year", .SDcols = "value", id = TRUE), cube(dt, j = sum(value), by = "year", id = TRUE) ) +test(1750.39, + cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = 5L, id = TRUE), + cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = "value", id = TRUE) +) +test(1750.40, + cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = names(dt) == "value", id = TRUE), + cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = "value", id = TRUE) +) +test( + cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = list("value")), + error = "\\.SDcols must be character, numeric, or logical" +) # grouping sets with integer64 if (test_bit64) { set.seed(26) From 431ca81914b3f2319cbc7369c440eec10d42f99d Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Thu, 1 Jan 2026 12:26:39 +0530 Subject: [PATCH 11/25] Revert "more tests" This reverts commit 2b7b7ff796c632b75187d15202b5119de2c1676c. 
--- inst/tests/tests.Rraw | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 93ecf96d97..12440c968e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11528,18 +11528,6 @@ test(1750.38, cube(dt, j = sum(value), by = "year", .SDcols = "value", id = TRUE), cube(dt, j = sum(value), by = "year", id = TRUE) ) -test(1750.39, - cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = 5L, id = TRUE), - cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = "value", id = TRUE) -) -test(1750.40, - cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = names(dt) == "value", id = TRUE), - cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = "value", id = TRUE) -) -test( - cube(dt, j = lapply(.SD, sum), by = "year", .SDcols = list("value")), - error = "\\.SDcols must be character, numeric, or logical" -) # grouping sets with integer64 if (test_bit64) { set.seed(26) From 3bec96c4b6135912bb0f1e48ea34fac80a17b95e Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Thu, 1 Jan 2026 13:06:34 +0530 Subject: [PATCH 12/25] more tests --- inst/tests/tests.Rraw | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 12440c968e..6c4128a5a3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11525,8 +11525,18 @@ test(1750.37, error = "\\.SDcols is a logical vector of length" ) test(1750.38, - cube(dt, j = sum(value), by = "year", .SDcols = "value", id = TRUE), - cube(dt, j = sum(value), by = "year", id = TRUE) + cube(dt, j = lapply(.SD, mean), by = "color", .SDcols = c(FALSE, FALSE, FALSE, TRUE, FALSE), id=TRUE), + groupingsets(dt, j = lapply(.SD, mean), by = "color", .SDcols = "amount", + sets = list("color", character(0)), + id = TRUE) +) +test(1750.39, + cube(dt, j = lapply(.SD, sum), by = "color", .SDcols = list("amount")), + error = ".SDcols must be character, numeric, or logical" +) +test(1750.40, + cube(dt, j = lapply(.SD, sum), by = "color", .SDcols = c(1, 99)), + error = "out of bounds" ) # grouping sets with integer64 if (test_bit64) { From c16f64dc34e73381eae53778f3cc0bfbffa5f100 Mon Sep 17 00:00:00 2001 From: sisyphuswastaken <69812646+sisyphuswastaken@users.noreply.github.com> Date: Thu, 1 Jan 2026 15:30:11 +0530 Subject: [PATCH 13/25] Delete ..Rcheck/00check.log --- ..Rcheck/00check.log | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 ..Rcheck/00check.log diff --git a/..Rcheck/00check.log b/..Rcheck/00check.log deleted file mode 100644 index 2d743cd4d3..0000000000 --- a/..Rcheck/00check.log +++ /dev/null @@ -1,13 +0,0 @@ -* using log directory ‘/Users/tanishaojha/Desktop/Code-folder/data.table/..Rcheck’ -* using R version 4.5.1 (2025-06-13) -* using platform: aarch64-apple-darwin20 -* R was compiled by - Apple clang version 16.0.0 (clang-1600.0.26.6) - GNU Fortran (GCC) 14.2.0 -* running under: macOS Sequoia 15.5 -* using session charset: UTF-8 -* checking for file ‘./DESCRIPTION’ ... 
ERROR -Required fields missing or empty: - ‘Author’ ‘Maintainer’ -* DONE -Status: 1 ERROR From 4119cee6f1097f5242875f75ccc601f75d5d4e37 Mon Sep 17 00:00:00 2001 From: sisyphuswastaken <69812646+sisyphuswastaken@users.noreply.github.com> Date: Thu, 1 Jan 2026 15:30:22 +0530 Subject: [PATCH 14/25] Delete Makevars --- Makevars | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 Makevars diff --git a/Makevars b/Makevars deleted file mode 100644 index dd8d74e972..0000000000 --- a/Makevars +++ /dev/null @@ -1,3 +0,0 @@ -CPPFLAGS += -I/opt/homebrew/opt/gettext/include -LDFLAGS += -L/opt/homebrew/opt/gettext/lib - From f792b159c749bee9636a2160d198bfd024920e76 Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Fri, 2 Jan 2026 16:15:14 +0530 Subject: [PATCH 15/25] converting the NSE code into helper function --- R/groupingsets.R | 85 ++++++++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 35 deletions(-) diff --git a/R/groupingsets.R b/R/groupingsets.R index faeebc59fe..14e70bd634 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -16,6 +16,48 @@ rollup.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label, enclos = parent.frame()) } +# Helper function to process SDcols +.processSDcols = function(SDcols_sub, SDcols_missing, x, jsub, by, enclos = parent.frame()) { + names_x = names(x) + bysub = substitute(by) + allbyvars = intersect(all.vars(bysub), names_x) + usesSD = any(all.vars(jsub) == ".SD") + if (!usesSD) { + return(NULL) + } + if (SDcols_missing) { + ansvars = sdvars = setdiff(unique(names_x), union(by, allbyvars)) + ansvals = match(ansvars, names_x) + return(list(ansvars = ansvars, sdvars = sdvars, ansvals = ansvals)) + } + sub.result = SDcols_sub + if (is.call(sub.result) && as.character(sub.result[[1L]]) == "patterns") { + .SDcols = eval_with_cols(sub.result, names_x) + } else { + .SDcols = eval(sub.result, enclos) + } + if (is.character(.SDcols)) { + idx = .SDcols %chin% names_x + if (!all(idx)) + stopf("Some items of .SDcols are not column names: %s", toString(.SDcols[!idx])) + ansvars = sdvars = .SDcols + ansvals = match(ansvars, names_x) + } else if (is.numeric(.SDcols)) { + ansvals = as.integer(.SDcols) + if (any(ansvals < 1L | ansvals > length(names_x))) + stopf(".SDcols contains indices out of bounds") + ansvars = sdvars = names_x[ansvals] + } else if (is.logical(.SDcols)) { + if (length(.SDcols) != length(names_x)) + stopf(".SDcols is a logical vector of length %d but there are %d columns", length(.SDcols), length(names_x)) + ansvals = which(.SDcols) + ansvars = sdvars = names_x[ansvals] + } else { + stopf(".SDcols must be character, numeric, or logical") + } + list(ansvars = ansvars, sdvars = sdvars, ansvals = ansvals) +} + cube = function(x, ...) { UseMethod("cube") } @@ -29,43 +71,16 @@ cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) 
{ stopf("Argument 'id' must be a logical scalar.") if (missing(j)) stopf("Argument 'j' is required") -# Implementing NSE in cube + # Implementing NSE in cube using the helper, .processSDcols jj = substitute(j) - bysub = substitute(by) - names_x = names(x) - allbyvars = intersect(all.vars(bysub), names_x) - usesSD = any(all.vars(jj) == ".SD") - if (usesSD) { - if (missing(.SDcols)) { - ansvars = sdvars = setdiff(unique(names_x), union(by, allbyvars)) - ansvals = match(ansvars, names_x) - } else { - sub.result = substitute(.SDcols) - if (is.call(sub.result) && as.character(sub.result[[1L]]) == "patterns") { - .SDcols = eval_with_cols(sub.result, names_x) - } else { - .SDcols = eval(sub.result, parent.frame()) - } - if (is.character(.SDcols)) { - idx = .SDcols %chin% names_x - if (!all(idx)) - stopf("Some items of .SDcols are not column names: %s", toString(.SDcols[!idx])) - ansvars = sdvars = .SDcols - ansvals = match(ansvars, names_x) - } else if (is.numeric(.SDcols)) { - ansvals = as.integer(.SDcols) - ansvars = sdvars = names_x[ansvals] - } else if (is.logical(.SDcols)) { - if (length(.SDcols) != length(names_x)) - stopf(".SDcols is a logical vector of length %d but there are %d columns", length(.SDcols), length(names_x)) - ansvals = which(.SDcols) - ansvars = sdvars = names_x[ansvals] - } else { - stopf(".SDcols must be character, numeric, or logical") - } - } - } else { + sdcols_result = .processSDcols(SDcols_sub = substitute(.SDcols), SDcols_missing = missing(.SDcols), x = x, jsub = jj, by = by, enclos = parent.frame()) + if (is.null(sdcols_result)) { .SDcols = NULL + } else { + ansvars = sdcols_result$ansvars + sdvars = sdcols_result$sdvars + ansvals = sdcols_result$ansvals + .SDcols = sdvars } # generate grouping sets for cube - power set: http://stackoverflow.com/a/32187892/2490497 n = length(by) From 97b4536b44504b0af24a322be6599c1bf7b4111e Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Sun, 4 Jan 2026 00:25:06 +0530 Subject: [PATCH 16/25] including helper in [.data.table --- R/data.table.R | 114 +++++++++++++++++++++++++----------------- R/groupingsets.R | 2 + inst/tests/tests.Rraw | 5 ++ 3 files changed, 76 insertions(+), 45 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 27c985e44c..5fc9bfa1ce 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1036,56 +1036,80 @@ replace_dot_alias = function(e) { while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]] # fix for R-Forge #5190. colsub[[1L]] gave error when it's a symbol. # NB: _unary_ '-', not _binary_ '-' (#5826). Test for '!' length-2 should be redundant but low-cost & keeps code concise. 
- if (colsub %iscall% c("!", "-") && length(colsub) == 2L) { - negate_sdcols = TRUE - colsub = colsub[[2L]] - } else negate_sdcols = FALSE - # fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4))) - while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]] - if (colsub %iscall% ':' && length(colsub)==3L && !is.call(colsub[[2L]]) && !is.call(colsub[[3L]])) { - # .SDcols is of the format a:b, ensure none of : arguments is a call data.table(V1=-1L, V2=-2L, V3=-3L)[,.SD,.SDcols=-V2:-V1] #4231 - .SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) - } else { - if (colsub %iscall% 'patterns') { - patterns_list_or_vector = eval_with_cols(colsub, names_x) - .SDcols = if (is.list(patterns_list_or_vector)) { - # each pattern gives a new filter condition, intersect the end result - Reduce(intersect, patterns_list_or_vector) + try_processSDcols = !(colsub %iscall% c("!", "-") && length(colsub) == 2L) && !(colsub %iscall% ':') && !(colsub %iscall% 'patterns') + if (try_processSDcols) { + tryCatch({ + sdcols_result = .processSDcols( + SDcols_sub = colsub, + SDcols_missing = FALSE, + x = x, + jsub = jsub, + by = union(bynames, allbyvars), + enclos = parent.frame() + ) + if (!is.null(sdcols_result)) { + ansvars = sdvars = sdcols_result$ansvars + ansvals = sdcols_result$ansvals } else { - patterns_list_or_vector + try_processSDcols = FALSE } + }, error = function(e) { + try_processSDcols <<- FALSE + }) + } + if (!try_processSDcols) { + + if (colsub %iscall% c("!", "-") && length(colsub) == 2L) { + negate_sdcols = TRUE + colsub = colsub[[2L]] + } else negate_sdcols = FALSE + # fix for #1216, make sure the parentheses are peeled from expr of the form (((1:4))) + while(colsub %iscall% "(") colsub = as.list(colsub)[[-1L]] + if (colsub %iscall% ':' && length(colsub)==3L && !is.call(colsub[[2L]]) && !is.call(colsub[[3L]])) { + # .SDcols is of the format a:b, ensure none of : arguments is a call data.table(V1=-1L, V2=-2L, V3=-3L)[,.SD,.SDcols=-V2:-V1] #4231 + .SDcols = eval(colsub, setattr(as.list(seq_along(x)), 'names', names_x), parent.frame()) } else { - .SDcols = eval(colsub, parent.frame(), parent.frame()) - # allow filtering via function in .SDcols, #3950 - if (is.function(.SDcols)) { - .SDcols = lapply(x, .SDcols) - if (any(idx <- lengths(.SDcols) > 1L | vapply_1c(.SDcols, typeof) != 'logical' | vapply_1b(.SDcols, anyNA))) - stopf("When .SDcols is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. However, these conditions were not met for: %s", brackify(names(x)[idx])) - .SDcols = unlist(.SDcols, use.names = FALSE) + if (colsub %iscall% 'patterns') { + patterns_list_or_vector = eval_with_cols(colsub, names_x) + .SDcols = if (is.list(patterns_list_or_vector)) { + # each pattern gives a new filter condition, intersect the end result + Reduce(intersect, patterns_list_or_vector) + } else { + patterns_list_or_vector + } + } else { + .SDcols = eval(colsub, parent.frame(), parent.frame()) + # allow filtering via function in .SDcols, #3950 + if (is.function(.SDcols)) { + .SDcols = lapply(x, .SDcols) + if (any(idx <- lengths(.SDcols) > 1L | vapply_1c(.SDcols, typeof) != 'logical' | vapply_1b(.SDcols, anyNA))) + stopf("When .SDcols is a function, it is applied to each column; the output of this function must be a non-missing boolean scalar signalling inclusion/exclusion of the column. 
However, these conditions were not met for: %s", brackify(names(x)[idx])) + .SDcols = unlist(.SDcols, use.names = FALSE) + } } } - } - if (anyNA(.SDcols)) - stopf(".SDcols missing at the following indices: %s", brackify(which(is.na(.SDcols)))) - if (is.logical(.SDcols)) { - if (length(.SDcols)!=length(x)) stopf(".SDcols is a logical vector of length %d but there are %d columns", length(.SDcols), length(x)) - ansvals = which_(.SDcols, !negate_sdcols) - ansvars = sdvars = names_x[ansvals] - } else if (is.numeric(.SDcols)) { - .SDcols = as.integer(.SDcols) - # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff' - if (length(unique(sign(.SDcols))) > 1L) stopf(".SDcols is numeric but has both +ve and -ve indices") - if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L)) - stopf(".SDcols is numeric but out of bounds [1, %d] at: %s", ncol(x), brackify(which(idx))) - ansvars = sdvars = if (negate_sdcols) dupdiff(names_x[-.SDcols], bynames) else names_x[.SDcols] - ansvals = if (negate_sdcols) setdiff(seq_along(names(x)), c(.SDcols, which(names(x) %chin% bynames))) else .SDcols - } else { - if (!is.character(.SDcols)) stopf(".SDcols should be column numbers or names") - if (!all(idx <- .SDcols %chin% names_x)) - stopf("Some items of .SDcols are not column names: %s", brackify(.SDcols[!idx])) - ansvars = sdvars = if (negate_sdcols) setdiff(names_x, c(.SDcols, bynames)) else .SDcols - # dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove) - ansvals = chmatch(ansvars, names_x) + if (anyNA(.SDcols)) + stopf(".SDcols missing at the following indices: %s", brackify(which(is.na(.SDcols)))) + if (is.logical(.SDcols)) { + if (length(.SDcols)!=length(x)) stopf(".SDcols is a logical vector of length %d but there are %d columns", length(.SDcols), length(x)) + ansvals = which_(.SDcols, !negate_sdcols) + ansvars = sdvars = names_x[ansvals] + } else if (is.numeric(.SDcols)) { + .SDcols = as.integer(.SDcols) + # if .SDcols is numeric, use 'dupdiff' instead of 'setdiff' + if (length(unique(sign(.SDcols))) > 1L) stopf(".SDcols is numeric but has both +ve and -ve indices") + if (any(idx <- abs(.SDcols)>ncol(x) | abs(.SDcols)<1L)) + stopf(".SDcols is numeric but out of bounds [1, %d] at: %s", ncol(x), brackify(which(idx))) + ansvars = sdvars = if (negate_sdcols) dupdiff(names_x[-.SDcols], bynames) else names_x[.SDcols] + ansvals = if (negate_sdcols) setdiff(seq_along(names(x)), c(.SDcols, which(names(x) %chin% bynames))) else .SDcols + } else { + if (!is.character(.SDcols)) stopf(".SDcols should be column numbers or names") + if (!all(idx <- .SDcols %chin% names_x)) + stopf("Some items of .SDcols are not column names: %s", brackify(.SDcols[!idx])) + ansvars = sdvars = if (negate_sdcols) setdiff(names_x, c(.SDcols, bynames)) else .SDcols + # dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove) + ansvals = chmatch(ansvars, names_x) + } } } # fix for long standing FR/bug, #495 and #484 diff --git a/R/groupingsets.R b/R/groupingsets.R index 14e70bd634..d18d0def62 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -36,6 +36,8 @@ rollup.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) 
{ } else { .SDcols = eval(sub.result, enclos) } + if (anyNA(.SDcols)) + stopf(".SDcols missing at the following indices: %s", brackify(which(is.na(.SDcols)))) if (is.character(.SDcols)) { idx = .SDcols %chin% names_x if (!all(idx)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6c4128a5a3..6248cf2ca5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11468,6 +11468,11 @@ sets = local({ by=c("color","year","status") lapply(length(by):0, function(i) by[0:i]) }) +test(1750.25, + cube(copy(dt), j = lapply(.SD, mean), by = "color", .SDcols = 4, id=TRUE), + groupingsets(dt, j = lapply(.SD, mean), by = "color", .SDcols = "amount", + sets = list("color", character(0)), id = TRUE) +) test(1750.31, rollup(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), id=TRUE), groupingsets(dt, j = c(list(cnt=.N), lapply(.SD, sum)), by = c("color","year","status"), sets=sets, id=TRUE) From 159559564056740b153eb12c10b8fe28ac2684d3 Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Sun, 4 Jan 2026 00:28:52 +0530 Subject: [PATCH 17/25] removing trailing spaces --- R/data.table.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 5fc9bfa1ce..0924b1b7c5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1054,11 +1054,10 @@ replace_dot_alias = function(e) { try_processSDcols = FALSE } }, error = function(e) { - try_processSDcols <<- FALSE + try_processSDcols <<- FALSE }) } if (!try_processSDcols) { - if (colsub %iscall% c("!", "-") && length(colsub) == 2L) { negate_sdcols = TRUE colsub = colsub[[2L]] From 25fbd53d0bc4e9dac56e96bc1eb58a6dda5615b3 Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Sun, 4 Jan 2026 00:48:08 +0530 Subject: [PATCH 18/25] removed super assignment --- R/data.table.R | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 0924b1b7c5..156a813079 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1038,24 +1038,25 @@ replace_dot_alias = function(e) { # NB: _unary_ '-', not _binary_ '-' (#5826). Test for '!' length-2 should be redundant but low-cost & keeps code concise. 
try_processSDcols = !(colsub %iscall% c("!", "-") && length(colsub) == 2L) && !(colsub %iscall% ':') && !(colsub %iscall% 'patterns') if (try_processSDcols) { - tryCatch({ - sdcols_result = .processSDcols( - SDcols_sub = colsub, - SDcols_missing = FALSE, - x = x, - jsub = jsub, - by = union(bynames, allbyvars), - enclos = parent.frame() - ) + sdcols_result = tryCatch({ + .processSDcols( + SDcols_sub = colsub, + SDcols_missing = FALSE, + x = x, + jsub = jsub, + by = union(bynames, allbyvars), + enclos = parent.frame() + ) + }, error = function(e) { + NULL + }) if (!is.null(sdcols_result)) { ansvars = sdvars = sdcols_result$ansvars ansvals = sdcols_result$ansvals + try_processSDcols = TRUE } else { try_processSDcols = FALSE } - }, error = function(e) { - try_processSDcols <<- FALSE - }) } if (!try_processSDcols) { if (colsub %iscall% c("!", "-") && length(colsub) == 2L) { From 32d078d606cf8e82acf810696241d697969ebc9b Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Sun, 4 Jan 2026 00:52:00 +0530 Subject: [PATCH 19/25] removed trailing whitespace --- R/data.table.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 156a813079..87f6a5cc5a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1040,15 +1040,15 @@ replace_dot_alias = function(e) { if (try_processSDcols) { sdcols_result = tryCatch({ .processSDcols( - SDcols_sub = colsub, - SDcols_missing = FALSE, - x = x, - jsub = jsub, - by = union(bynames, allbyvars), + SDcols_sub = colsub, + SDcols_missing = FALSE, + x = x, + jsub = jsub, + by = union(bynames, allbyvars), enclos = parent.frame() ) }, error = function(e) { - NULL + NULL }) if (!is.null(sdcols_result)) { ansvars = sdvars = sdcols_result$ansvars From b2e6171d7a027a143a1b264cdb19f0e6fe733e9e Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Sun, 4 Jan 2026 02:11:22 +0530 Subject: [PATCH 20/25] review changes --- R/data.table.R | 2 +- R/groupingsets.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 87f6a5cc5a..9c6eaa8478 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1044,7 +1044,7 @@ replace_dot_alias = function(e) { SDcols_missing = FALSE, x = x, jsub = jsub, - by = union(bynames, allbyvars), + by = substitute(by), enclos = parent.frame() ) }, error = function(e) { diff --git a/R/groupingsets.R b/R/groupingsets.R index d18d0def62..29105e3163 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -21,7 +21,7 @@ rollup.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { names_x = names(x) bysub = substitute(by) allbyvars = intersect(all.vars(bysub), names_x) - usesSD = any(all.vars(jsub) == ".SD") + usesSD = ".SD" %chin% all.vars(jsub) if (!usesSD) { return(NULL) } @@ -31,7 +31,7 @@ rollup.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) 
{ return(list(ansvars = ansvars, sdvars = sdvars, ansvals = ansvals)) } sub.result = SDcols_sub - if (is.call(sub.result) && as.character(sub.result[[1L]]) == "patterns") { + if (sub.result %iscall% "patterns") { .SDcols = eval_with_cols(sub.result, names_x) } else { .SDcols = eval(sub.result, enclos) From a6d00fabf6ad740eaafa1053d3c0b87e0fb59f2c Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Tue, 6 Jan 2026 14:02:28 +0530 Subject: [PATCH 21/25] moved helper to data.table.R --- R/data.table.R | 44 ++++++++++++++++++++++++++++++++++++++++++++ R/groupingsets.R | 44 -------------------------------------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 9c6eaa8478..277fb7e913 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -147,6 +147,50 @@ replace_dot_alias = function(e) { } } +# Helper function to process SDcols +.processSDcols = function(SDcols_sub, SDcols_missing, x, jsub, by, enclos = parent.frame()) { + names_x = names(x) + bysub = substitute(by) + allbyvars = intersect(all.vars(bysub), names_x) + usesSD = ".SD" %chin% all.vars(jsub) + if (!usesSD) { + return(NULL) + } + if (SDcols_missing) { + ansvars = sdvars = setdiff(unique(names_x), union(by, allbyvars)) + ansvals = match(ansvars, names_x) + return(list(ansvars = ansvars, sdvars = sdvars, ansvals = ansvals)) + } + sub.result = SDcols_sub + if (sub.result %iscall% "patterns") { + .SDcols = eval_with_cols(sub.result, names_x) + } else { + .SDcols = eval(sub.result, enclos) + } + if (anyNA(.SDcols)) + stopf(".SDcols missing at the following indices: %s", brackify(which(is.na(.SDcols)))) + if (is.character(.SDcols)) { + idx = .SDcols %chin% names_x + if (!all(idx)) + stopf("Some items of .SDcols are not column names: %s", toString(.SDcols[!idx])) + ansvars = sdvars = .SDcols + ansvals = match(ansvars, names_x) + } else if (is.numeric(.SDcols)) { + ansvals = as.integer(.SDcols) + if (any(ansvals < 1L | ansvals > length(names_x))) + stopf(".SDcols contains indices out of bounds") + ansvars = sdvars = names_x[ansvals] + } else if (is.logical(.SDcols)) { + if (length(.SDcols) != length(names_x)) + stopf(".SDcols is a logical vector of length %d but there are %d columns", length(.SDcols), length(names_x)) + ansvals = which(.SDcols) + ansvars = sdvars = names_x[ansvals] + } else { + stopf(".SDcols must be character, numeric, or logical") + } + list(ansvars = ansvars, sdvars = sdvars, ansvals = ansvals) +} + "[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0.0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL, showProgress=getOption("datatable.showProgress", interactive())) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could diff --git a/R/groupingsets.R b/R/groupingsets.R index 29105e3163..661ac1af09 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -16,50 +16,6 @@ rollup.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) 
{ groupingsets.data.table(x, by=by, sets=sets, .SDcols=.SDcols, id=id, jj=jj, label=label, enclos = parent.frame()) } -# Helper function to process SDcols -.processSDcols = function(SDcols_sub, SDcols_missing, x, jsub, by, enclos = parent.frame()) { - names_x = names(x) - bysub = substitute(by) - allbyvars = intersect(all.vars(bysub), names_x) - usesSD = ".SD" %chin% all.vars(jsub) - if (!usesSD) { - return(NULL) - } - if (SDcols_missing) { - ansvars = sdvars = setdiff(unique(names_x), union(by, allbyvars)) - ansvals = match(ansvars, names_x) - return(list(ansvars = ansvars, sdvars = sdvars, ansvals = ansvals)) - } - sub.result = SDcols_sub - if (sub.result %iscall% "patterns") { - .SDcols = eval_with_cols(sub.result, names_x) - } else { - .SDcols = eval(sub.result, enclos) - } - if (anyNA(.SDcols)) - stopf(".SDcols missing at the following indices: %s", brackify(which(is.na(.SDcols)))) - if (is.character(.SDcols)) { - idx = .SDcols %chin% names_x - if (!all(idx)) - stopf("Some items of .SDcols are not column names: %s", toString(.SDcols[!idx])) - ansvars = sdvars = .SDcols - ansvals = match(ansvars, names_x) - } else if (is.numeric(.SDcols)) { - ansvals = as.integer(.SDcols) - if (any(ansvals < 1L | ansvals > length(names_x))) - stopf(".SDcols contains indices out of bounds") - ansvars = sdvars = names_x[ansvals] - } else if (is.logical(.SDcols)) { - if (length(.SDcols) != length(names_x)) - stopf(".SDcols is a logical vector of length %d but there are %d columns", length(.SDcols), length(names_x)) - ansvals = which(.SDcols) - ansvars = sdvars = names_x[ansvals] - } else { - stopf(".SDcols must be character, numeric, or logical") - } - list(ansvars = ansvars, sdvars = sdvars, ansvals = ansvals) -} - cube = function(x, ...) { UseMethod("cube") } From cde91ae2b84de39869fea64baa4192f1c51a3c59 Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Thu, 8 Jan 2026 00:07:49 +0530 Subject: [PATCH 22/25] removed try catch block --- R/data.table.R | 49 ++++++++++++++++++++++++------------------- inst/tests/tests.Rraw | 4 ---- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 277fb7e913..4f6c1ad0e3 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -162,11 +162,21 @@ replace_dot_alias = function(e) { return(list(ansvars = ansvars, sdvars = sdvars, ansvals = ansvals)) } sub.result = SDcols_sub + if (sub.result %iscall% ':' && length(sub.result) == 3L) { + return(NULL) + } + if (sub.result %iscall% c("!", "-") && length(sub.result) == 2L) { + negate_sdcols = TRUE + sub.result = sub.result[[2L]] + } else negate_sdcols = FALSE if (sub.result %iscall% "patterns") { .SDcols = eval_with_cols(sub.result, names_x) } else { .SDcols = eval(sub.result, enclos) } + if (!is.character(.SDcols) && !is.numeric(.SDcols) && !is.logical(.SDcols)) { + return(NULL) + } if (anyNA(.SDcols)) stopf(".SDcols missing at the following indices: %s", brackify(which(is.na(.SDcols)))) if (is.character(.SDcols)) { @@ -177,9 +187,10 @@ replace_dot_alias = function(e) { ansvals = match(ansvars, names_x) } else if (is.numeric(.SDcols)) { ansvals = as.integer(.SDcols) - if (any(ansvals < 1L | ansvals > length(names_x))) - stopf(".SDcols contains indices out of bounds") + if (length(unique(sign(.SDcols))) > 1L) stopf(".SDcols is numeric but has both +ve and -ve indices") + if (any(idx <- abs(.SDcols) > ncol(x) | abs(.SDcols) < 1L)) stopf(".SDcols is numeric but out of bounds [1, %d] at: %s", ncol(x), brackify(which(idx))) ansvars = sdvars = names_x[ansvals] + ansvals = if 
(negate_sdcols) setdiff(seq_along(names(x)), c(.SDcols, which(names(x) %chin% bynames))) else .SDcols } else if (is.logical(.SDcols)) { if (length(.SDcols) != length(names_x)) stopf(".SDcols is a logical vector of length %d but there are %d columns", length(.SDcols), length(names_x)) @@ -1082,25 +1093,21 @@ replace_dot_alias = function(e) { # NB: _unary_ '-', not _binary_ '-' (#5826). Test for '!' length-2 should be redundant but low-cost & keeps code concise. try_processSDcols = !(colsub %iscall% c("!", "-") && length(colsub) == 2L) && !(colsub %iscall% ':') && !(colsub %iscall% 'patterns') if (try_processSDcols) { - sdcols_result = tryCatch({ - .processSDcols( - SDcols_sub = colsub, - SDcols_missing = FALSE, - x = x, - jsub = jsub, - by = substitute(by), - enclos = parent.frame() - ) - }, error = function(e) { - NULL - }) - if (!is.null(sdcols_result)) { - ansvars = sdvars = sdcols_result$ansvars - ansvals = sdcols_result$ansvals - try_processSDcols = TRUE - } else { - try_processSDcols = FALSE - } + sdcols_result = .processSDcols( + SDcols_sub = colsub, + SDcols_missing = FALSE, + x = x, + jsub = jsub, + by = substitute(by), + enclos = parent.frame() + ) + if (!is.null(sdcols_result)) { + ansvars = sdvars = sdcols_result$ansvars + ansvals = sdcols_result$ansvals + } + else { + try_processSDcols = FALSE + } } if (!try_processSDcols) { if (colsub %iscall% c("!", "-") && length(colsub) == 2L) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 23ef7ff129..6894b23ba4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11536,10 +11536,6 @@ test(1750.38, id = TRUE) ) test(1750.39, - cube(dt, j = lapply(.SD, sum), by = "color", .SDcols = list("amount")), - error = ".SDcols must be character, numeric, or logical" -) -test(1750.40, cube(dt, j = lapply(.SD, sum), by = "color", .SDcols = c(1, 99)), error = "out of bounds" ) From f2f2d9f81c520a9562bd80864096723095d6313d Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Thu, 8 Jan 2026 00:11:59 +0530 Subject: [PATCH 23/25] removed whitespace --- R/data.table.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 4f6c1ad0e3..1d4a147def 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1104,9 +1104,9 @@ replace_dot_alias = function(e) { if (!is.null(sdcols_result)) { ansvars = sdvars = sdcols_result$ansvars ansvals = sdcols_result$ansvals - } + } else { - try_processSDcols = FALSE + try_processSDcols = FALSE } } if (!try_processSDcols) { From 734d5cc3e2fc4e61dd2e317dac0a8db522d75aeb Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Sun, 18 Jan 2026 13:11:15 +0530 Subject: [PATCH 24/25] additional tests for processSDcols --- inst/tests/tests.Rraw | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6894b23ba4..c4c20b1776 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4061,6 +4061,15 @@ set.seed(45) DT = data.table(x=c("A", "A", "C", "C"), y=1:4, z=runif(4)) test(1137.12, DT[, lapply(.SD, sum), by=x, .SDcols=-"y"], DT[, lapply(.SD, sum), by=x, .SDcols="z"]) +# Additional tests for .processSDcols coverage +DT <- data.table(x=1:5, y=6:10, z=11:15) +test(1137.13, DT[, .SD, .SDcols = c("x", NA_character_, "y")], error = ".SDcols missing at the following indices") +test(1137.14, DT[, .SD, .SDcols = c(TRUE, FALSE, TRUE)], DT[, c(1,3), with=FALSE]) +test(1137.15, DT[, .SD, .SDcols = c(TRUE, FALSE)], error = ".SDcols is a logical vector of length 2 but there are 3 columns") +test(1137.16, 
DT[, .SD, .SDcols = !c(FALSE, TRUE, FALSE)], DT[, c(1,3), with=FALSE]) +test(1137.17, DT[, .SD, .SDcols = 5L], error = "out of bounds.*1.*3.*at") +test(1137.18, DT[, .SD, .SDcols = c("x", "notexist")], error = "Some items of .SDcols are not column names.*notexist") + # test for FR #353 / R-Forge #353 - print.data.table gets new argument "row.names", default=TRUE. if FALSE, the row-names don't get printed # Thanks to Eddi for `capture.output` function! DT <- data.table(x=1:5, y=6:10) From 157808f17502857b3a858fe3a58979f5ff35b16d Mon Sep 17 00:00:00 2001 From: sisyphuswastaken Date: Sun, 18 Jan 2026 13:26:48 +0530 Subject: [PATCH 25/25] Revert "Merge branch 'master' into fix-7543" This reverts commit 62714b5c2893de5b1f148e2cc7a7cd87ecb74867, reversing changes made to 734d5cc3e2fc4e61dd2e317dac0a8db522d75aeb. --- .github/CONTRIBUTING.md | 2 +- DESCRIPTION | 3 +- NEWS.md | 36 +- R/as.data.table.R | 8 +- R/data.table.R | 365 +++-- R/duplicated.R | 2 +- R/fcast.R | 12 +- R/fmelt.R | 2 +- R/foverlaps.R | 8 +- R/frank.R | 2 +- R/fread.R | 6 +- R/frollapply.R | 10 +- R/groupingsets.R | 17 +- R/helpers.R | 4 +- R/merge.R | 8 +- R/rowwiseDT.R | 8 - R/setkey.R | 4 +- R/test.data.table.R | 55 +- inst/tests/benchmark.Rraw | 26 +- inst/tests/froll.Rraw | 4 +- inst/tests/nafill.Rraw | 167 +-- inst/tests/optimize.Rraw | 486 ------- inst/tests/tests.Rraw | 1216 ++++++++++++----- man/data.table.Rd | 2 +- man/nafill.Rd | 5 - man/setkey.Rd | 2 +- man/setorder.Rd | 2 +- man/test.Rd | 5 +- src/assign.c | 218 ++- src/between.c | 4 +- src/data.table.h | 11 - src/dogroups.c | 54 +- src/fastmean.c | 4 +- src/fifelse.c | 14 +- src/fmelt.c | 12 +- src/fread.c | 4 +- src/froll.c | 12 +- src/frollR.c | 4 +- src/fsort.c | 2 +- src/gsumm.c | 26 +- src/mergelist.c | 26 +- src/nafill.c | 87 +- src/openmp-utils.c | 2 +- src/rbindlist.c | 14 +- src/transpose.c | 6 +- src/types.h | 2 +- src/uniqlist.c | 2 +- src/utils.c | 33 +- src/wrappers.c | 7 +- tests/optimize.R | 2 - vignettes/es/datatable-benchmarking.Rmd | 147 -- vignettes/es/datatable-faq.Rmd | 674 --------- vignettes/es/datatable-fread-and-fwrite.Rmd | 295 ---- vignettes/es/datatable-importing.Rmd | 298 ---- vignettes/es/datatable-intro.Rmd | 726 ---------- vignettes/es/datatable-joins.Rmd | 725 ---------- vignettes/es/datatable-keys-fast-subset.Rmd | 500 ------- vignettes/es/datatable-programming.Rmd | 485 ------- .../es/datatable-reference-semantics.Rmd | 413 ------ vignettes/es/datatable-reshape.Rmd | 295 ---- vignettes/es/datatable-sd-usage.Rmd | 262 ---- ...le-secondary-indices-and-auto-indexing.Rmd | 364 ----- 62 files changed, 1434 insertions(+), 6763 deletions(-) delete mode 100644 inst/tests/optimize.Rraw delete mode 100644 tests/optimize.R delete mode 100644 vignettes/es/datatable-benchmarking.Rmd delete mode 100644 vignettes/es/datatable-faq.Rmd delete mode 100644 vignettes/es/datatable-fread-and-fwrite.Rmd delete mode 100644 vignettes/es/datatable-importing.Rmd delete mode 100644 vignettes/es/datatable-intro.Rmd delete mode 100644 vignettes/es/datatable-joins.Rmd delete mode 100644 vignettes/es/datatable-keys-fast-subset.Rmd delete mode 100644 vignettes/es/datatable-programming.Rmd delete mode 100644 vignettes/es/datatable-reference-semantics.Rmd delete mode 100644 vignettes/es/datatable-reshape.Rmd delete mode 100644 vignettes/es/datatable-sd-usage.Rmd delete mode 100644 vignettes/es/datatable-secondary-indices-and-auto-indexing.Rmd diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 374bcd81fb..713508c06b 100644 --- 
a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -43,7 +43,7 @@ If you are not fixing an open issue and you are confident, you do not need to fi 1. Unless the change is trivial (e.g. typo fix) there must be a new entry in [NEWS](https://github.com/Rdatatable/data.table/blob/master/NEWS.md). Please use the name of the user-visible function at the start to aid users quickly scanning the news item, explain the feature/bug, and thank issue/PR contributors by name. Follow the prevailing style at the top of the file; e.g. "fread with X in Y circumstance would error/segfault, [#123](issue link). Thanks to _(them)_ for reporting and _(me)_ for fixing". These are the release notes that others quickly skim and search so please use relevant helpful keywords with that in mind. If the problem was an error/warning/message, please include the error/warning/message in the news item so that folks searching for it will have a better chance of finding the news item, and to make the news item more specific. Bug fixes are under the bug fixes section heading so there is no need to include words such as "is fixed" in the first sentence because that is implicit. Please link to the issue(s) not to the PR (unless there is just a PR and no issue); if folk are interested in the detail of the fix they can get to the PR from the issue. Again: please follow the prevailing style of news items. Doing so makes it much easier and faster to review and merge. -1. Please create the PR against the `master` branch. You can do that by forking the repository, creating a new branch for your feature/bugfix in the forked project, and then using that as a base for your pull requests. After your first successful merged PR you will very likely be invited to be a [project member](https://github.com/orgs/Rdatatable/teams/project-members). This will allow you to create your next branch directly in the project which is easier and more convenient than forking, both for you and for Rdatatable's maintainers. Working on _branches_ on this (Rdatatable) project will _not_ affect the core code, so you can feel free to experiment as a project member; the core code is on the `master` branch, and only data.table [committers](https://github.com/orgs/Rdatatable/teams/committers) can push/merge code there. Remember to do `git pull upstream your_branch` (where `upstream` is the name of the remote for `Rdatatable/data.table` seen in `git remote -v`) each time you want to add something; this will keep your local branch up to date with remote, in case anyone makes commits you don't yet have locally. This will reduce the number of merge conflicts you will need to deal with. Do not use `git rebase` on a branch where other users are pushing. +1. Please create the PR against the `master` branch. You can do that by forking the repository, creating a new branch for your feature/bugfix in the forked project, and then using that as a base for your pull requests. After your first successful merged PR you will very likely be invited to be a [project member](https://github.com/orgs/Rdatatable/teams/project-members). This will allow you to create your next branch directly in the project which is easier and more convenient than forking, both for you and for Rdatatable's maintainers. 
Working on _branches_ on this (Rdatatable) project will _not_ affect the core code, so you can feel free to experiment as a project member; the core code is on the `master` branch, and only data.table [committers](https://github.com/orgs/Rdatatable/teams/maintainers) can push/merge code there. Remember to do `git pull upstream your_branch` (where `upstream` is the name of the remote for `Rdatatable/data.table` seen in `git remote -v`) each time you want to add something; this will keep your local branch up to date with remote, in case anyone makes commits you don't yet have locally. This will reduce the number of merge conflicts you will need to deal with. Do not use `git rebase` on a branch where other users are pushing. 1. Just one feature/bugfix per PR please. Small changes are easier to review and accept than big sweeping changes. Sometimes big sweeping changes are needed and we just have to discuss those case by case. diff --git a/DESCRIPTION b/DESCRIPTION index 3860e0c480..bba370c417 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -106,6 +106,5 @@ Authors@R: c( person(given="@badasahog", role="ctb", comment="GitHub user"), person("Vinit", "Thakur", role="ctb"), person("Mukul", "Kumar", role="ctb"), - person("Ildikó", "Czeller", role="ctb"), - person("Manmita", "Das", role="ctb") + person("Ildikó", "Czeller", role="ctb") ) diff --git a/NEWS.md b/NEWS.md index f218b93fbc..23e8d5c873 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,17 +14,15 @@ ### NEW FEATURES -1. `nafill()`, `setnafill()` extended to work on logical and factor vectors (part of [#3992](https://github.com/Rdatatable/data.table/issues/3992)). Includes support for `Date`, `IDate`, `POSIXct`, etc. `nafill()` works for character vectors, but not yet `setnafill()`. Thanks @jangorecki for the request and @jangorecki and @MichaelChirico for the PRs. +1. `nafill()`, `setnafill()` extended to work on logical vectors (part of [#3992](https://github.com/Rdatatable/data.table/issues/3992)). Thanks @jangorecki for the request and @MichaelChirico for the PR. -2. `[,showProgress=]` and `options(datatable.showProgress)` now accept an integer to control the progress bar update interval in seconds, allowing finer control over progress reporting frequency; `TRUE` uses the default 3-second interval, [#6514](https://github.com/Rdatatable/data.table/issues/6514). Thanks @ethanbsmith for the report and @ben-schwen for the PR. +### Notes -3. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation. +1. {data.table} now depends on R 3.5.0 (2018). 
- This rewrite also introduces several new optimizations: - - Enables Map in addition to lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) - - lapply optimization works without .SD (e.g., `lapply(list(col1, col2), fun)` -> `list(fun(col1), fun(col2))` [#5032](https://github.com/Rdatatable/data.table/issues/5032) - - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))` will use GForce, saving the need to coerce `x` in a setup step) [#2934](https://github.com/Rdatatable/data.table/issues/2934) - - Arithmetic operation support in GForce (e.g., `max(x) - min(x)` will use GForce on both `max(x)` and `min(x)`, saving the need to do the subtraction in a follow-up step) [#3815](https://github.com/Rdatatable/data.table/issues/3815) +2. pydatatable compatibility layer in `fread()` and `fwrite()` has been removed, [#7069](https://github.com/Rdatatable/data.table/issues/7069). Thanks @badasahog for the report and the PR. + +3. Vignettes are now built using `litedown` instead of `knitr`, [#6394](https://github.com/Rdatatable/data.table/issues/6394). Thanks @jangorecki for the suggestion and @ben-schwen and @aitap for the implementation. ### BUG FIXES @@ -34,28 +32,6 @@ 3. `fread("file://...")` works for file URIs with spaces, [#7550](https://github.com/Rdatatable/data.table/issues/7550). Thanks @aitap for the report and @MichaelChirico for the PR. -4. `sum()` by group is correct with missing entries and GForce activated ([#7571](https://github.com/Rdatatable/data.table/issues/7571)). Thanks to @rweberc for the report and @manmita for the fix. The issue was caused by a faulty early `break` that spilled between groups, and resulted in silently incorrect results! - -5. `fread(text=)` could segfault when reading text input ending with a `\x1a` (ASCII SUB) character after a long line, [#7407](https://github.com/Rdatatable/data.table/issues/7407) which is solved by adding check for eof. Thanks @aitap for the report and @manmita for the fix. - -6. `rowwiseDT()` now provides a helpful error message when a complex object that is not a list (e.g., a function) is provided as a cell value, instructing the user to wrap it in `list()`, [#7219](https://github.com/Rdatatable/data.table/issues/7219). Thanks @kylebutts for the report and @venom1204 for the fix. - -7. Fixed compilation failure like "error: unknown type name 'siginfo_t'" in v1.18.0 in some strict environments, e.g., FreeBSD, where the header file declaring the POSIX function `waitid` does not transitively include the header file defining the `siginfo_t` type, [#7516](https://github.com/rdatatable/data.table/issues/7516). Thanks to @jszhao for the report and @aitap for the fix. - -8. When fixing duplicate factor levels, `setattr()` no longer crashes upon encountering missing factor values, [#7595](https://github.com/Rdatatable/data.table/issues/7595). Thanks to @sindribaldur for the report and @aitap for the fix. - -### Notes - -1. {data.table} now depends on R 3.5.0 (2018). - -2. pydatatable compatibility layer in `fread()` and `fwrite()` has been removed, [#7069](https://github.com/Rdatatable/data.table/issues/7069). Thanks @badasahog for the report and the PR. - -3. Vignettes are now built using `litedown` instead of `knitr`, [#6394](https://github.com/Rdatatable/data.table/issues/6394). Thanks @jangorecki for the suggestion and @ben-schwen and @aitap for the implementation. - -4. 
Removed use of non-API `ATTRIB`, `SET_ATTRIB`, and `findVar` [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks @aitap for the continued assiduous work here, and @MichaelChirico for the easy fix to replace `findVar` with `R_getVar`. - -5. The data.table test suite is a bit more robust to lacking UTF-8 support via a new `requires_utf8` argument to `test()` to skip tests when UTF-8 support is not available, [#7336](https://github.com/Rdatatable/data.table/issues/7336). Thanks @MichaelChirico for the suggestion and @ben-schwen for the implementation. - ## data.table [v1.18.0](https://github.com/Rdatatable/data.table/milestone/37?closed=1) 23 December 2025 ### BREAKING CHANGE diff --git a/R/as.data.table.R b/R/as.data.table.R index 1894235462..5b4dea6975 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -89,10 +89,10 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va stopf("as.data.table.array method should only be called for arrays with 3+ dimensions; use the matrix method for 2-dimensional arrays") if (!is.character(value.name) || length(value.name)!=1L || is.na(value.name) || !nzchar(value.name)) stopf("Argument 'value.name' must be scalar character, non-NA and at least one character") - if (!isTRUEorFALSE(sorted)) - stopf("'%s' must be TRUE or FALSE", "sorted") - if (!isTRUEorFALSE(na.rm)) - stopf("'%s' must be TRUE or FALSE", "na.rm") + if (!is.logical(sorted) || length(sorted)!=1L || is.na(sorted)) + stopf("Argument 'sorted' must be scalar logical and non-NA") + if (!is.logical(na.rm) || length(na.rm)!=1L || is.na(na.rm)) + stopf("Argument 'na.rm' must be scalar logical and non-NA") if (!missing(sorted) && !is.null(key)) stopf("Please provide either 'key' or 'sorted', but not both.") diff --git a/R/data.table.R b/R/data.table.R index 67f22805ee..1d4a147def 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -299,7 +299,7 @@ replace_dot_alias = function(e) { if ((isTRUE(which)||is.na(which)) && !missing(j)) stopf("which==%s (meaning return row numbers) but j is also supplied. Either you need row numbers or the result of j, but only one type of result can be returned.", which) if (is.null(nomatch) && is.na(which)) stopf("which=NA with nomatch=0|NULL would always return an empty vector. Please change or remove either which or nomatch.") if (!with && missing(j)) stopf("j must be provided when with=FALSE") - if (!missing(by) && !(isTRUEorFALSE(showProgress) || (is.numeric(showProgress) && length(showProgress)==1L && showProgress >= 0))) stopf("showProgress must be TRUE, FALSE, or a single non-negative number") # nocov + if (!missing(by) && !isTRUEorFALSE(showProgress)) stopf("%s must be TRUE or FALSE", "showProgress") irows = NULL # Meaning all rows. We avoid creating 1:nrow(x) for efficiency. notjoin = FALSE rightcols = leftcols = integer() @@ -1584,8 +1584,8 @@ replace_dot_alias = function(e) { ########################################################################### o__ = integer() - if (".N" %chin% ansvars) stopf("The column '.%1$s' can't be grouped because it conflicts with the special .%1$s variable. Try setnames(DT,'.%1$s','%1$s') first.", "N") - if (".I" %chin% ansvars) stopf("The column '.%1$s' can't be grouped because it conflicts with the special .%1$s variable. Try setnames(DT,'.%1$s','%1$s') first.", "I") + if (".N" %chin% ansvars) stopf("The column '.N' can't be grouped because it conflicts with the special .N variable. 
Try setnames(DT,'.N','N') first.") + if (".I" %chin% ansvars) stopf("The column '.I' can't be grouped because it conflicts with the special .I variable. Try setnames(DT,'.I','I') first.") SDenv$.iSD = NULL # null.data.table() SDenv$.xSD = NULL # null.data.table() - introducing for FR #2693 and Gabor's post on fixing for FAQ 2.8 @@ -1724,11 +1724,253 @@ replace_dot_alias = function(e) { SDenv$.NGRP = length(f__) lockBinding(".NGRP", SDenv) - # Determine GForce-optimized query - gforce_result = .attempt_optimize(jsub, jvnames, sdvars, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, parent.frame()) - GForce = gforce_result$GForce - jsub = gforce_result$jsub - jvnames = gforce_result$jvnames + GForce = FALSE + if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N"))) ) { # Ability to turn off if problems or to benchmark the benefit + # Optimization to reduce overhead of calling lapply over and over for each group + oldjsub = jsub + funi = 1L # Fix for #985 + # converted the lapply(.SD, ...) to a function and used below, easier to implement FR #2722 then. + .massageSD = function(jsub) { + txt = as.list(jsub)[-1L] + if (length(names(txt))>1L) .Call(Csetcharvec, names(txt), 2L, "") # fixes bug #110 + fun = txt[[2L]] + if (fun %iscall% "function") { + # Fix for #2381: added SDenv$.SD to 'eval' to take care of cases like: lapply(.SD, function(x) weighted.mean(x, bla)) where "bla" is a column in DT + # http://stackoverflow.com/questions/13441868/data-table-and-stratified-means + # adding this does not compromise in speed (that is, not any lesser than without SDenv$.SD) + # replaced SDenv$.SD to SDenv to deal with Bug #87 reported by Ricardo (Nice catch!) + thisfun = paste0("..FUN", funi) # Fix for #985 + assign(thisfun,eval(fun, SDenv, SDenv), SDenv) # to avoid creating function() for each column of .SD + lockBinding(thisfun,SDenv) + txt[[1L]] = as.name(thisfun) + } else { + if (is.character(fun)) fun = as.name(fun) + txt[[1L]] = fun + } + ans = vector("list", length(sdvars)+1L) + ans[[1L]] = as.name("list") + for (ii in seq_along(sdvars)) { + txt[[2L]] = as.name(sdvars[ii]) + ans[[ii+1L]] = as.call(txt) + } + jsub = as.call(ans) # important no names here + jvnames = sdvars # but here instead + list(jsub, jvnames) + # It may seem inefficient to construct a potentially long expression. But, consider calling + # lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it + # is called, involving small memory allocations. + # The R level lapply calls as.list which needs a shallow copy. + # lapply also does a setAttib of names (duplicating the same names over and over again + # for each group) which is terrible for our needs. We replace all that with a + # (ok, long, but not huge in memory terms) list() which is primitive (so avoids symbol + # lookup), and the eval() inside dogroups hardly has to do anything. All this results in + # overhead minimised. We don't need to worry about the env passed to the eval in a possible + # lapply replacement, or how to pass ... efficiently to it. + # Plus we optimize lapply first, so that mean() can be optimized too as well, next. + } + if (is.name(jsub)) { + if (jsub == ".SD") { + jsub = as.call(c(quote(list), lapply(sdvars, as.name))) + jvnames = sdvars + } + } else if (is.name(jsub[[1L]])) { # Else expect problems with + # g[[ only applies to atomic input, for now, was causing #4159. 
be sure to eval with enclos=parent.frame() for #4612 + subopt = length(jsub) == 3L && + (jsub %iscall% "[" || + (jsub %iscall% "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) && + (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") + headopt = jsub %iscall% c("head", "tail") + firstopt = jsub %iscall% c("first", "last") # fix for #2030 + if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") && + (subopt || headopt || firstopt)) { + if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462 + # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. + jsub = as.call(c(quote(list), lapply(sdvars, function(x) { jsub[[2L]] = as.name(x); jsub }))) + jvnames = sdvars + } else if (jsub %iscall% "lapply" && jsub[[2L]]==".SD" && length(xcols)) { + deparse_ans = .massageSD(jsub) + jsub = deparse_ans[[1L]] + jvnames = deparse_ans[[2L]] + } else if (jsub %iscall% "c" && length(jsub) > 1L) { + # TODO, TO DO: raise the checks for 'jvnames' earlier (where jvnames is set by checking 'jsub') and set 'jvnames' already. + # FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here. + # FR #735 tries to optimise j-expressions of the form c(...) as long as ... contains + # 1) lapply(.SD, ...), 2) simply .SD or .SD[..], 3) .N, 4) list(...) and 5) functions that normally return a single value* + # On 5)* the IMPORTANT point to note is that things that are not wrapped within "list(...)" should *always* + # return length 1 output for us to optimise. Else, there's no equivalent to optimising c(...) to list(...) AFAICT. + # One issue could be that these functions (e.g., mean) can be "re-defined" by the OP to produce a length > 1 output + # Of course this is worrying too much though. If the issue comes up, we'll just remove the relevant optimisations. + # For now, we optimise all functions mentioned in 'optfuns' below. + optfuns = c("max", "min", "mean", "length", "sum", "median", "sd", "var") + is_valid = TRUE + any_SD = FALSE + jsubl = as.list.default(jsub) + oldjvnames = jvnames + jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward. + # Fix for #744. Don't use 'i' in for-loops. It masks the 'i' from the input!! + for (i_ in 2L:length(jsubl)) { + this = jsub[[i_]] + if (is.name(this)) { # no need to check length(this)==1L; is.name() returns single TRUE or FALSE (documented); can't have a vector of names + if (this == ".SD") { # optimise '.SD' alone + any_SD = TRUE + jsubl[[i_]] = lapply(sdvars, as.name) + jvnames = c(jvnames, sdvars) + } else if (this == ".N") { + # don't optimise .I in c(.SD, .I), it's length can be > 1 + # only c(.SD, list(.I)) should be optimised!! .N is always length 1. + jvnames = c(jvnames, gsub("^[.]([N])$", "\\1", this)) + } else { + # jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) + is_valid=FALSE + break + } + } else if (is.call(this)) { + if (this[[1L]] == "lapply" && this[[2L]] == ".SD" && length(xcols)) { + any_SD = TRUE + deparse_ans = .massageSD(this) + funi = funi + 1L # Fix for #985 + jsubl[[i_]] = as.list(deparse_ans[[1L]][-1L]) # just keep the '.' from list(.) + jn__ = deparse_ans[[2L]] + if (isTRUE(nzchar(names(jsubl)[i_]))) { + # Fix for #2311, prepend named arguments of c() to column names of .SD + # e.g. c(mean=lapply(.SD, mean)) + jn__ = paste(names(jsubl)[i_], jn__, sep=".") # sep="." 
for consistency with c(A=list(a=1,b=1)) + } + jvnames = c(jvnames, jn__) + } else if (this[[1L]] == "list") { + # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen + if (length(this) > 1L) { + jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) + if (isTRUE(nzchar(names(jsubl)[i_]))) { + # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* + njl__ = names(jl__) %||% rep("", length(jl__)) + njl__nonblank = nzchar(names(jl__)) + if (length(jl__) > 1L) { + jn__ = paste0(names(jsubl)[i_], seq_along(jl__)) + } else { + jn__ = names(jsubl)[i_] + } + jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".") + } else { + jn__ = names(jl__) %||% rep("", length(jl__)) + } + idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) + if (any(idx)) + jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && + jvnames = c(jvnames, jn__) + jsubl[[i_]] = jl__ + } + } else if (this %iscall% optfuns && length(this)>1L) { + jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) + } else if ( length(this) == 3L && (this[[1L]] == "[" || this[[1L]] == "head") && + this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N") ) { + # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. + any_SD = TRUE + jsubl[[i_]] = lapply(sdvars, function(x) { this[[2L]] = as.name(x); this }) + jvnames = c(jvnames, sdvars) + } else if (any(all.vars(this) == ".SD")) { + # TODO, TO DO: revisit complex cases (as illustrated below) + # complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp] + # hard to optimise such cases (+ difficulty in counting exact columns and therefore names). revert back to no optimisation. + is_valid=FALSE + break + } else { # just to be sure that any other case (I've overlooked) runs smoothly, without optimisation + # TO DO, TODO: maybe a message/warning here so that we can catch the overlooked cases, if any? + is_valid=FALSE + break + } + } else { + is_valid = FALSE + break + } + } + if (!is_valid || !any_SD) { # restore if c(...) doesn't contain lapply(.SD, ..) or if it's just invalid + jvnames = oldjvnames # reset jvnames + jsub = oldjsub # reset jsub + jsubl = as.list.default(jsubl) # reset jsubl + } else { + setattr(jsubl, 'names', NULL) + jsub = as.call(unlist(jsubl, use.names=FALSE)) + jsub[[1L]] = quote(list) + } + } + } + if (verbose) { + if (!identical(oldjsub, jsub)) + catf("lapply optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub,width.cutoff=200L, nlines=1L)) + else + catf("lapply optimization is on, j unchanged as '%s'\n", deparse(jsub,width.cutoff=200L, nlines=1L)) + } + # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with + # nomatch=NULL even now.. but not switching it on yet, will deal it separately. 
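  # Editorial aside (not part of the patch): a user-level view of the GForce branch
  # restored just below. With the default datatable.optimize >= 2L, a j made of plain
  # gfuns on columns is rewritten to g-prefixed calls; the quoted message corresponds
  # to the catf() call in that branch. Data and column names here are illustrative only.
  local({
    DT = data.table(g = rep(1:2, each = 3L), x = rnorm(6L))
    DT[, .(s = sum(x), m = mean(x)), by = g, verbose = TRUE]
    # verbose should include: GForce optimized j to 'list(s = gsum(x), m = gmean(x))' (see ?GForce)
  })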
+ if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__)) { + if (!length(ansvars) && !use.I) { + GForce = FALSE + if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { + GForce = TRUE + if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n",deparse(jsub, width.cutoff=200L, nlines=1L)) + } + } else if (length(lhs) && is.symbol(jsub)) { # turn off GForce for the combination of := and .N + GForce = FALSE + } else { + # Apply GForce + if (jsub %iscall% "list") { + GForce = TRUE + for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { + if (!.gforce_ok(jsub[[ii]], SDenv$.SDall)) {GForce = FALSE; break} + } + } else + GForce = .gforce_ok(jsub, SDenv$.SDall) + if (GForce) { + if (jsub %iscall% "list") + for (ii in seq_along(jsub)[-1L]) { + if (is.N(jsub[[ii]])) next; # For #334 + jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x) + } + else { + # adding argument to ghead/gtail if none is supplied to g-optimized head/tail + if (length(jsub) == 2L && jsub %iscall% c("head", "tail")) jsub[["n"]] = 6L + jsub = .gforce_jsub(jsub, names_x) + } + if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) + } else if (verbose) catf("GForce is on, but not activated for this query; left j unchanged (see ?GForce)\n"); + } + } + if (!GForce && !is.name(jsub)) { + # Still do the old speedup for mean, for now + nomeanopt=FALSE # to be set by .optmean() using <<- inside it + oldjsub = jsub + if (jsub %iscall% "list") { + # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if() + # jsub[[1]]=="list" so the first item of todo will always be FALSE + todo = sapply(jsub, `%iscall%`, 'mean') + if (any(todo)) { + w = which(todo) + jsub[w] = lapply(jsub[w], .optmean) + } + } else if (jsub %iscall% "mean") { + jsub = .optmean(jsub) + } + if (nomeanopt) { + warningf("Unable to optimize call to mean() and could be very slow. You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet optimized.", immediate.=TRUE) + } + if (verbose) { + if (!identical(oldjsub, jsub)) + catf("Old mean optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub, width.cutoff=200L, nlines=1L)) + else + catf("Old mean optimization is on, left j unchanged.\n") + } + assign("Cfastmean", Cfastmean, SDenv) + # Old comments still here for now ... + # Here in case nomeanopt=TRUE or some calls to mean weren't detected somehow. Better but still slow. + # Maybe change to : + # assign("mean", fastmean, SDenv) # neater than the hard work above, but slower + # when fastmean can do trim. 
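  # Editorial aside (not part of the patch): the usage distinction behind the
  # nomeanopt warning above. Only mean(x) and mean(x, na.rm=) calls are rewritten;
  # na.rm must be named because the second positional argument of base mean() is 'trim'.
  # Data below are illustrative only.
  local({
    DT = data.table(g = rep(1:2, each = 3L), x = c(1, NA, 3, 4, 5, NA))
    DT[, mean(x, na.rm = TRUE), by = g]   # optimised: na.rm is named
    # DT[, mean(x, TRUE), by = g]         # TRUE would be matched to 'trim'; triggers the warning above
  })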
+ } + } else if (verbose) { + if (getOption("datatable.optimize")<1L) catf("All optimizations are turned off\n") + else catf("Optimization is on but left j unchanged (single plain symbol): '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) + } if (byjoin) { groups = i grpcols = leftcols # 'leftcols' are the columns in i involved in the join (either head of key(i) or head along i) @@ -1805,7 +2047,7 @@ replace_dot_alias = function(e) { } ans = c(g, ans) } else { - ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose, as.integer(showProgress)) + ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose, showProgress) } # unlock any locked data.table components of the answer, #4159 # MAX_DEPTH prevents possible infinite recursion from truly recursive object, #4173 @@ -2363,7 +2605,7 @@ Ops.data.table = function(e1, e2 = NULL) } split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TRUE, flatten = TRUE, ..., verbose = getOption("datatable.verbose")) { - if (!is.data.table(x)) internal_error("'%s' argument to split.data.table must be a data.table") # nocov + if (!is.data.table(x)) internal_error("x argument to split.data.table must be a data.table") # nocov stopifnot(is.logical(drop), is.logical(sorted), is.logical(keep.by), is.logical(flatten)) # split data.frame way, using `f` and not `by` argument if (!missing(f)) { @@ -2943,10 +3185,10 @@ rowid = function(..., prefix=NULL) { rowidv = function(x, cols=seq_along(x), prefix=NULL) { if (!is.null(prefix) && (!is.character(prefix) || length(prefix) != 1L)) - stopf("'prefix' must be NULL or a character vector of length 1") + stopf("'prefix' must be NULL or a character vector of length 1.") if (is.atomic(x)) { if (!missing(cols) && !is.null(cols)) - stopf("x is a single vector, non-NULL 'cols' doesn't make sense") + stopf("x is a single vector, non-NULL 'cols' doesn't make sense.") cols = 1L x = as_list(x) } else if (!length(cols)) { @@ -2968,10 +3210,10 @@ rleid = function(..., prefix=NULL) { rleidv = function(x, cols=seq_along(x), prefix=NULL) { if (!is.null(prefix) && (!is.character(prefix) || length(prefix) != 1L)) - stopf("'prefix' must be NULL or a character vector of length 1") + stopf("'prefix' must be NULL or a character vector of length 1.") if (is.atomic(x)) { if (!missing(cols) && !is.null(cols)) - stopf("x is a single vector, non-NULL 'cols' doesn't make sense") + stopf("x is a single vector, non-NULL 'cols' doesn't make sense.") cols = 1L x = as_list(x) } else if (!length(cols)) { @@ -3072,25 +3314,22 @@ is_constantish = function(q, check_singleton=FALSE) { length(q) == 3L && is_constantish(q[[3L]], check_singleton = TRUE) } -`.g[_ok` = function(q, x, envir=parent.frame(3L)) { +`.g[_ok` = function(q, x) { length(q) == 3L && is_constantish(q[[3L]], check_singleton = TRUE) && (q[[1L]] != "[[" || eval(call('is.atomic', q[[2L]]), envir=x)) && - !(as.character(q[[3L]]) %chin% names(x)) && is.numeric(q3 <- eval(q[[3L]], envir)) && length(q3)==1L && q3>0L + !(as.character(q[[3L]]) %chin% names(x)) && is.numeric(q3 <- eval(q[[3L]], parent.frame(3L))) && length(q3)==1L && q3>0L } .gweighted.mean_ok = function(q, x) { #3977 q = match.call(gweighted.mean, q) is_constantish(q[["na.rm"]]) && - !(is.symbol(q[["na.rm"]]) && q[["na.rm"]] %chin% names(x)) && (is.null(q[["w"]]) || eval(call('is.numeric', q[["w"]]), envir=x)) } # run GForce for 
simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD .get_gcall = function(q) { if (!is.call(q)) return(NULL) - if (length(q) < 2L) return(NULL) # e.g. list() # is.symbol() is for #1369, #1974 and #2949 - if (!is.symbol(q[[2L]]) && !is.call(q[[2L]])) return(NULL) - if (is.call(q[[2L]]) && !.is_type_conversion(q[[2L]])) return(NULL) + if (!is.symbol(q[[2L]])) return(NULL) q1 = q[[1L]] if (is.symbol(q1)) return(if (q1 %chin% gfuns) q1) if (!q1 %iscall% "::") return(NULL) @@ -3103,79 +3342,31 @@ is_constantish = function(q, check_singleton=FALSE) { # is robust to unnamed expr. Note that NA names are not possible here. .arg_is_narm = function(expr, which=3L) !is.null(nm <- names(expr)[which]) && startsWith(nm, "na") -.is_type_conversion = function(expr) { - is.call(expr) && is.symbol(expr[[1L]]) && expr[[1L]] %chin% - c("as.numeric", "as.double", "as.integer", "as.character", "as.integer64", - "as.complex", "as.logical", "as.Date", "as.POSIXct", "as.factor") -} - -.gforce_ops = c("+", "-", "*", "/", "^", "%%", "%/%") - -.unwrap_conversions = function(expr) { - while (.is_type_conversion(expr) && length(expr) >= 2L) expr = expr[[2L]] - expr -} - -.gforce_ok = function(q, x, envir=parent.frame(2L)) { +.gforce_ok = function(q, x) { if (is.N(q)) return(TRUE) # For #334 - if (!is.call(q)) return(is.numeric(q)) # plain columns are not gforce-able since they might not aggregate (see test 104.1) - if (q %iscall% "(") return(.gforce_ok(q[[2L]], x, envir)) - q1 = .get_gcall(q) - if (!is.null(q1)) { - q2 = .unwrap_conversions(q[[2L]]) - if (!is.symbol(q2) || (!q2 %chin% names(x) && q2 != ".I")) return(FALSE) - if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]) && - !(is.symbol(q[[3L]]) && q[[3L]] %chin% names(x)))) return(TRUE) - return(switch(as.character(q1), - "shift" = .gshift_ok(q), - "weighted.mean" = .gweighted.mean_ok(q, x), - "tail" = , "head" = .ghead_ok(q), - "[[" = , "[" = `.g[_ok`(q, x, envir), - FALSE - )) - } - - # check if arithmetic operator -> recursively validate ALL branches (like in AST) - if (is.symbol(q[[1L]]) && q[[1L]] %chin% .gforce_ops) { - for (i in 2:length(q)) { - if (!.gforce_ok(q[[i]], x, envir)) return(FALSE) - } - return(TRUE) - } - - FALSE + if (is.null(q1)) return(FALSE) + if (!(q2 <- q[[2L]]) %chin% names(x) && q2 != ".I") return(FALSE) # 875 + if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]))) return(TRUE) + switch(as.character(q1), + "shift" = .gshift_ok(q), + "weighted.mean" = .gweighted.mean_ok(q, x), + "tail" = , "head" = .ghead_ok(q), + "[[" = , "[" = `.g[_ok`(q, x), + FALSE + ) } -.gforce_jsub = function(q, names_x, envir=parent.frame(2L)) { - if (!is.call(q)) return(q) - if (q %iscall% "(") { - q[[2L]] = .gforce_jsub(q[[2L]], names_x, envir) - return(q) - } - - q1 = .get_gcall(q) - if (!is.null(q1)) { - call_name = if (is.symbol(q[[1L]])) q[[1L]] else q[[1L]][[3L]] # latter is like data.table::shift, #5942. .gshift_ok checked this will work. - q[[1L]] = as.name(paste0("g", call_name)) - # gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok - # do not evaluate vars present as columns in x - if (length(q) >= 3L) { - for (i in 3:length(q)) { - if (is.symbol(q[[i]]) && !(q[[i]] %chin% names_x)) q[[i]] = eval(q[[i]], envir) # tests 1187.2 & 1187.4 - } - } - return(q) - } - - # if arithmetic operator, recursively substitute its operands. 
we know what branches are valid from .gforce_ok - if (is.symbol(q[[1L]]) && q[[1L]] %chin% .gforce_ops) { - for (i in 2:length(q)) { - q[[i]] = .gforce_jsub(q[[i]], names_x, envir) +.gforce_jsub = function(q, names_x) { + call_name = if (is.symbol(q[[1L]])) q[[1L]] else q[[1L]][[3L]] # latter is like data.table::shift, #5942. .gshift_ok checked this will work. + q[[1L]] = as.name(paste0("g", call_name)) + # gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok + # do not evaluate vars present as columns in x + if (length(q) >= 3L) { + for (i in 3:length(q)) { + if (is.symbol(q[[i]]) && !(q[[i]] %chin% names_x)) q[[i]] = eval(q[[i]], parent.frame(2L)) # tests 1187.2 & 1187.4 } - return(q) } - # should not reach here since .gforce_ok q } diff --git a/R/duplicated.R b/R/duplicated.R index 5e06007e0f..e1a04c9822 100644 --- a/R/duplicated.R +++ b/R/duplicated.R @@ -4,7 +4,7 @@ duplicated.data.table = function(x, incomparables=FALSE, fromLast=FALSE, by=seq_ .NotYetUsed("incomparables != FALSE") } if (nrow(x) == 0L || ncol(x) == 0L) return(logical(0L)) # fix for bug #28 - if (is.na(fromLast) || !is.logical(fromLast)) stopf("'%s' must be TRUE or FALSE", "fromLast") + if (is.na(fromLast) || !is.logical(fromLast)) stopf("'fromLast' must be TRUE or FALSE") if (!length(by)) by = NULL #4594 query = .duplicated.helper(x, by) diff --git a/R/fcast.R b/R/fcast.R index 1bcc2916fb..35ab4eaae8 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -122,15 +122,13 @@ aggregate_funs = function(funs, vals, sep="_", ...) { } dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., margins = NULL, subset = NULL, fill = NULL, drop = TRUE, value.var = guess(data), verbose = getOption("datatable.verbose"), value.var.in.dots = FALSE, value.var.in.LHSdots = value.var.in.dots, value.var.in.RHSdots = value.var.in.dots) { - if (!is.data.table(data)) stopf("'%s' must be a data.table", "data") + if (!is.data.table(data)) stopf("'data' must be a data.table.") drop = as.logical(rep_len(drop, 2L)) - if (anyNA(drop)) stopf("'drop' must be logical vector with no missing entries") + if (anyNA(drop)) stopf("'drop' must be logical TRUE/FALSE") if (!isTRUEorFALSE(value.var.in.dots)) - stopf("'%s' must be TRUE or FALSE", "value.var.in.dots") - if (!isTRUEorFALSE(value.var.in.LHSdots)) - stopf("'%s' must be TRUE or FALSE", "value.var.in.LHSdots") - if (!isTRUEorFALSE(value.var.in.RHSdots)) - stopf("'%s' must be TRUE or FALSE", "value.var.in.RHSdots") + stopf("Argument 'value.var.in.dots' should be logical TRUE/FALSE") + if (!isTRUEorFALSE(value.var.in.LHSdots) || !isTRUEorFALSE(value.var.in.RHSdots)) + stopf("Arguments 'value.var.in.LHSdots', 'value.var.in.RHSdots' should be logical TRUE/FALSE") # #2980 if explicitly providing fun.aggregate=length but not a value.var, # just use the last column (as guess(data) would do) because length will be # the same on all columns diff --git a/R/fmelt.R b/R/fmelt.R index e0596f73fc..c6f435578b 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -181,7 +181,7 @@ measurev = function(fun.list, sep="_", pattern, cols, multiple.keyword="value.na melt.data.table = function(data, id.vars, measure.vars, variable.name = "variable", value.name = "value", ..., na.rm = FALSE, variable.factor = TRUE, value.factor = FALSE, verbose = getOption("datatable.verbose")) { - if (!is.data.table(data)) stopf("'%s' must be a data.table", "data") + if (!is.data.table(data)) stopf("'data' must be a data.table") for(type.vars in c("id.vars","measure.vars")){ sub.lang <- 
substitute({ if (missing(VAR)) VAR=NULL diff --git a/R/foverlaps.R b/R/foverlaps.R index 8c3d4be777..7bbaf0dc13 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -9,8 +9,8 @@ foverlaps = function(x, y, by.x=key(x) %||% key(y), by.y=key(y), maxgap=0L, mino stopf("maxgap must be a non-negative integer value of length 1") if (!length(minoverlap) || length(minoverlap) != 1L || is.na(minoverlap) || minoverlap < 1L) stopf("minoverlap must be a positive integer value of length 1") - if (!isTRUEorFALSE(which)) - stopf("'%s' must be TRUE or FALSE", "which") + if (!length(which) || length(which) != 1L || is.na(which)) + stopf("which must be a logical vector of length 1. Either TRUE/FALSE") if (!length(nomatch) || length(nomatch) != 1L || (!is.na(nomatch) && nomatch!=0L)) stopf("nomatch must either be NA or NULL") type = match.arg(type) @@ -33,9 +33,9 @@ foverlaps = function(x, y, by.x=key(x) %||% key(y), by.y=key(y), maxgap=0L, mino by.y = names(y)[by.y] } if (!is.character(by.x)) - stopf("A non-empty vector of column names or numbers is required for '%s'", "by.x") + stopf("A non-empty vector of column names or numbers is required for by.x") if (!is.character(by.y)) - stopf("A non-empty vector of column names or numbers is required for '%s'", "by.y") + stopf("A non-empty vector of column names or numbers is required for by.y") if (!identical(by.y, key(y)[seq_along(by.y)])) stopf("The first %d columns of y's key must be identical to the columns specified in by.y.", length(by.y)) if (anyNA(chmatch(by.x, names(x)))) diff --git a/R/frank.R b/R/frank.R index 63e3be8321..419f5ea414 100644 --- a/R/frank.R +++ b/R/frank.R @@ -20,7 +20,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a } else { cols = colnamesInt(x, cols, check_dups=TRUE) if (!length(cols)) - stopf("x is a list, 'cols' cannot be 0-length.") + stopf("x is a list, 'cols' can not be 0-length") } # need to unlock for #4429 x = .shallow(x, cols, unlock = TRUE) # shallow copy even if list.. diff --git a/R/fread.R b/R/fread.R index bc8509c713..2f397b78e1 100644 --- a/R/fread.R +++ b/R/fread.R @@ -182,9 +182,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC") call_args = names(match.call()) if (is.character(skip)) warningf("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of the metadata; please file an issue on GitHub if you'd like to see more intuitive behavior supported.") - yaml_res = .read_yaml_header(input, skip, verbose) - yaml_header = yaml_res$yaml_header - n_read = yaml_res$n_read + yaml_header = .read_yaml_header(input, skip, verbose) yaml_names = names(yaml_header) # process header first since it impacts how to handle colClasses if ('header' %chin% yaml_names) { @@ -389,7 +387,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC") yaml_header = yaml::yaml.load(yaml_string) if (verbose) catf('Processed %d lines of YAML metadata with the following top-level fields: %s\n', n_read, brackify(names(yaml_header))) - list(yaml_header = yaml_header, n_read = n_read) + yaml_header } # nocov end. 
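For context on the `.read_yaml_header()` hunk above: the `header` field it returns is consumed right after parsing, in the `'header' %chin% yaml_names` branch. A minimal, hedged sketch of the user-facing path (requires the suggested `yaml` package; file contents are illustrative only):

```r
library(data.table)   # fread(yaml=TRUE) also needs the 'yaml' package installed
tmp = tempfile(fileext = ".csv")
writeLines(c(
  "---",            # YAML front matter read by .read_yaml_header()
  "header: true",   # one of the top-level fields handled after the hunk above
  "---",
  "a,b",
  "1,x",
  "2,y"
), tmp)
fread(tmp, yaml = TRUE)   # expect a 2-row table with integer 'a' and character 'b'
```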
diff --git a/R/frollapply.R b/R/frollapply.R index 9c1b0c26d7..a8a214ff13 100644 --- a/R/frollapply.R +++ b/R/frollapply.R @@ -133,15 +133,15 @@ frollapply = function(X, N, FUN, ..., by.column=TRUE, fill=NA, align=c("right"," stopf("'n' is deprecated in frollapply, use 'N' instead") } if (!isTRUEorFALSE(by.column)) - stopf("'%s' must be TRUE or FALSE", "by.column") + stopf("'by.column' must be TRUE or FALSE") if (!isTRUEorFALSE(adaptive)) - stopf("'%s' must be TRUE or FALSE", "adaptive") + stopf("'adaptive' must be TRUE or FALSE") if (!isTRUEorFALSE(partial)) - stopf("'%s' must be TRUE or FALSE", "partial") + stopf("'partial' must be TRUE or FALSE") if (!isTRUEorFALSE(give.names)) - stopf("'%s' must be TRUE or FALSE", "give.names") + stopf("'give.names' must be TRUE or FALSE") if (!isTRUEorFALSE(simplify) && !is.function(simplify)) - stopf("'%s' must be TRUE or FALSE or a function", "simplify") + stopf("'simplify' must be TRUE or FALSE or a function") align = match.arg(align) FUN = match.fun(FUN) diff --git a/R/groupingsets.R b/R/groupingsets.R index dcdbd842aa..661ac1af09 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -4,7 +4,7 @@ rollup = function(x, ...) { rollup.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { # input data type basic validation if (!is.data.table(x)) - stopf("'%s' must be a data.table", "x", class="dt_invalid_input_error") + stopf("Argument 'x' must be a data.table object", class="dt_invalid_input_error") if (!is.character(by)) stopf("Argument 'by' must be a character vector of column names used in grouping.") if (!is.logical(id)) @@ -22,7 +22,7 @@ cube = function(x, ...) { cube.data.table = function(x, j, by, .SDcols, id = FALSE, label = NULL, ...) { # input data type basic validation if (!is.data.table(x)) - stopf("'%s' must be a data.table", "x", class="dt_invalid_input_error") + stopf("Argument 'x' must be a data.table object", class="dt_invalid_input_error") if (!is.character(by)) stopf("Argument 'by' must be a character vector of column names used in grouping.") if (!is.logical(id)) @@ -55,7 +55,7 @@ groupingsets = function(x, ...) { groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, label = NULL, enclos = parent.frame(), ...) 
{ # input data type basic validation if (!is.data.table(x)) - stopf("'%s' must be a data.table", "x") + stopf("Argument 'x' must be a data.table object") if (ncol(x) < 1L) stopf("Argument 'x' is a 0-column data.table; no measure to apply grouping over.") if (anyDuplicated(names(x)) > 0L) @@ -158,6 +158,12 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, labe warningf("For the following variables, the 'label' value was already in the data: %s", brackify(info)) } } + # workaround for rbindlist fill=TRUE on integer64 #1459 + int64.cols = vapply_1b(empty, inherits, "integer64") + int64.cols = names(int64.cols)[int64.cols] + if (length(int64.cols) && !requireNamespace("bit64", quietly=TRUE)) + stopf("Using integer64 class columns require to have 'bit64' package installed.") # nocov + int64.by.cols = intersect(int64.cols, by) # aggregate function called for each grouping set # inline all arguments that might clash with enclosing environment pcall = substitute(x[, jj], list(x = x, jj = jj)) @@ -171,6 +177,11 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, labe i_str = paste(c("1", "0")[by %chin% by.set + 1L], collapse="") set(r, j = "grouping", value = if (nzchar(i_str)) strtoi(i_str, base=2L) else 0L) } + if (length(int64.by.cols)) { + # workaround for rbindlist fill=TRUE on integer64 #1459 + missing.int64.by.cols = setdiff(int64.by.cols, by.set) + if (length(missing.int64.by.cols)) r[, (missing.int64.by.cols) := bit64::as.integer64(NA)] + } if (!is.null(label) && length(by.label.use.vars <- intersect(setdiff(by, by.set), names(label.use))) > 0L) r[, (by.label.use.vars) := label.use[by.label.use.vars]] r diff --git a/R/helpers.R b/R/helpers.R index 284a129bb2..213b0c057e 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -3,9 +3,9 @@ # convert char to factor retaining order #4837 fctr = function(x, levels=unique(x), ..., sort=FALSE, rev=FALSE) { if (!isTRUEorFALSE(sort)) - stopf("'%s' must be TRUE or FALSE", "sort") + stopf("argument 'sort' must be TRUE or FALSE") if (!isTRUEorFALSE(rev)) - stopf("'%s' must be TRUE or FALSE", "rev") + stopf("argument 'rev' must be TRUE or FALSE") if (sort) levels = sort(levels) if (rev) levels = frev(levels) factor(x, levels=levels, ...) diff --git a/R/merge.R b/R/merge.R index 23156a43d0..2484cd9a0f 100644 --- a/R/merge.R +++ b/R/merge.R @@ -1,9 +1,9 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FALSE, all.x = all, all.y = all, sort = TRUE, suffixes = c(".x", ".y"), no.dups = TRUE, allow.cartesian=getOption("datatable.allow.cartesian"), incomparables=NULL, ...) { - if (!isTRUEorFALSE(sort)) - stopf("'%s' must be TRUE or FALSE", "sort") - if (!isTRUEorFALSE(no.dups)) - stopf("'%s' must be TRUE or FALSE", "no.dups") + if (!sort %in% c(TRUE, FALSE)) + stopf("Argument 'sort' should be logical TRUE/FALSE") + if (!no.dups %in% c(TRUE, FALSE)) + stopf("Argument 'no.dups' should be logical TRUE/FALSE") class_x = class(x) if (!is.data.table(y)) { y = as.data.table(y) diff --git a/R/rowwiseDT.R b/R/rowwiseDT.R index 1451e58498..81114ead63 100644 --- a/R/rowwiseDT.R +++ b/R/rowwiseDT.R @@ -13,14 +13,6 @@ rowwiseDT = function(...) 
{ nrows = length(body) %/% ncols if (length(body) != nrows * ncols) stopf("There are %d columns but the number of cells is %d, which is not an integer multiple of the columns", ncols, length(body)) - is_problematic = vapply_1b(body, function(v) !(is.atomic(v) || is.null(v) || typeof(v) == "list")) - if (any(is_problematic)) { - idx = which(is_problematic)[1L] - col_idx = (idx - 1L) %% ncols + 1L - col_name = header[col_idx] - obj_type = class1(body[[idx]]) - stopf("Column '%s' is type '%s'. Non-atomic, non-list objects must be wrapped in list(), e.g., list(f) instead of f", col_name, obj_type) - } # make all the non-scalar elements to a list needs_list = lengths(body) != 1L body[needs_list] = lapply(body[needs_list], list) diff --git a/R/setkey.R b/R/setkey.R index afaf06293e..4ba5be4d71 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -32,7 +32,7 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU on.exit(options(oldverbose)) } if (!is.data.table(x)) stopf("x is not a data.table") - if (!is.character(cols)) stopf("cols is not a character vector. Please see further information in ?%s.", "setkey") + if (!is.character(cols)) stopf("cols is not a character vector. Please see further information in ?setkey.") if (physical && .Call(C_islocked, x)) stopf("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.") if (!length(cols)) { warningf("cols is a character vector of zero length. Removed the key, but use NULL instead, or wrap with suppressWarnings() to avoid this warning.") @@ -257,7 +257,7 @@ setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE) if (!is.data.frame(x)) stopf("x must be a data.frame or data.table") na.last = as.logical(na.last) if (is.na(na.last) || !length(na.last)) stopf('na.last must be logical TRUE/FALSE') - if (!is.character(cols)) stopf("cols is not a character vector. Please see further information in ?%s.", "setorder") + if (!is.character(cols)) stopf("cols is not a character vector. Please see further information in ?setorder.") if (!length(cols)) { warningf("cols is a character vector of zero length. Use NULL instead, or wrap with suppressWarnings() to avoid this warning.") return(x) diff --git a/R/test.data.table.R b/R/test.data.table.R index d37fba29b5..70cd82b363 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -370,49 +370,10 @@ gc_mem = function() { # nocov end } -# Check if UTF-8 symbols can be represented in native encoding -# R's parser requires symbol names (PRINTNAME in LANGSXP) to be in native encoding. In non-UTF-8 -# locales, parsing Unicode escapes like \u00FC fails with a warning and substitutes . -# Tests using requires_utf8 are skipped when UTF-8 cannot be represented. Using eval(parse(text=...)) -# defers parsing to runtime, allowing the encoding check to run first and avoid source() warnings. 
-utf8_check = function(test_str) identical(test_str, enc2native(test_str)) - test = function(num, x, y=TRUE, error=NULL, warning=NULL, message=NULL, output=NULL, notOutput=NULL, ignore.warning=NULL, options=NULL, env=NULL, - context=NULL, requires_utf8=FALSE, optimize=NULL) { - # if optimization is provided, test across multiple optimization levels - if (!is.null(optimize)) { - if (!is.numeric(optimize) || length(optimize) < 1L || anyNA(optimize) || any(optimize < 0L)) - stopf("optimize must be numeric, length >= 1, non-NA, and >= 0; got: %s", optimize) # nocov - cl = match.call() - if ("datatable.optimize" %in% names(cl$options)) - stopf("Trying to set optimization level through both options= and optimize=") # nocov - cl$optimize = NULL # Remove optimization levels from the recursive call - - # Check if y was explicitly provided (not just the default) - y_provided = !missing(y) - vector_params = mget(c("error", "warning", "message", "output", "notOutput", "ignore.warning"), environment()) - vector_params = vector_params[lengths(vector_params) > 0L] - compare = !y_provided && length(optimize)>1L && !length(vector_params) - # When optimize has multiple levels, vector params are recycled across levels. - if (length(optimize) > 1L && "warning" %in% names(vector_params) && length(vector_params$warning) > 1L) - warningf("warning= with multiple values is recycled across optimize levels, not treated as multiple warnings in one run") - - for (i in seq_along(optimize)) { - cl$num = num + (i - 1L) * 1e-6 - opt_level = list(datatable.optimize = optimize[i]) - cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level - for (param in names(vector_params)) { - val = vector_params[[param]] - cl[[param]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than optimization levels - } - - if (compare && i == 1L) cl$y = eval(cl$x, parent.frame()) - eval(cl, parent.frame()) # actual test call - } - return(invisible()) - } + context=NULL) { if (!is.null(env)) { old = Sys.getenv(names(env), names=TRUE, unset=NA) to_unset = !lengths(env) @@ -426,20 +387,6 @@ test = function(num, x, y=TRUE, Sys.unsetenv(names(old)[!is_preset]) }, add=TRUE) } - # Check UTF-8 requirement - if (!isFALSE(requires_utf8)) { - test_str = if (isTRUE(requires_utf8)) "\u00F1\u00FC\u3093" else requires_utf8 # the default test_str are UTF-8 symbols we found over time, TOOD: harden this default - if (!utf8_check(test_str)) { - # nocov start - last_utf8_skip = get0("last_utf8_skip", parent.frame(), ifnotfound=0, inherits=TRUE) - if (num - last_utf8_skip >= 1) { - catf("Test %s skipped because required UTF-8 symbols cannot be represented in native encoding.\n", num) - } - assign("last_utf8_skip", num, parent.frame(), inherits=TRUE) - return(invisible(TRUE)) - # nocov end - } - } # Usage: # i) tests that x equals y when both x and y are supplied, the most common usage # ii) tests that x is TRUE when y isn't supplied diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 246a1d5daa..62075dcf85 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -190,14 +190,24 @@ DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) test(301.1, nrow(DT[,sum(B),by=C])==100010) # Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. 
-set.seed(1) -DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") -opt = c(0L,2L) -test(637.1, optimize=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) -test(637.2, optimize=opt, key(copy(DT)[J(43L), a:=99L]), NULL) -setkey(DT, a) -test(637.3, optimize=opt, key(copy(DT)[, a:=99L, by=a]), NULL) -# test 637 subsumes 637 and 638 for different optimization levels +local({ + old = options(datatable.optimize=0L); on.exit(options(old)) + set.seed(1) + DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") + test(637.1, DT[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) + test(637.2, key(DT[J(43L), a:=99L]), NULL) + setkey(DT, a) + test(637.3, key(DT[, a:=99L, by=a]), NULL) +}) +local({ + options(datatable.optimize=2L); on.exit(options(old)) + set.seed(1) + DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") + test(638.1, DT[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) + test(638.2, key(DT[J(43L), a:=99L]), NULL) + setkey(DT,a) + test(638.3, key(DT[, a:=99L, by=a]), NULL) +}) # Test X[Y] slowdown, #2216 # Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes diff --git a/inst/tests/froll.Rraw b/inst/tests/froll.Rraw index 5f82259100..f87ace0498 100644 --- a/inst/tests/froll.Rraw +++ b/inst/tests/froll.Rraw @@ -423,9 +423,9 @@ test(6000.118, frollmean(1:5, as.factor("a")), error="'n' must be an integer") #### is.list(n) test(6000.119, frollmean(1:5, list(1:5)), error="'n' must be an integer, list is accepted for adaptive TRUE") #### adaptive=NA -test(6000.1192, frollmean(1:5, 2, adaptive=NA), error="'adaptive' must be TRUE or FALSE") +test(6000.1192, frollmean(1:5, 2, adaptive=NA), error="adaptive must be TRUE or FALSE") #### na.rm=NA -test(6000.1193, frollmean(1:5, 2, na.rm=NA), error="'na.rm' must be TRUE or FALSE") +test(6000.1193, frollmean(1:5, 2, na.rm=NA), error="na.rm must be TRUE or FALSE") #### has.nf=1 test(6000.1194, frollmean(1:5, 2, has.nf=1), error="has.nf must be TRUE, FALSE or NA") #### has.nf=FALSE na.rm=TRUE diff --git a/inst/tests/nafill.Rraw b/inst/tests/nafill.Rraw index 6428be9af6..16f84fa16f 100644 --- a/inst/tests/nafill.Rraw +++ b/inst/tests/nafill.Rraw @@ -112,8 +112,8 @@ x = 1:10 test(3.01, nafill(x, "locf", fill=0L), x) test(3.02, setnafill(list(copy(x)), "locf", fill=0L), list(x)) test(3.03, setnafill(x, "locf"), error="in-place update is supported only for list") -test(3.04, nafill(as.raw(x), fill=0), error="not supported") -test(3.05, setnafill(list(as.raw(x)), fill=0), error="not supported") +test(3.04, nafill(letters[1:5], fill=0), error="must be logical/numeric type, or list/data.table") +test(3.05, setnafill(list(letters[1:5]), fill=0), error="must be logical/numeric type, or list/data.table") test(3.06, nafill(x, fill=1:2), error="fill must be a vector of length 1.*fcoalesce") test(3.07, nafill(x, "locf", fill=1:2), error="fill must be a vector of length 1.*x\\.$") test(3.08, nafill(x, fill="asd"), x, warning=c("Coercing.*character.*integer","NAs introduced by coercion")) @@ -149,7 +149,7 @@ test(4.26, colnamesInt(dt, c(1, 4), skip_absent=TRUE), c(1L,0L)) test(4.27, colnamesInt(dt, c("a", NA), skip_absent=TRUE), c(1L,0L)) test(4.28, colnamesInt(dt, c(1L, 0L), skip_absent=TRUE), error="received 
non-existing column*.*0") test(4.29, colnamesInt(dt, c(1, -5), skip_absent=TRUE), error="received non-existing column*.*-5") -test(4.30, colnamesInt(dt, c(1, 4), skip_absent=NULL), error="'skip_absent' must be TRUE or FALSE") +test(4.30, colnamesInt(dt, c(1, 4), skip_absent=NULL), error="skip_absent must be TRUE or FALSE") test(4.31, colnamesInt(dt, c(1L, 1000L), skip_absent=TRUE), c(1L,0L)) cols=c(1L,100L) test(4.32, colnamesInt(dt, cols, skip_absent=TRUE), c(1L, 0L)) @@ -324,153 +324,22 @@ test(11.09, coerceAs(1L, a), error="must not be matrix or array") x = c(NA, NA, TRUE, FALSE, NA, NA, FALSE, TRUE, NA, NA) test(12.01, nafill(x, "locf"), c(NA, NA, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE)) test(12.02, nafill(x, "nocb"), c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, NA, NA)) -test(12.03, nafill(x, "locf", fill=TRUE), c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE)) -test(12.04, nafill(x, "nocb", fill=TRUE), c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE)) -test(12.05, nafill(x, fill=TRUE), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE)) -test(12.06, nafill(x, fill=0L), c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE)) -test(12.07, nafill(x, fill=5.0), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE), warning="double.*taken as TRUE") -test(12.08, nafill(x, fill=Inf), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE), warning="double.*taken as TRUE") -test(12.09, nafill(x, fill=NA), x) -test(12.10, nafill(x, fill=NA_integer_), x) -test(12.11, nafill(x, fill=NA_real_), x) -test(12.12, nafill(x, fill=NaN), x) +test(12.03, nafill(x, fill=TRUE), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE)) +test(12.04, nafill(x, fill=0L), c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE)) +test(12.05, nafill(x, fill=5.0), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE), warning="double.*taken as TRUE") +test(12.06, nafill(x, fill=Inf), c(TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE), warning="double.*taken as TRUE") +test(12.07, nafill(x, fill=NA), x) +test(12.08, nafill(x, fill=NA_integer_), x) +test(12.09, nafill(x, fill=NA_real_), x) +test(12.10, nafill(x, fill=NaN), x) -## factor input -x = rep(NA_character_, 10L) -x[c(3:4, 7:8)] = c("a", "b", "a", "c") -x = as.factor(x) -test(13.01, nafill(x, "locf"), replace(replace(x, 5:6, "b"), 9:10, "c")) -test(13.02, nafill(x, "nocb"), replace(x, c(1:2, 5:6), "a")) -test(13.03, nafill(x, "locf", fill="b"), replace(replace(x, c(1:2, 5:6), "b"), 9:10, "c")) -test(13.04, nafill(x, "nocb", fill="a"), replace(x, c(1:2, 5:6, 9:10), "a")) -x_fill_a = replace(x, c(1:2, 5:6, 9:10), "a") -test(13.05, nafill(x, fill="a"), x_fill_a) -test(13.06, nafill(x, fill=1L), x_fill_a) -test(13.07, nafill(x, fill=1.0), x_fill_a) -test(13.08, nafill(x, fill=factor("a")), x_fill_a) -test(13.09, nafill(x, fill=factor("a", levels=levels(x))), x_fill_a) -test(13.10, nafill(x, fill=factor("a", levels=c("a", "b"))), x_fill_a) -test(13.11, nafill(x, fill=factor("a", levels=c("a", "d"))), factor(x_fill_a, levels=c("a", "b", "c", "d"))) -x_fill_d = replace(factor(x, levels = c(levels(x), "d")), c(1:2, 5:6, 9:10), "d") -test(13.12, nafill(x, fill="d"), x_fill_d) -test(13.13, nafill(x, fill=factor("d", levels=c("a", "b", "c", "d"))), x_fill_d) -test(13.14, nafill(x, fill=factor("d", levels=c("d", "a", "b", "c"))), x_fill_d) -test(13.15, nafill(x, fill=factor("d", levels=c("d", "c", "b", "a"))), x_fill_d) 
-test(13.16, nafill(x, fill=factor("d", levels=c("b", "c", "d"))), x_fill_d) -test(13.17, nafill(x, fill=NA), x) -test(13.18, nafill(x, fill=NA_integer_), x) -test(13.19, nafill(x, fill=NA_real_), x) -test(13.20, nafill(x, fill=NA_character_), x) - -## character input -x = c(NA, NA, "a", "b", NA, NA, "c","d", NA, NA) -test(14.01, nafill(x, fill="unknown"), c("unknown", "unknown", "a", "b", "unknown", "unknown", "c", "d", "unknown", "unknown")) -test(14.02, nafill(x, fill=NA), x) -test(14.03, nafill(x, "locf"), c(NA, NA, "a", "b", "b", "b", "c", "d", "d", "d")) -test(14.04, nafill(x, "nocb"), c("a", "a", "a", "b", "c", "c", "c", "d", NA, NA)) -test(14.05, nafill(x, "locf", fill="unknown"), c("unknown", "unknown", "a", "b", "b", "b", "c", "d", "d", "d")) -test(14.06, nafill(x, "nocb", fill="unknown"), c("a", "a", "a", "b", "c", "c", "c", "d", "unknown", "unknown")) -test(14.07, nafill(x, fill=TRUE), c("TRUE", "TRUE", "a", "b", "TRUE", "TRUE", "c", "d", "TRUE", "TRUE")) -test(14.08, nafill(x, fill=1L), c("1", "1", "a", "b", "1", "1", "c", "d", "1", "1")) -test(14.09, nafill(x, fill=1.0), c("1", "1", "a", "b", "1", "1", "c", "d", "1", "1")) -test(14.10, nafill(x, fill=NA_integer_), x) -test(14.11, nafill(x, fill=NA_real_), x) -test(14.12, nafill(x, fill=NA_character_), x) -test(14.13, options=c(datatable.verbose=TRUE), - nafill(x, fill="z"), c("z", "z", "a", "b", "z", "z", "c", "d", "z", "z"), - output="nafillString: took") - -## other common classed vector objects: Date, IDate, POSIXct -x = as.IDate(c(NA, "2025-01-01", NA, "2025-06-01", NA)) -y26 = as.IDate("2026-01-01") -test(15.01, nafill(x, fill=y26), replace(x, c(1L, 3L, 5L), y26)) -test(15.02, nafill(x, fill=NA), x) -test(15.03, nafill(x, "locf"), replace(x, c(3L, 5L), x[c(2L, 4L)])) -test(15.04, nafill(x, "nocb"), replace(x, c(1L, 3L), x[c(2L, 4L)])) -test(15.05, nafill(x, "locf", fill=y26), replace(x, c(1L, 3L, 5L), c(y26, x[c(2L, 4L)]))) -test(15.06, nafill(x, "nocb", fill=y26), replace(x, c(1L, 3L, 5L), c(x[c(2L, 4L)], y26))) -test(15.07, nafill(x, fill=as.numeric(y26)), replace(x, c(1L, 3L, 5L), y26)) - -x = as.Date(x) -y26 = as.Date(y26) -test(15.08, nafill(x, fill=y26), replace(x, c(1L, 3L, 5L), y26)) -test(15.09, nafill(x, fill=NA), x) -test(15.10, nafill(x, "locf"), replace(x, c(3L, 5L), x[c(2L, 4L)])) -test(15.11, nafill(x, "nocb"), replace(x, c(1L, 3L), x[c(2L, 4L)])) -test(15.12, nafill(x, "locf", fill=y26), replace(x, c(1L, 3L, 5L), c(y26, x[c(2L, 4L)]))) -test(15.13, nafill(x, "nocb", fill=y26), replace(x, c(1L, 3L, 5L), c(x[c(2L, 4L)], y26))) -test(15.14, nafill(x, fill=as.numeric(y26)), replace(x, c(1L, 3L, 5L), y26)) - -x = as.POSIXct(x) -y26 = as.POSIXct(y26) -test(15.15, nafill(x, fill=y26), replace(x, c(1L, 3L, 5L), y26)) -test(15.16, nafill(x, fill=NA), x) -test(15.17, nafill(x, "locf"), replace(x, c(3L, 5L), x[c(2L, 4L)])) -test(15.18, nafill(x, "nocb"), replace(x, c(1L, 3L), x[c(2L, 4L)])) -test(15.19, nafill(x, "locf", fill=y26), replace(x, c(1L, 3L, 5L), c(y26, x[c(2L, 4L)]))) -test(15.20, nafill(x, "nocb", fill=y26), replace(x, c(1L, 3L, 5L), c(x[c(2L, 4L)], y26))) -test(15.21, nafill(x, fill=as.numeric(y26)), replace(x, c(1L, 3L, 5L), y26)) - -attr(x, "tzone") = "Asia/Singapore" -test(15.22, nafill(x, fill=y26), replace(x, c(1L, 3L, 5L), y26)) -test(15.23, nafill(x, fill=as.POSIXct(y26, tz='Asia/Singapore')), replace(x, c(1L, 3L, 5L), y26)) - -test(15.24, nafill(as.Date(NA), fill=as.IDate("2025-01-01")), as.Date("2025-01-01")) -test(15.25, nafill(as.Date(NA_integer_), fill=as.IDate("2025-01-01")), 
as.Date("2025-01-01")) -test(15.26, nafill(as.IDate(NA), fill=as.Date("2025-01-01")), as.IDate("2025-01-01")) - -## setnafill -DT = data.table(l1=c(NA, NA, TRUE, TRUE, NA, NA, FALSE, FALSE, NA, NA), - l2=c(NA, NA, FALSE, FALSE, NA, NA, TRUE, TRUE, NA, NA), - i1=c(NA, NA, 0:1, NA, NA, 2:3, NA, NA), - i2=c(NA, NA, 3:2, NA, NA, 1:0, NA, NA), - d1=c(NA, NA, 0.0, 1L, NA, NA, 2:3, NA, NA), - d2=c(NA, NA, 3.0, 2L, NA, NA, 1:0, NA, NA), - f1=as.factor(c(NA, NA, "a", "b", NA, NA, "b", "c", NA, NA)), - f2=as.factor(c(NA, NA, "c", "b", NA, NA, "b", "a", NA, NA)), - c1=c(NA, NA, "a", "b", NA, NA, "c", "d", NA, NA), - c2=c(NA, NA, "d", "c", NA, NA, "b", "a", NA, NA), - t1=as.POSIXct(c(NA, NA, "2025-01-01", "2025-01-02", NA, NA, "2025-06-01", "2025-06-02", NA, NA)), - t2=as.POSIXct(c(NA, NA, "2026-01-01", "2026-01-02", NA, NA, "2026-06-01", "2026-06-02", NA, NA))) -test(16.01, setnafill(copy(DT), fill=TRUE, cols='l1')$l1, - c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE)) -test(16.02, setnafill(copy(DT), fill=TRUE, cols=c('l1', 'l2'))[, .(l1, l2)], - data.table(l1=c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE), - l2=c(TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE))) -test(16.03, setnafill(copy(DT), fill=9L, cols='i1')$i1, - c(9L, 9L, 0:1, 9L, 9L, 2:3, 9L, 9L)) -test(16.04, setnafill(copy(DT), fill=9L, cols=c('i1', 'i2'))[, .(i1, i2)], - data.table(i1=c(9L, 9L, 0:1, 9L, 9L, 2:3, 9L, 9L), - i2=c(9L, 9L, 3:2, 9L, 9L, 1:0, 9L, 9L))) -test(16.05, setnafill(copy(DT), fill=9.0, cols='d1')$d1, - c(9.0, 9L, 0:1, 9L, 9L, 2:3, 9L, 9L)) -test(16.06, setnafill(copy(DT), fill=9.0, cols=c('d1', 'd2'))[, .(d1, d2)], - data.table(d1=c(9.0, 9L, 0:1, 9L, 9L, 2:3, 9L, 9L), - d2=c(9.0, 9L, 3:2, 9L, 9L, 1:0, 9L, 9L))) -test(16.07, setnafill(copy(DT), fill="a", cols='f1')$f1, - as.factor(c("a", "a", "a", "b", "a", "a", "b", "c", "a", "a"))) -test(16.08, setnafill(copy(DT), fill="a", cols=c('f1', 'f2'))[, .(f1, f2)], - data.table(f1=as.factor(c("a", "a", "a", "b", "a", "a", "b", "c", "a", "a")), - f2=as.factor(c("a", "a", "c", "b", "a", "a", "b", "a", "a", "a")))) -test(16.09, setnafill(DT, fill="z", cols='c1'), error="not yet supported") -# test(16.10, setnafill(copy(DT), fill="z", cols=c('c1', 'c2'))[, .(c1, c2)], -# data.table(c1=c("z", "z", "a", "b", "z", "z", "c", "d", "z", "z"), -# c2=c("z", "z", "d", "c", "z", "z", "b", "a", "z", "z"))) -test(16.11, setnafill(copy(DT), fill=as.POSIXct("2027-01-01"), cols='t1')$t1, - replace(DT$t1, c(1:2, 5:6, 9:10), as.POSIXct("2027-01-01"))) -test(16.12, setnafill(copy(DT), fill=as.POSIXct("2027-01-01"), cols=c('t1', 't2'))[, .(t1, t2)], - data.table(t1=replace(DT$t1, c(1:2, 5:6, 9:10), as.POSIXct("2027-01-01")), - t2=replace(DT$t2, c(1:2, 5:6, 9:10), as.POSIXct("2027-01-01")))) -test(16.13, setnafill(copy(DT), fill=list(TRUE, 9L, 9.0, "a", as.POSIXct("2027-01-01")), cols=c("l1", "i1", "d1", "f1", "t1"))[, .(l1, i1, d1, f1, t1)], - data.table(l1=c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE), - i1=c(9L, 9L, 0:1, 9L, 9L, 2:3, 9L, 9L), - d1=c(9.0, 9L, 0L, 1L, 9L, 9L, 2:3, 9L, 9L), - f1=as.factor(c("a", "a", "a", "b", "a", "a", "b", "c", "a", "a")), - t1=replace(DT$t1, c(1:2, 5:6, 9:10), as.POSIXct("2027-01-01")))) -test(16.14, setnafill(DT, cols=c("l1", "c1")), error="not yet supported") -DT = data.table(l=c(NA, FALSE), i=c(NA, 0L)) -setnafill(DT, fill=list(TRUE, 1L)) -test(16.15, DT, data.table(l=c(TRUE, FALSE), i=1:0)) +## logical +## character +## factor +## Date +## POSIXct +## IDate +## ITime # related to 
!is.integer(verbose) test(99.1, data.table(a=1,b=2)[1,1, verbose=1], error="verbose must be logical or integer") diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw deleted file mode 100644 index 88dd27498c..0000000000 --- a/inst/tests/optimize.Rraw +++ /dev/null @@ -1,486 +0,0 @@ -require(methods) -if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { - if ((tt<-compiler::enableJIT(-1))>0) - cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") -} else { - require(data.table) - test = data.table:::test - null.data.table = data.table:::null.data.table - INT = data.table:::INT -} - -sugg = c("bit64") -for (s in sugg) { - assign(paste0("test_",s), loaded<-suppressWarnings(suppressMessages( - library(s, character.only=TRUE, logical.return=TRUE, quietly=TRUE, warn.conflicts=FALSE, pos="package:base") # attach at the end for #5101 - ))) - if (!loaded) cat("\n**** Suggested package",s,"is not installed or has dependencies missing. Tests using it will be skipped.\n\n") -} - -# := by group -DT = data.table(a=1:3,b=(1:9)/10) -test(611.1,optimize=c(0L, 2L), DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) -setkey(DT,a) -test(611.2,optimize=c(0L, 2L), DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) -# Combining := by group with i -test(611.3,optimize=c(0L, 2L), DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) -test(611.4,optimize=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) -# 612 was just level repetition of 611 -# Assign to subset ok (NA initialized in the other items) ok : -test(613,optimize=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) -test(614,optimize=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) -test(615,optimize=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) -# 616, 617 removed in #5245 - -# Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 -ans = copy(DT)[,r:=NA_real_] -test(618.1,optimize=c(0L, 2L), copy(DT)[a>3,r:=sum(b)], ans) -test(618.2,optimize=c(0L, 2L), copy(DT)[J(-1),r:=sum(b)], ans) -test(618.3,optimize=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) -test(618.4,optimize=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) -test(618.5,optimize=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) -# test 619 was level 2 of 618 -# test 620 was removed in #5245 - -DT = data.table(x=letters, key="x") -test(621,optimize=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained -test(622,optimize=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") - -set.seed(2) -DT = data.table(a=rnorm(5)*10, b=1:5) -test(623,optimize=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) -# test 623 subsumes 623.1 and 623.2 for testing both levels - -# Setup for test 656.x - gforce tests -set.seed(9) -n = 1e3 -DT = data.table(grp1=sample.int(150L, n, replace=TRUE), - grp2=sample.int(150L, n, replace=TRUE), - x=rnorm(n), - y=rnorm(n)) -opt = 0:2 -out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') -test(656.1,optimize=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) -test(656.2,optimize=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) -test(656.3,optimize=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) -# 657-658 were for levels 1,2, resp. 
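# Editorial aside (not part of the patch): the option swept by the optimize= argument
# in the tests removed above. Level 0 turns every optimisation off, level 1 enables
# the lapply/mean rewrites, and level >= 2 (the default) additionally enables GForce,
# matching the 'GForce FALSE'/'GForce TRUE' verbose output asserted by tests 656.x.
local({
  old = options(datatable.optimize = 0L)
  DT = data.table(grp1 = rep(1:2, each = 3L), x = rnorm(6L))
  DT[, mean(x), by = grp1, verbose = TRUE]   # verbose timing line reports GForce FALSE
  options(datatable.optimize = 2L)
  DT[, mean(x), by = grp1, verbose = TRUE]   # verbose reports GForce TRUE / "GForce optimized j"
  options(old)
})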
- -# Test := keyby does setkey, #2065 -DT = data.table(x=1:2, y=1:6) -ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") -test(670.1,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x], ans) -test(670.2,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),keyby="x"], ans) -test(670.3,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), - warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") -test(670.4,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) -test(670.5,optimize=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") -# test 671 was level 2 of 670 - -# varname holding colnames, by group, linked from #2120. -DT = data.table(a=rep(1:3,1:3),b=1:6) -colname = "newcol" -test(751,optimize=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) -# test 751 subsumes 751.1 and 751.2 for testing both levels - -# Add tests for nested := in j by group, #1987 -DT = data.table(a=rep(1:3,2:4),b=1:9) -test(752,optimize=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) -# test 752 subsumes 752.1 and 752.2 for testing both levels - -DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) -opt = c(0:2, Inf) -out = c('GForce FALSE', 'GForce FALSE', 'GForce TRUE', 'GForce TRUE') -# v1.9.7 treats wrapped {} better, so this is now optimized -test(865,optimize=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) -test(867,optimize=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here -# test 865 subsumes 865.1, 865.2, 865.3 for testing all levels -# 866 was testing an intermediate step; 868 was testing equality between optimization levels - -# tests of gsum and gmean with NA -DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12)) -set(DT,c(3L,8L),"y",NA) -set(DT,c(5L,9L),"v",NA) -set(DT,10:12,"y",NA) -set(DT,10:12,"v",NA) -opt = c(1L, 2L) -out = c("(GForce FALSE)", "GForce optimized j to") -test(1184.1,optimize=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) -# test 1184.1 subsumes 1184.1 and 1186 for testing both levels -test(1184.2,optimize=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") -test(1185.1,optimize=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], - data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) -# test 1185.1 subsumes 1185.1 and 1187.1 for testing all levels -test(1185.2,optimize=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], - output=c("All optimizations.*off", "Old mean.*changed j", "GForce optimized j to")) -# test 1185.2 subsumes 1185.2, 1185.3, and 1185.4 for testing all levels -MyVar = TRUE -test(1187.2,optimize=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) -test(1187.3,optimize=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) -MyVar = FALSE -test(1187.4,optimize=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) -test(1187.5,optimize=opt, DT[, 
mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) -# GForce should not turn on when the .ok function isn't triggered -test(1187.6,optimize=2L, DT[, mean(y, trim=.2), by=x, verbose=TRUE], - data.table(x = c("a", "b", "c", "d"), V1 = c(NA, 3.33333333333333, NA, NA)), - output='j unchanged', warning="'trim' is not yet optimized") - -# FR #334. Test to just make sure that GForce and dogroups with .N are giving the same results. -set.seed(2L) -dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10)) -test(1304.1,optimize=0:2, dt[, list(.N, sum(y)), by=x]) -dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x") -test(1304.2,optimize=0:2, dt[, list(.N, sum(y)), by=x]) - -# gmin and gmax extensive testing (because there are tricky cases) -DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647)) -opts = 0:2 -# for integers -test(1313.01,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.02,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.03,optimize=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.04,optimize=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) -# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) -DT[x==6, y := INT(NA)] -test(1313.05,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.06,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.07,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) -test(1313.08,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) -# for numeric -DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) -test(1313.09,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.10,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.11,optimize=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.12,optimize=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) -# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) -DT[x==6, y := NA_real_] -test(1313.13,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.14,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.15,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) -test(1313.16,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) -# for date (attribute check.. especially after issues/689 !!!) 
-DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) -test(1313.17,optimize=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.18,optimize=opts, DT[, list(y=max(y)), by=x], DT[c(5,10)]) -DT[c(1,6), y := NA] -test(1313.19,optimize=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.20,optimize=opts, DT[, list(y=max(y)), by=x], DT[c(1,6)]) -test(1313.21,optimize=opts, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) -test(1313.22,optimize=opts, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) -# for character -set.seed(1L) -DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) -DT[x==7, y := c("","b","c")] -test(1313.23,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.24,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.25,optimize=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.26,optimize=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) -DT[x==6, y := NA_character_] -test(1313.27,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.28,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) -test(1313.30,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) - -# Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now -dt = data.table(a=sample(3,20,TRUE), b=1:10) -test(1565, optimize=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], - output=c("All optimizations are turned off", "lapply optimization is on, j unchanged", "GForce optimized j to")) -# test 1565 subsumes 1565.1, 1565.2 and 1565.3 for testing all levels - -# gforce optimisations -dt = data.table(x = sample(letters, 300, TRUE), - i1 = sample(-10:10, 300, TRUE), - i2 = sample(c(-10:10, NA), 300, TRUE), - d1 = as.numeric(sample(-10:10, 300, TRUE)), - d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) -if (test_bit64) { - dt[, `:=`(d3 = as.integer64(sample(-10:10, 300, TRUE)))] - dt[, `:=`(d4 = as.integer64(sample(c(-10:10,NA), 300, TRUE)))] -} -opt = 0:2 -out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') -# make sure gforce is on -# testing gforce::gmedian -test(1579.01,optimize=2L, dt[, lapply(.SD, median), by=x, verbose=TRUE], - dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x], output="GForce optimized") -test(1579.02,optimize=2L, dt[, lapply(.SD, median, na.rm=TRUE), by=x], - dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x]) -test(1579.03,optimize=2L, dt[, lapply(.SD, median), keyby=x], - dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x]) -test(1579.04,optimize=2L, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], - dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x]) -# testing gforce::ghead and gforce::gtail -# head(.SD, 1) and tail(.SD, 1) optimisation -test(1579.06,optimize=opt, dt[, head(.SD,1), by=x, verbose=TRUE], output=out) -test(1579.08,optimize=opt, dt[, head(.SD,1), keyby=x, verbose=TRUE], output=out) -test(1579.10,optimize=opt, dt[, head(.SD,1L), by=x, verbose=TRUE], output=out) -test(1579.12,optimize=opt, dt[, head(.SD,1L), keyby=x, verbose=TRUE], output=out) -test(1579.14,optimize=opt, dt[, tail(.SD,1), by=x, verbose=TRUE], output=out) -test(1579.16,optimize=opt, dt[, tail(.SD,1), keyby=x, verbose=TRUE], output=out) -test(1579.18,optimize=opt, dt[, 
tail(.SD,1L), by=x, verbose=TRUE], output=out) -test(1579.20,optimize=opt, dt[, tail(.SD,1L), keyby=x, verbose=TRUE], output=out) -# 1579.22 tested gtail with n>1; now 1579.4+ below -mysub <- function(x, n) x[n] -test(1579.23,optimize=2L, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") -test(1579.24,optimize=opt, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.25,optimize=opt, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.26,optimize=opt, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.27,optimize=opt, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 -test(1579.28,optimize=opt, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) -# gforce head/tail for n>1, #5060 -set.seed(99) -DT = data.table(x = sample(letters[1:5], 20, TRUE), - y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly - i = sample(c(-2L,0L,3L,NA), 20, TRUE), - d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), - s = sample(c("foo","bar",NA), 20, TRUE), - l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) -if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] -test(1579.401,optimize=0:2, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize -test(1579.402,optimize=2L, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") -test(1579.403,optimize=2L, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") -test(1579.404,optimize=2L, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") -test(1579.405,optimize=2L, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") -test(1579.406,optimize=2L, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") -test(1579.407,optimize=2L, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") -test(1579.408,optimize=2L, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") -test(1579.409,optimize=2L, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") -test(1579.410,optimize=2L, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") - - -# FR #971, partly addressed (only subsets in 'i') -# make sure GForce kicks in and the results are identical -dt = data.table(x = sample(letters, 300, TRUE), - d1 = as.numeric(sample(-10:10, 300, TRUE)), - d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) -opt = 1:2 -out = c("GForce FALSE","GForce optimized j") -test(1581.01,optimize=opt, ans1 <- dt[x %in% letters[15:20], - c(.N, lapply(.SD, sum, na.rm=TRUE), - lapply(.SD, min, na.rm=TRUE), - lapply(.SD, max, na.rm=TRUE), - lapply(.SD, mean, na.rm=TRUE), - lapply(.SD, median, na.rm=TRUE) - ), by=x, verbose=TRUE], - output = out) -# test 1581.01 subsumes 1581.01, 1581.02 and 1581.03 for testing all levels -# subsets in 'i' for head and tail -test(1581.04,optimize=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) -# test 1581.04 subsumes 1581.04, 1581.05 and 1581.06 for testing all levels -test(1581.07,optimize=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) -# test 
1581.07 subsumes 1581.07, 1581.08 and 1581.09 for testing all levels -test(1581.10,optimize=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) -# test 1581.10 subsumes 1581.10, 1581.11 and 1581.12 for testing all levels -# #3209 g[[ -test(1581.13,optimize=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) -# test 1581.13 subsumes 1581.13, 1581.14 and 1581.15 for testing all levels -# also, block for non-atomic input, #4159 -dt = data.table(a=1:3) -dt[ , l := .(list(1, 2, 3))] -test(1581.16, dt[ , .(l = l[[1L]]), by=a, verbose=TRUE], - dt[ , l := unlist(l)], output='(GForce FALSE)') -# make sure not to apply when `[[` is applied to a nested call, #4413 -DT = data.table(f1=c("a","b"), f2=c("x","y")) -l = list(a = c(x = "ax", y = "ay"), b = c(x = "bx", y = "by")) -test(1581.17, DT[ , as.list(l[[f1]])[[f2]], by=c("f1","f2")], - data.table(f1 = c("a", "b"), f2 = c("x", "y"), V1 = c("ax", "by"))) -test(1581.18, DT[, v:=l[[f1]][f2], by=c("f1","f2")], - data.table(f1=c("a","b"), f2=c("x","y"), v=c("ax", "by"))) -# When the object being [[ is in parent.frame(), not x, -# need eval to have enclos=parent.frame(), #4612 -DT = data.table(id = c(1, 1, 2), value = c("a", "b", "c")) -DT0 = copy(DT) -fun = function(DT, tag = c("A", "B")) DT[, var := tag[[.GRP]], by = "id"] -fun(DT) -test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')]) - -# bug fix #1461 related to NaN not being recognized due to ISNA vs ISNAN at C level -# verbatim test from the original report: -DT = data.table( - C1 = c(rep("A", 4), rep("B",4), rep("C", 4)), - C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), - Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) -opt = 0:2 -test(1583.1,optimize=opt, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], - data.table(C1=c("A","A","B","B","C","C"), - C2=c("a","b","b","c","c","d"), - agg=c(1,4,5,8,9,10))) -# extra test with a size-1 group containing one NaN too -DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) -test(1583.2,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) -test(1583.3,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) -test(1583.4,optimize=opt, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) -test(1583.5,optimize=opt, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) - -# FR #523, var, sd and prod -DT = data.table(x=sample(5, 100, TRUE), - y1=sample(6, 100, TRUE), - y2=sample(c(1:10,NA), 100, TRUE), - z1=runif(100), - z2=sample(c(runif(10),NA,NaN), 100, TRUE)) -opt = 0:2 -out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") -test(1594.01,optimize=opt, DT[, lapply(.SD, var, na.rm=FALSE), by=x]) -test(1594.02,optimize=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x]) -test(1594.03,optimize=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output=out) -# coverage: default group .N=1 case -idx=DT[ , .I[1L], by=x]$V1 -ans=data.table(x=DT[(idx), x], V1=NA_real_) -test(1594.05,optimize=opt, DT[(idx), var(y1), by=x], ans) -test(1594.06,optimize=opt, DT[(idx), var(y1, na.rm=TRUE), by=x], ans) -test(1594.07,optimize=opt, DT[(idx), var(z1), by=x], ans) -test(1594.08,optimize=opt, DT[(idx), var(z1, na.rm=TRUE), by=x], ans) - -test(1594.09,optimize=opt,DT[, lapply(.SD, sd, na.rm=FALSE), by=x]) -test(1594.10,optimize=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) -test(1594.11,optimize=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, 
verbose=TRUE], output=out) - -test(1594.12,optimize=opt, DT[, lapply(.SD, prod, na.rm=FALSE), by=x]) -test(1594.13,optimize=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x]) -test(1594.14,optimize=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output=out) - -# when datatable.optimize<1, no optimisation of j should take place: -dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) -test(1638, options=c(datatable.optimize=0L), dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off") - -# weighted.mean GForce optimized, #3977 -DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -opt = c(1L,2L) -out = c("GForce FALSE", "GForce optimized j to") -test(2231.01,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output=out) -test(2231.02,optimize=opt, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output=out) -test(2231.03,optimize=opt, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) -# multiple groups -DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.04,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) -test(2231.05,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) -test(2231.06,optimize=opt, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output=out) -# (only x XOR w) containing NA -DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.07,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) -test(2231.08,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output=out) -test(2231.09,optimize=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.10,optimize=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) -# (only x XOR w) containing NaN -DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.11,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output=out) -test(2231.12,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) -test(2231.13,optimize=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output=out) -test(2231.14,optimize=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) -# (only x XOR w) containing NA and NaN -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.15,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) -test(2231.16,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) -test(2231.17,optimize=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.18,optimize=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), 
output=out) -# (x and w) containing NA and NaN -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.19,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.20,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.21,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.22,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) -# tests 2231.31-2231.52 were subsumed in 2231.01-2231.22 for testing different optimization levels -# let wrongly named arguments get lost in ellipsis #5543 -DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -test(2231.61,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) -test(2231.62,optimize=opt, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) -test(2231.63,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) -test(2231.64,optimize=opt, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) - -# GForce retains attributes in by arguments #5567 -dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) -opt = c(0,Inf) -out = c("GForce FALSE", "GForce optimized j to") -test(2263.1,optimize=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) -# test 2263.1 subsumes 2263.1 and 2263.4 for different optimization levels -test(2263.2,optimize=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) -# test 2263.2 subsumes 2263.2 and 2263.5 for different optimization levels -test(2263.3,optimize=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) -# test 2263.3 subsumes 2263.3 and 2263.6 for different optimization levels - -# named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 -M <- as.data.table(mtcars) -M[, " " := hp] -M[, "." 
:= hp] - -sdnames <- setdiff(names(M), "cyl") -sdlist <- vector("list", length(sdnames)) -names(sdlist) <- sdnames - -opts = 0:2 -test(2283 + 0.01, optimize=opts, - names(M[, c(m=lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(m=sdlist))), - context=sprintf("optimize=%s [I]", format(opt))) -test(2283 + 0.02, optimize=opts, - names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", "Mpg", sdnames), - context=sprintf("optimize=%s [II]", format(opt))) -test(2283 + 0.03, optimize=opts, - names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), - c("cyl", "Mpg", names(c(m=sdlist))), - context=sprintf("optimize=%s [III]", format(opt))) -test(2283 + 0.04, optimize=opts, - names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), - c("cyl", "mpg", names(c(mpg=sdlist))), - context=sprintf("optimize=%s [IV]", format(opt))) -test(2283 + 0.05, optimize=opts, - names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", "V1", sdnames), - context=sprintf("optimize=%s [V]", format(opt))) -test(2283 + 0.06, optimize=opts, - names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), - c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L)), - context=sprintf("optimize=%s [VI]", format(opt))) -test(2283 + 0.07, optimize=opts, - names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), - c("cyl", sdnames, sdnames), - context=sprintf("optimize=%s [VII]", format(opt))) -test(2283 + 0.08, optimize=opts, - names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), - c("cyl", names(c(mean=sdlist, sum=sdlist))), - context=sprintf("optimize=%s [VIII]", format(opt))) -test(2283 + 0.09, optimize=opts, - names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), - c("cyl", sdnames, names(c(sum=sdlist))), - context=sprintf("optimize=%s [IX]", format(opt))) -test(2283 + 0.10, optimize=opts, - names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), - c("cyl", names(c(" "=sdlist, "."=sdlist))), - context=sprintf("optimize=%s [X]", format(opt))) -test(2283 + 0.11, optimize=opts, - names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(a=0, b=0))), sdnames), - context=sprintf("optimize=%s [XI]", format(opt))) -test(2283 + 0.12, optimize=opts, - names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0, 0))), sdnames), - context=sprintf("optimize=%s [XII]", format(opt))) -test(2283 + 0.13, optimize=opts, - names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0, b=0, 0))), sdnames), - context=sprintf("optimize=%s [XIII]", format(opt))) -test(2283 + 0.14, optimize=opts, - names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0))), sdnames), - context=sprintf("optimize=%s [XIV]", format(opt))) -test(2283 + 0.15, optimize=opts, - names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames), - context=sprintf("optimize=%s [XV]", format(opt))) -test(2283 + 0.16, optimize=opts, - names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames), - context=sprintf("optimize=%s [XVI]", format(opt))) -test(2283 + 0.17, optimize=opts, - names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), - c("cyl", "V1", "b", "vs", "am"), - context=sprintf("optimize=%s [XVII]", format(opt))) -test(2283 + 0.18, optimize=opts, - names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", 
.SDcols=c("vs", "am")]), - c("cyl", "V1", "b", "vs", "am"), - context=sprintf("optimize=%s [XVIII]", format(opt))) -test(2283 + 0.19, optimize=opts, - names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), - c("cyl", "V1", "V2", "b", "vs", "am"), - context=sprintf("optimize=%s [XIX]", format(opt))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4c043b2675..c4c20b1776 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -78,7 +78,6 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { test = data.table:::test uniqlengths = data.table:::uniqlengths uniqlist = data.table:::uniqlist - utf8_check = data.table:::utf8_check warningf = data.table:::warningf which_ = data.table:::which_ which.first = data.table:::which.first @@ -1852,7 +1851,53 @@ x = sample(LETTERS,1000,replace=TRUE) test(610.3, chorder(x), base::order(x, method="radix")) test(610.4, unique(x[chgroup(x)]), unique(x)) -# tests 611-623 moved to optimize.Rraw +# := by group +options(datatable.optimize=0L) +DT = data.table(a=1:3,b=(1:9)/10) +test(611.1, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +setkey(DT,a) +test(611.2, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +# Combining := by group with i +test(611.3, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(611.4, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) +options(datatable.optimize=2L) +DT = data.table(a=1:3,b=(1:9)/10) +test(612.1, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +setkey(DT,a) +test(612.2, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +# Combining := by group with i +test(612.3, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(612.4, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) +# Assign to subset ok (NA initialized in the other items) ok : +test(613, DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) +test(614, DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) +test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) + +# Empty i clause, #2034. Thanks to Chris for testing, tests from him. 
Plus changes from #759 +ans = copy(DT)[,r:=NA_real_] +options(datatable.optimize=0L) +test(618.1, copy(DT)[a>3,r:=sum(b)], ans) +test(618.2, copy(DT)[J(-1),r:=sum(b)], ans) +test(618.3, copy(DT)[NA,r:=sum(b)], ans) +test(618.4, copy(DT)[0,r:=sum(b)], ans) +test(618.5, copy(DT)[NULL,r:=sum(b)], null.data.table()) +options(datatable.optimize=2L) +test(619.1, copy(DT)[a>3,r:=sum(b)], ans) +test(619.2, copy(DT)[J(-1),r:=sum(b)], ans) +test(619.3, copy(DT)[NA,r:=sum(b)], ans) +test(619.4, copy(DT)[0,r:=sum(b)], ans) +test(619.5, copy(DT)[NULL,r:=sum(b)], null.data.table()) + +DT = data.table(x=letters, key="x") +test(621, copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained +test(622, copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") + +set.seed(2) +DT = data.table(a=rnorm(5)*10, b=1:5) +options(datatable.optimize=0L) +test(623.1, copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +options(datatable.optimize=2L) +test(623.2, copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) # Tests on POSIXct attributes @@ -1974,10 +2019,20 @@ setnames(ans2,"x","V1") setnames(ans2,"y","V2") test(654, ans1, ans2) -# test 656-658 moved to optimize.Rraw -# test is not testing what it should since #2671 -# tt = capture.output(DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose=TRUE]) -# test(659, !length(grep("Wrote less rows", tt))) # first group is one row with this seed. Ensure we treat this as aggregate case rather than allocate too many rows. +options(datatable.optimize = 0L) +test(656.1, DT[ , mean(x), by=grp1, verbose=TRUE], output='(GForce FALSE)') +test(656.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="(GForce FALSE)") +test(656.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="(GForce FALSE)") +options(datatable.optimize = 1L) +test(657.1, DT[ , mean(x), by=grp1, verbose=TRUE], output='(GForce FALSE)') +test(657.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="(GForce FALSE)") +test(657.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="(GForce FALSE)") +options(datatable.optimize = 2L) +test(658.1, DT[ , mean(x), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean") +test(658.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean") +test(658.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean") +# first group is one row with this seed. Ensure we treat this as aggregate case rather than allocate too many rows. 
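# --- Illustrative sketch (editor's addition, not part of the patch) ----------
# Tests 618.x/619.x above check the empty-i behaviour of ':=' at optimization
# levels 0 and 2: when i selects no rows, the assignment still allocates the new
# column and leaves it all-NA. A minimal reproduction, assuming data.table is
# attached; DT here is a stand-in for the fixture used by those tests.
library(data.table)
DT = data.table(a = 1:3, b = (1:9)/10)
DT[a > 99, r := sum(b)]   # no row matches, but column 'r' is still created
all(is.na(DT$r))          # expected TRUE: 'r' is NA for every row
# (test 659 below continues with the one-row-first-group case flagged just above)
# -----------------------------------------------------------------------------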
+test(659, DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose=TRUE], notOutput = "Wrote less rows") # Test .N for logical i subset DT = data.table(a=1:10, b=rnorm(10)) @@ -2007,7 +2062,33 @@ test(667, DT[a<3,sum(b),by=paste("a")], error='Otherwise, by=eval(paste("a")) sh test(668, DT[a<3,sum(b),by=eval(paste("a"))], DT[a<3,sum(b),by=a]) test(669, DT[a<3,sum(b),by=c(2)], error="must evaluate to 'character'") -# tests 670+671 moved to optimize.Rraw +# Test := keyby does setkey, #2065 +options(datatable.optimize=0L) +DT = data.table(x=1:2, y=1:6) +ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") +test(670.1, DT[,z:=sum(y),keyby=x], ans) +DT = data.table(x=1:2, y=1:6) +test(670.2, DT[,z:=sum(y),keyby="x"], ans) +DT = data.table(x=1:2, y=1:6) +test(670.3, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), + warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") +DT = data.table(x=1:2, y=1:6) +test(670.4, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +DT = data.table(x=1:2, y=1:6) +test(670.5, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +options(datatable.optimize=2L) +DT = data.table(x=1:2, y=1:6) +ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") +test(671.1, DT[,z:=sum(y),keyby=x], ans) +DT = data.table(x=1:2, y=1:6) +test(671.2, DT[,z:=sum(y),keyby="x"], ans) +DT = data.table(x=1:2, y=1:6) +test(671.3, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), + warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") +DT = data.table(x=1:2, y=1:6) +test(671.4, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +DT = data.table(x=1:2, y=1:6) +test(671.5, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") # Test new .() DT = data.table(x=1:2, y=1:6, key="x") @@ -2192,7 +2273,23 @@ test(749, DT[,c("c","d","e"):=list(.N,sum(b),a*10L),by=a], data.table(a=rep(6:8, test(750.1, copy(DT)[a<8,`:=`(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) test(750.2, copy(DT)[a<8,let(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) -# tests 751, 752 moved to optimize.Rraw +# varname holding colnames, by group, linked from #2120. 
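# --- Illustrative sketch (editor's addition, not part of the patch) ----------
# Tests 670.x/671.x above cover ':=' combined with keyby=: the grouped
# assignment runs first and the result is then keyed, while keyby= on an
# expression together with ':=' skips the setkey and warns (670.3/671.3).
# Minimal check, assuming data.table is attached; DT is a stand-in for the
# fixture used by those tests.
library(data.table)
DT = data.table(x = 1:2, y = 1:6)
DT[, z := sum(y), keyby = x]
key(DT)   # expected "x": keyby= performs the setkey once ':=' has run
# (the varname-as-column-name tests flagged just above continue below)
# -----------------------------------------------------------------------------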
+options(datatable.optimize=0L) +DT = data.table(a=rep(1:3,1:3),b=1:6) +colname = "newcol" +test(751.1, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) +options(datatable.optimize=2L) +DT = data.table(a=rep(1:3,1:3),b=1:6) +colname = "newcol" +test(751.2, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) + +# Add tests for nested := in j by group, #1987 +options(datatable.optimize=0L) +DT = data.table(a=rep(1:3,2:4),b=1:9) +test(752.1, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +options(datatable.optimize=2L) +DT = data.table(a=rep(1:3,2:4),b=1:9) +test(752.2, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) # Test duplicate() of recycled plonking RHS, #2298 DT = data.table(a=letters[3:1],x=1:3) @@ -2494,8 +2591,18 @@ test(864.3, rbindlist(list(data.table(logical(0),logical(0)), DT<-data.table(baz message="Column 1 [[]'baz'[]] of item 2 is missing in item 1.*Use fill=TRUE.*or use.names=FALSE.*v1.12.2") # Steve's find that setnames failed for numeric 'old' when pointing to duplicated names -# tests 865-868 moved to optimize.Rraw - +DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) +options(datatable.optimize = 0L) +test(865.1, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") +options(datatable.optimize = 1L) +test(865.2, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") +options(datatable.optimize = 2L) +test(865.3, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], + output="GForce optimized.*gsum[(]v[)], gsum[(]w[)]") # v1.9.7 treats wrapped {} better, so this is now optimized +options(datatable.optimize = Inf) +test(866, names(ans1), c("a","b","name1","name2")) +test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a","b","name1","name2")) # list names extracted here +test(868, ans1, ans2) # and related to setnames, too DT = data.table(a=1:3,b=1:6,key="a") test(869, DT[J(2,42,84),print(.SD),by=.EACHI], output=" b\n.*1.*2\n2:.*5.*Empty data.table [(]0 rows and 3 cols[)]: a,V2,V3") # .* for when verbose mode @@ -3462,35 +3569,7 @@ DT[,`:=`(last.x=tail(x,1L),last.x1=tail(x1,1L)),by=y] test(1086, class(DT$last.x), c("POSIXct", "POSIXt")) test(1087, class(DT$last.x1), "ITime") -# chmatch on 'unknown' encoding (e.g. as.character(as.symbol("\u00E4")) )falling back to match, #2538 and #4818 -local({ -x1 = c("al\u00E4", "ala", "\u00E4allc", "coep") -x2 = c("ala", "al\u00E4") -if (utf8_check(c(x1,x2))) { - tstc = function(y) unlist(lapply(y, function(x) as.character(as.name(x))), use.names=FALSE) - test(1088.1, chmatch(x1, x2), match(x1, x2)) # should not fallback to "match" - test(1088.2, x1 %chin% x2, x1 %in% x2) - # change x1 to symbol to character - test(1089.1, chmatch(tstc(x1), x2), match(tstc(x1), x2)) # should fallback to match in "x" - test(1089.2, tstc(x1) %chin% x2, tstc(x1) %in% x2) # should fallback to match in "x" - # change x2 to symbol to character - test(1090.1, chmatch(x1,tstc(x2)), match(x1, tstc(x2))) # should fallback to match in "table" - test(1090.2, x1 %chin% tstc(x2), x1 %in% tstc(x2)) - # both are symbols to characters - test(1091.1, chmatch(tstc(x1), tstc(x2)), match(tstc(x1), tstc(x2))) # should fallback to "match" in "x" as well. 
- test(1091.2, tstc(x1) %chin% tstc(x2), tstc(x1) %in% tstc(x2)) -} else cat("Tests 1088-1091 skipped because required UTF-8 symbols cannot be represented in native encoding.\n") -}) -# for completeness, include test from #2528 of non ascii LHS of := (it could feasibly fail in future due to something other than chmatch) - -local(if (utf8_check("\u00E4")) { -eval(parse(text=' # eval(parse()) defers parsing to runtime; see utf8_check description - DT = data.table(pas = c(1:5, NA, 6:10), good = c(1:10, NA)) - setnames(DT, "pas", "p\u00E4s") - test(1092, DT[is.na(p\u00E4s), p\u00E4s := 99L], data.table("p\u00E4s" = c(1:5, 99L, 6:10), good = c(1:10,NA))) - test(1093, DT[, p\u00E4s := 34L], data.table("p\u00E4s" = 34L, good=c(1:10,NA))) -')) -} else cat("Tests 1092+1093 skipped because required UTF-8 symbols cannot be represented in native encoding.\n")) +# Tests 1088-1093 were non-ASCII. Now in DtNonAsciiTests # print of unnamed DT with >20 <= 100 rows, #97 (RF#4934) DT <- data.table(x=1:25, y=letters[1:25]) @@ -3629,9 +3708,8 @@ test(1100, dt1[dt2,roll=-Inf,rollends=c(FALSE,TRUE)]$ind, INT(NA,NA,1,2,2,2,2,2, # issue 5824 - tests for "value.var.in.dots", "value.var.in.LHSdots", "value.var.in.RHSdots" arguments of dcast DT = data.table(index = c("a","b"), x = 1:2) - test(1102.1810, dcast(DT, ... ~ index, fun.aggregate = length, value.var.in.dots = NA), error = "'value.var.in.dots' must be TRUE or FALSE") - test(1102.1821, dcast(DT, ... ~ index, fun.aggregate = length, value.var.in.LHSdots = NA), error = "'value.var.in.LHSdots' must be TRUE or FALSE") - test(1102.1822, dcast(DT, ... ~ index, fun.aggregate = length, value.var.in.RHSdots = NA), error = "'value.var.in.RHSdots' must be TRUE or FALSE") + test(1102.181, dcast(DT, ... ~ index, fun.aggregate = length, value.var.in.dots = NA), error = "Argument 'value.var.in.dots' should be logical TRUE/FALSE") + test(1102.182, dcast(DT, ... ~ index, fun.aggregate = length, value.var.in.LHSdots = NA), error = "Arguments 'value.var.in.LHSdots', 'value.var.in.RHSdots' should be logical TRUE/FALSE") test(1102.183, dcast(DT, ... ~ index, fun.aggregate = length, value.var.in.dots = TRUE), data.table(x = 1:2, a = 1:0, b = 0:1, key = "x")) test(1102.184, dcast(DT, ... ~ index, fun.aggregate = length, value.var = "index", value.var.in.dots = TRUE), data.table(index = c("a","b"), x = 1:2, a = 1:0, b = 0:1, key = c("index", "x"))) test(1102.185, dcast(DT, ... 
+ index ~ index, fun.aggregate = length, value.var = "index", value.var.in.dots = TRUE), data.table(index = c("a","b"), x = 1:2, a = 1:0, b = 0:1, key = c("index", "x"))) @@ -3921,8 +3999,10 @@ test(1133.3, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L)) test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2))) -test(1133.7,optimize=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) -# test 1133.7 subsumes 1133.7 and 1133.75 for testing both levels +options(datatable.optimize=0L) +test(1133.7, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) +options(datatable.optimize=2L) +test(1133.75, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) # on a new column with warning on 2nd assign DT[,new:=NULL] test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)), @@ -4034,9 +4114,12 @@ DT<-data.table(X=factor(2006:2012),Y=rep(1:7,2)) test(1143.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(2006:2012),Y=rep(1:7,2), Z=paste(as.character(2006:2012), 2L, sep=" - "))) DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) -test(1143.4,optimize=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -test(1143.5,optimize=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -# tests 1143.4 and 1143.5 subsume 1143.4, 1143.5, 1143.6 and 1143.7 for testing both levels +options(datatable.optimize=0L) +test(1143.4, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.5, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +options(datatable.optimize=2L) +test(1143.6, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.7, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) # FR #2356 - retain names of named vector as column with keep.rownames=TRUE x <- 1:5 @@ -4247,10 +4330,7 @@ test(1162.24, is.sorted(rep(NA_character_, 2))) x <- character(0) test(1163, last(x), character(0)) -# Bug fix for #5159 - chmatch and character encoding (for some reason this seems to pass the test on a mac as well) -a = c("a","\u00E4","\u00DF","z") -au = iconv(a,"UTF8","latin1") -test(1164.1, requires_utf8=c("\u00E4", "\u00DF"), chmatch(a, au), match(a, au)) +# Test 1164 was a non-ASCII test, now in DtNonAsciiTests # Bug fix for #73 - segfault when rbindlist on empty data.tables x <- as.data.table(BOD) @@ -4312,7 +4392,40 @@ test(1181, forderv(INT(1,3,5000000,NA)), INT(4,1,2,3)) test(1182, forderv(INT(1,-1,5000000,NA)), INT(4,2,1,3)) test(1183, forderv(INT(-3,-7,1,-6000000,NA,3,5000000,NA,8)), INT(5,8,4,2,1,3,6,9,7)) -# tests 1184-1187 moved to optimize.Rraw +# tests of gsum and gmean with NA +DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12)) +set(DT,c(3L,8L),"y",NA) +set(DT,c(5L,9L),"v",NA) +set(DT,10:12,"y",NA) +set(DT,10:12,"v",NA) +options(datatable.optimize=1) # turn off GForce +test(1184.1, DT[, sum(v), by=x, verbose=TRUE], output="(GForce FALSE)") +test(1184.2, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") +test(1185.1, DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), 
sum(v,na.rm=TRUE)), by=x], + data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) +options(datatable.optimize=0) # turn off fastmean optimization to get the answer to match to +test(1185.2, ans <- DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], output="All optimizations.*off") +options(datatable.optimize=1) # turn on old fastmean optimization only +test(1185.3, DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], ans, output="Old mean.*changed j") +options(datatable.optimize=Inf) # turn on GForce +test(1185.4, DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], ans, output="GForce optimized j to") +test(1186, DT[, sum(v), by=x, verbose=TRUE], output="GForce optimized j to") +test(1187.1, DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], + data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) +MyVar = TRUE +test(1187.2, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output="GForce optimized j to", + DT[, list(sum(y,na.rm=TRUE), mean(y,na.rm=TRUE)), by=x]) +test(1187.3, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output="GForce optimized j to", + DT[, mean(y,na.rm=TRUE), by=x]) +MyVar = FALSE +test(1187.4, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output="GForce optimized j to", + DT[, list(sum(y,na.rm=FALSE), mean(y,na.rm=FALSE)), by=x]) +test(1187.5, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output="GForce optimized j to", + DT[, mean(y,na.rm=FALSE), by=x]) +# GForce should not turn on when the .ok function isn't triggered +test(1187.6, DT[, mean(y, trim=.2), by=x, verbose=TRUE], + data.table(x = c("a", "b", "c", "d"), V1 = c(NA, 3.33333333333333, NA, NA)), + output='j unchanged', warning="'trim' is not yet optimized") # test from Zach Mayer a <- c("\"\"\"\") \" \" \" \" \"", "\"\"\"\") \" \" \" \" \"", "\"\"\"\") \" \" \" \" ,\"") @@ -4503,42 +4616,22 @@ test(1228.4, class(DT), class(DT[, sum(b), by=a])) test(1228.5, class(DT), class(DT[a>1, sum(b), by=a])) test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) -# savetl_init error after error, in v1.9.2, thanks Arun -DT = data.table(x=1:5, y=10:6) -test(1229.1, DT[forderv(DT, -1)], error="non-existing column") -test(1229.2, setkey(DT), data.table(x=1:5, y=10:6, key="x,y")) -# umlaut in column names (red herring I think, but testing anyway) -local(if (utf8_check("\u00fc")) { - eval(parse(text = ' # eval(parse()) defers parsing to runtime; see utf8_check description - sentEx = data.table(abend = c(1, 1, 0, 0, 2), - aber = c(0, 1, 0, 0, 0), - "\u00FCber" = c(1, 0, 0, 0, 0), - "\u00FCberall" = c(0, 0, 0, 0, 0), - "\u00FCberlegt" = c(0, 0, 0, 0, 0), - ID = structure(c(1L, 1L, 2L, 2L, 2L), .Label = c("0019", "0021"), class = "factor"), - abgeandert = c(1, 1, 1, 0, 0), - abgebildet = c(0, 0, 1, 1, 0), - abgelegt = c(0, 0, 0, 0, 3)) - test(1229.3, sentEx[, lapply(.SD, sum), by=ID], data.table(ID=factor(c("0019","0021")), abend=c(2,2), aber=c(1,0), "\u00FCber"=c(1,0), - "\u00FCberall"=c(0,0), "\u00FCberlegt" = c(0,0), abgeandert=c(2,1), abgebildet = c(0,2), abgelegt=c(0,3))) - ')) -} else { - cat("Test 1229.3 skipped because required UTF-8 symbols cannot be represented in native encoding.\n") -}) +# test 1229 was non-ASCII, now in package DtNonAsciiTests # Test that ad hoc by detects if ordered and dogroups switches to memcpy if 
contiguous, #1050 DT = data.table(a=1:3,b=1:6,key="a") -# turn off GForce, to test dogroups -test(1230,optimize=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +options(datatable.optimize=1) # turn off GForce, to test dogroups +test(1230, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") setkey(DT,NULL) -test(1231,optimize=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") -test(1232,optimize=1L, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") -test(1233,optimize=1L, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") -test(1234,optimize=1L, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized +test(1231, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +test(1232, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") +test(1233, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") +test(1234, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized setkey(DT,a) -test(1235,optimize=1L, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") -test(1236,optimize=1L, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") -test(1237,optimize=1L, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1235, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1236, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") +test(1237, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +options(datatable.optimize=Inf) # check that key is not preserved when length of fastorder is > 0 DT <- data.table(x=1:5, y=6:10, key="x") @@ -4805,24 +4898,47 @@ set.seed(45L) dt = data.table(a=sample(2,10,TRUE), b=sample(3,10,TRUE), c=sample(4,10,TRUE), d=sample(5,10,TRUE)) dt2 = data.table(x=c(1,1,1,2,2,2), y=1:6) -gf_out = c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "GForce optimized j to") -lp_out = c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "lapply optimization changed j") -opt = c(0L, 1L, Inf) +options(datatable.optimize=0L) # auto-naming behavior is different for no-optimization case; just check optimization is off -test(1268.01,optimize=opt, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = gf_out) -test(1268.02,optimize=opt, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = lp_out) -test(1268.03,optimize=opt, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) -test(1268.04,optimize=opt, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output= gf_out) -test(1268.05,optimize=opt, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) +test(1268.01, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off') +test(1268.02, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off') +test(1268.03, dt[, c(list(c), lapply(.SD, mean)), by=a, 
verbose=TRUE], output="All optimizations are turned off") +test(1268.04, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.05, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") # newly added tests for #861 -- optimise, but no GForce -test(1268.06,optimize=opt, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) +test(1268.06, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") # don't optimise .I in c(...) -test(1268.07,optimize=opt, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], - output= c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "lapply optimization is on, j unchanged as")) -# tests .08-.21 were different optimization levels -test(1268.22,optimize=opt, dt[, c(as.list(c), lapply(.SD, mean)), by=a], +test(1268.07, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off") + +options(datatable.optimize=1L) +test(1268.08, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)") +test(1268.09, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.10, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE') +test(1268.11, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.12, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.13, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.14, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE") + +options(datatable.optimize=Inf) +test(1268.15, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1, + output="GForce optimized j to 'list(gmean(b), gmean(c), gmean(d), gsum(b), gsum(c), gsum(d))'") +test(1268.16, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, + output = "lapply optimization changed j from 'c(lapply(.SD, mean), .N)' to 'list(mean(b), mean(c), mean(d), .N)'") +test(1268.17, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3, + output = "lapply optimization changed j from 'c(list(c), lapply(.SD, mean))' to 'list(c, mean(b), mean(c), mean(d))") +test(1268.18, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4, + output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") +test(1268.19, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5, + output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") +test(1268.20, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6, + output = "lapply optimization changed j from 'c(list(sum(d), .I), lapply(.SD, mean))' to 'list(sum(d), .I, mean(b), mean(c), mean(d))'") +test(1268.21, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7, + output = "lapply optimization is on, j unchanged as 'c(.I, lapply(.SD, mean))'") + +test(1268.22, dt[, c(as.list(c), lapply(.SD, mean)), by=a], error = "j doesn't evaluate to the same number of columns for each group") + ### FR #2722 tests end here ### # Wide range numeric and integer64, to test all bits @@ -5162,7 
+5278,21 @@ set(DT,1L,"b",3L) test(1302, 0L[1L], 3L-3L) test(1303, 0L, 3L-3L) -# test 1304 moved to optimize.Rraw +# FR #334. Test to just make sure that GForce and dogroups with .N are giving the same results. +set.seed(2L) +dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10)) +options(datatable.optimize = 1L) +ans1 <- dt[, list(.N, sum(y)), by=x] +options(datatable.optimize = 2L) +ans2 <- dt[, list(.N, sum(y)), by=x] +test(1304.1, ans1, ans2) + +dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x") +options(datatable.optimize = 1L) +ans1 <- dt[, list(.N, sum(y)), by=x] +options(datatable.optimize = 2L) +ans2 <- dt[, list(.N, sum(y)), by=x] +test(1304.2, ans1, ans2) # FR #338 DT <- data.table(x=1:5, y=6:10) @@ -5211,7 +5341,59 @@ DT = data.table(a=1:3,b=6:1) test(1312, DT[,setkey(.SD),by=a], error="Setting a physical key on .SD is reserved for possible future use") # was warning "Already keyed by this key but had invalid row order" due to the key not being cleared after the previous group. A solution could have been to put back the original key on populating .SD for each group. But instead we reserve it for future use and push the user towards doing it a different more efficient way (see Arun's speedups in the datatable-help thread). -# test 1313 moved to optimize.Rraw +# gmin and gmax extensive testing (because there are tricky cases) +DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647)) +# make sure GForce is running +options(datatable.optimize=3L) + +# for integers +test(1313.01, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.02, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.03, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.04, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) +DT[x==6, y := INT(NA)] +test(1313.05, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.06, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) +test(1313.08, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) + +# for numeric +DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) +test(1313.09, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.10, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.11, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.12, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) +DT[x==6, y := NA_real_] +test(1313.13, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.14, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) +test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) + +# for date (attribute check.. especially after issues/689 !!!) 
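# --- Illustrative sketch (editor's addition, not part of the patch) ----------
# Tests 1313.07/1313.15 above pin down the all-NA corner case for GForce
# min()/max(): with na.rm=TRUE, an all-NA group is expected to come back as NA
# rather than the warning-plus-Inf that base::min() gives on an empty set.
# Assumes data.table is attached; DT is a stand-in for the fixture above.
library(data.table)
DT = data.table(g = rep(1:2, each = 3L), y = c(4, -1, 0, NA, NA, NA))
old = options(datatable.optimize = 2L)  # ensure GForce handles the min() below
DT[, min(y, na.rm = TRUE), by = g]      # expected V1: -1 for g==1, NA for g==2
options(old)                            # restore the previous optimization level
# (the date/POSIXct attribute checks flagged just above continue below)
# -----------------------------------------------------------------------------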
+DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) +test(1313.17, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.18, DT[, list(y=max(y)), by=x], DT[c(5,10)]) +DT[c(1,6), y := NA] +test(1313.19, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.20, DT[, list(y=max(y)), by=x], DT[c(1,6)]) +test(1313.21, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) +test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) + +# for character +set.seed(1L) +DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) +DT[x==7, y := c("","b","c")] +test(1313.23, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.24, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.25, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +DT[x==6, y := NA_character_] +test(1313.27, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.28, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) +test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = @@ -6324,8 +6506,10 @@ test(1437.17, DT[!a %chin% c("A", "B") & x == 2], DT[c(4, 5, 6)]) test(1437.18, DT[x == 2, .(test = x+y), verbose = TRUE], output = "Optimized subsetting") test(1437.19, DT[x == 2, test := x+y, verbose = TRUE], output = "Optimized subsetting") ## optimize option level 3 is required to get optimized subsetting -test(1437.21,optimize=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) -# test 1437.21 subsumes 1437.21 and 1437.22 +options(datatable.optimize = 2L) +test(1437.21, DT[x == 2, verbose = TRUE], output = "^ x y") +options(datatable.optimize = Inf) +test(1437.22, DT[x == 2, verbose = TRUE], output = "Optimized subsetting") ## NaN on right hand side is treated correctly. 
NA on right hand side is not reaching .prepareFastSubset, so not tested here DT <- data.table(x = c(1L:10L, NA_integer_, NA_integer_), y = c(1:10, NA_real_, NaN)) test(1437.23, DT[y == NaN], DT[0]) @@ -6410,22 +6594,35 @@ if (.Machine$sizeof.pointer>4) { # temporarily disabled for 32bit, #2767 for (t in seq_len(nrow(all))) { ## test the query with missing j thisQuery <- all$query[t] + options("datatable.optimize" = 3L) + ansOpt <- DT[eval(parse(text = thisQuery))] + options("datatable.optimize" = 2L) + ansRef <- DT[eval(parse(text = thisQuery))] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, optimize=c(2L,3L), DT[eval(parse(text = thisQuery))], context=sprintf("t=%d [I]", t)) + test(1438.0 + test_no*0.0001, ansOpt, ansRef, context=sprintf("t=%d [I]", t)) ## repeat the test with 'which = TRUE' + options("datatable.optimize" = 3L) + ansOpt <- DT[eval(parse(text = thisQuery)), which = TRUE] + options("datatable.optimize" = 2L) + ansRef <- DT[eval(parse(text = thisQuery)), which = TRUE] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, optimize=c(2L,3L), DT[eval(parse(text = thisQuery)), which = TRUE], context=sprintf("t=%d [II]", t)) + test(1438.0 + test_no*0.0001, ansOpt, ansRef, context=sprintf("t=%d [II]", t)) ## repeat the test with the j queries for (thisJquery in jQueries) { ## do it with and without existing "by" - for(thisBy in bys){ + for (thisBy in bys) { + options("datatable.optimize" = 3L) + ansOpt <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] + options("datatable.optimize" = 2L) + ansRef <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, optimize=c(2L,3L), DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy], context=sprintf("t=%d, thisJquery=%s, thisBy=%s", t, thisJquery, thisBy)) + test(1438.0 + test_no*0.0001, ansOpt, ansRef, context=sprintf("t=%d, thisJquery=%s, thisBy=%s", t, thisJquery, thisBy)) } } } } +options(datatable.optimize = Inf) # fread dec=',' e.g. 
France test(1439, fread("A;B\n1;2,34\n", dec="12"), error=base_messages$stopifnot("nchar(dec) == 1L")) @@ -6844,7 +7041,7 @@ test(1477.08, transpose(1:5), error="l must be a list") test(1477.09, transpose(list(as.complex(c(1, 1+5i)))), error="Unsupported column type") test(1477.10, transpose(list(x~y)), error="Item 1 of list input is") test(1477.11, transpose(as.list(1:5), fill=1:2), error="fill must be a length 1 vector") -test(1477.12, transpose(as.list(1:5), ignore.empty=NA), error="'ignore.empty' must be TRUE or FALSE") +test(1477.12, transpose(as.list(1:5), ignore.empty=NA), error="ignore.empty should be logical TRUE/FALSE") test(1477.13, transpose(list()), list()) # return list columns #5639 la = list(as.list(1:3), list("a","b","c")) @@ -6857,7 +7054,7 @@ test(1477.18, transpose(list(list(1L,"a"), list(2L), list(3L,"c")), list.cols=TR test(1477.19, transpose(list(1:2, c("a","b","c")), list.cols=TRUE, fill=3L), lb) test(1477.20, transpose(list(factor(letters[1:3])), list.cols=TRUE), list(list("a"), list("b"), list("c"))) test(1477.21, transpose(list(factor(letters[1:3])), list.cols=FALSE), list("a", "b", "c")) -test(1477.22, transpose(la, list.cols=NA), error="'list.cols' must be TRUE or FALSE") +test(1477.22, transpose(la, list.cols=NA), error="list.cols should be logical TRUE/FALSE.") # #480 `setDT` and 'lapply' ll = list(data.frame(a=1), data.frame(x=1, y=2), NULL, list()) @@ -7712,8 +7909,10 @@ test(1547, foo(1L, 5L, a=2L, "c"), c("2", "c")) # Fix for encoding issues in windows, #563 f = testDir("issue_563_fread.txt") -test(1548.1, requires_utf8=TRUE, unique(unlist(lapply(fread(f, sep=",", header=TRUE), Encoding))), "unknown") -test(1548.2, requires_utf8=TRUE, unique(unlist(lapply(fread(f, sep=",", header=TRUE, encoding="UTF-8"), Encoding))), "UTF-8") +ans1 <- fread(f, sep=",", header=TRUE) +ans2 <- fread(f, sep=",", header=TRUE, encoding="UTF-8") +test(1548.1, unique(unlist(lapply(ans1, Encoding))), "unknown") +test(1548.2, unique(unlist(lapply(ans2, Encoding))), "UTF-8") # 1549 moved to benchmark.Rraw, #5517 @@ -7957,7 +8156,14 @@ test(1564.1, truelength(dt[, .SD]), 1025L) test(1564.2, truelength(dt[a==5, .SD]), 1025L) test(1564.3, dt[a==5, .SD][, b := 1L], data.table(a=5L, b=1L)) -# test 1565 moved to optimize.Rraw +# Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now +dt = data.table(a=sample(3,20,TRUE), b=1:10) +options(datatable.optimize = 0L) +test(1565.1, ans <- dt[, .N, by=a, verbose=TRUE], output="All optimizations are turned off") +options(datatable.optimize = 1L) +test(1565.2, dt[ , .N, by=a, verbose=TRUE], ans, output="lapply optimization is on, j unchanged") +options(datatable.optimize = Inf) +test(1565.3, dt[ , .N, by=a, verbose=TRUE], ans, output = "GForce optimized j to") # Fix for #1212 set.seed(123) @@ -8066,7 +8272,83 @@ test(1578.7, fread(f, skip=49L), data.table(V1=1:2, V2=3:4)) test(1578.8, fread(f, skip=47L, blank.lines.skip=TRUE), data.table(a=1:2, b=3:4)) test(1578.9, fread(f, skip=48L), data.table(V1=1:2, V2=3:4)) # start on blank line 49 and skip="auto" to first data row on line 50 -# test 1579 moved to optimize.Rraw +# gforce optimisations +dt = data.table(x = sample(letters, 300, TRUE), + i1 = sample(-10:10, 300, TRUE), + i2 = sample(c(-10:10, NA), 300, TRUE), + d1 = as.numeric(sample(-10:10, 300, TRUE)), + d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) +if (test_bit64) { + dt[, `:=`(d3 = as.integer64(sample(-10:10, 300, TRUE)))] + dt[, `:=`(d4 = as.integer64(sample(c(-10:10,NA), 300, TRUE)))] +} + +# make sure gforce is on 
+options(datatable.optimize=2L) + +# testing gforce::gmedian +test(1579.01, dt[, lapply(.SD, median), by=x], + dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x]) +test(1579.02, dt[, lapply(.SD, median, na.rm=TRUE), by=x], + dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x]) +test(1579.03, dt[, lapply(.SD, median), keyby=x], + dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x]) +test(1579.04, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], + dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x]) +test(1579.05, dt[, lapply(.SD, median), by=x, verbose=TRUE], + output = "GForce optimized") + +# testing gforce::ghead and gforce::gtail +# head(.SD, 1) and tail(.SD, 1) optimisation +test(1579.06, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x]) +test(1579.07, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x]) +test(1579.08, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x]) +test(1579.09, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x]) +test(1579.10, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x]) +test(1579.11, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x]) +test(1579.12, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x]) +test(1579.13, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x]) + +test(1579.14, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x]) +test(1579.15, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x]) +test(1579.16, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x]) +test(1579.17, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x]) +test(1579.18, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) +test(1579.19, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) +test(1579.20, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) +test(1579.21, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) +# 1579.22 tested gtail with n>1; now 1579.4+ below + +mysub <- function(x, n) x[n] +test(1579.23, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") +test(1579.24, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) +test(1579.25, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) +test(1579.26, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) +test(1579.27, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 +test(1579.28, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) + +# gforce head/tail for n>1, #5060 +set.seed(99) +DT = data.table(x = sample(letters[1:5], 20, TRUE), + y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly + i = sample(c(-2L,0L,3L,NA), 20, TRUE), + d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), + s = sample(c("foo","bar",NA), 20, TRUE), + l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) +if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] +options(datatable.optimize=2L) +test(1579.401, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize +test(1579.402, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") +test(1579.403, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") +test(1579.404, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") +test(1579.405, DT[, head(.SD,5), keyby=x, 
verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") +test(1579.406, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") +test(1579.407, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(1579.408, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") +test(1579.409, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +test(1579.410, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") + +options(datatable.optimize = Inf) # test for #1419, rleid doesn't remove names attribute x = c("a"=TRUE, "b"=FALSE) @@ -8074,12 +8356,104 @@ nx = copy(names(x)) r = rleid(x) test(1580, nx, names(x)) -# test 1581 moved to optimize.Rraw +# FR #971, partly addressed (only subsets in 'i') +# make sure GForce kicks in and the results are identical +dt = dt[, .(x, d1, d2)] +options(datatable.optimize=1L) + +test(1581.01, ans1 <- dt[x %in% letters[15:20], + c(.N, lapply(.SD, sum, na.rm=TRUE), + lapply(.SD, min, na.rm=TRUE), + lapply(.SD, max, na.rm=TRUE), + lapply(.SD, mean, na.rm=TRUE), + lapply(.SD, median, na.rm=TRUE) + ), by=x, verbose=TRUE], + output = "(GForce FALSE)") +options(datatable.optimize=2L) +test(1581.02, ans2 <- dt[x %in% letters[15:20], + c(.N, lapply(.SD, sum, na.rm=TRUE), + lapply(.SD, min, na.rm=TRUE), + lapply(.SD, max, na.rm=TRUE), + lapply(.SD, mean, na.rm=TRUE), + lapply(.SD, median, na.rm=TRUE) + ), by=x, verbose=TRUE], + output = "GForce optimized j") +test(1581.03, ans1, ans2) + +# subsets in 'i' for head and tail +options(datatable.optimize=1L) +test(1581.04, ans1 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], + output = "(GForce FALSE)") +options(datatable.optimize=2L) +test(1581.05, ans2 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], + output = "GForce optimized j") +test(1581.06, ans1, ans2) + +options(datatable.optimize=1L) +test(1581.07, ans1 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], + output = "(GForce FALSE)") +options(datatable.optimize=2L) +test(1581.08, ans2 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], + output = "GForce optimized j") +test(1581.09, ans1, ans2) + +options(datatable.optimize=1L) +test(1581.10, ans1 <- dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], + output = "(GForce FALSE)") +options(datatable.optimize=2L) +test(1581.11, ans2 <- dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], + output = "GForce optimized j") +test(1581.12, ans1, ans2) +options(datatable.optimize = Inf) + +# #3209 g[[ +options(datatable.optimize=1L) +test(1581.13, ans1 <- dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], + output = "(GForce FALSE)") +options(datatable.optimize=Inf) +test(1581.14, ans2 <- dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], + output = "GForce optimized j") +test(1581.15, ans1, ans2) +# also, block for non-atomic input, #4159 +dt = data.table(a=1:3) +dt[ , l := .(list(1, 2, 3))] +test(1581.16, dt[ , .(l = l[[1L]]), by=a, verbose=TRUE], + dt[ , l := unlist(l)], output='(GForce FALSE)') +# make sure not to apply when `[[` is applied to a nested call, #4413 +DT = data.table(f1=c("a","b"), f2=c("x","y")) +l = list(a = c(x = "ax", y = "ay"), b = c(x = "bx", y = "by")) +test(1581.17, DT[ , as.list(l[[f1]])[[f2]], by=c("f1","f2")], + data.table(f1 = c("a", "b"), f2 = c("x", "y"), V1 = c("ax", 
"by"))) +test(1581.18, DT[, v:=l[[f1]][f2], by=c("f1","f2")], + data.table(f1=c("a","b"), f2=c("x","y"), v=c("ax", "by"))) +# When the object being [[ is in parent.frame(), not x, +# need eval to have enclos=parent.frame(), #4612 +DT = data.table(id = c(1, 1, 2), value = c("a", "b", "c")) +DT0 = copy(DT) +fun = function(DT, tag = c("A", "B")) DT[, var := tag[[.GRP]], by = "id"] +fun(DT) +test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')]) # handle NULL value correctly #1429 test(1582, uniqueN(NULL), 0L) -# test 1583 moved to optimize.Rraw +# bug fix #1461 related to NaN not being recognized due to ISNA vs ISNAN at C level +# verbatim test from the original report: +options(datatable.optimize=Inf) # ensure gforce is on +DT = data.table( + C1 = c(rep("A", 4), rep("B",4), rep("C", 4)), + C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), + Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) +test(1583.1, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], + data.table(C1=c("A","A","B","B","C","C"), + C2=c("a","b","b","c","c","d"), + agg=c(1,4,5,8,9,10))) +# extra test with a size-1 group containing one NaN too +DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) +test(1583.2, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) +test(1583.3, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) +test(1583.4, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.5, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) # Fixed a minor bug in fread when blank.lines.skip=TRUE f1 <- function(x, f=TRUE, b=FALSE) fread(x, fill=f, blank.lines.skip=b, data.table=FALSE, logical01=FALSE) @@ -8199,7 +8573,32 @@ test(1592.2, names(setnames(DT, -1, c("m", "n"))), c("x", "m", "n")) # fix for #1513 test(1593, CJ(c(1,2,2), c(1,2,3)), data.table(V1=rep(c(1,2), c(3,6)), V2=c(1,2,3,1,1,2,2,3,3), key=c("V1", "V2"))) -# test 1594 moved to optimize.Rraw +# FR #523, var, sd and prod +options(datatable.optimize = Inf) # ensure gforce is on +DT = data.table(x=sample(5, 100, TRUE), + y1=sample(6, 100, TRUE), + y2=sample(c(1:10,NA), 100, TRUE), + z1=runif(100), + z2=sample(c(runif(10),NA,NaN), 100, TRUE)) +test(1594.01, DT[, lapply(.SD, var, na.rm=FALSE), by=x], DT[, lapply(.SD, stats::var, na.rm=FALSE), by=x]) +test(1594.02, DT[, lapply(.SD, var, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::var, na.rm=TRUE), by=x]) +test(1594.03, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gvar") +# coverage: default group .N=1 case +idx=DT[ , .I[1L], by=x]$V1 +out=data.table(x=DT[(idx), x], V1=NA_real_) +test(1594.05, DT[(idx), var(y1), by=x], out) +test(1594.06, DT[(idx), var(y1, na.rm=TRUE), by=x], out) +test(1594.07, DT[(idx), var(z1), by=x], out) +test(1594.08, DT[(idx), var(z1, na.rm=TRUE), by=x], out) + +test(1594.09, DT[, lapply(.SD, sd, na.rm=FALSE), by=x], DT[, lapply(.SD, stats::sd, na.rm=FALSE), by=x]) +test(1594.10, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) +test(1594.11, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gsd") + +test(1594.12, DT[, lapply(.SD, prod, na.rm=FALSE), by=x], DT[, lapply(.SD, base::prod, na.rm=FALSE), by=x]) +test(1594.13, DT[, lapply(.SD, prod, na.rm=TRUE), by=x], DT[, lapply(.SD, base::prod, na.rm=TRUE), by=x]) +test(1594.14, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gprod") + # FR #1517 dt1 = data.table(x=c(1,1,2), 
y=1:3) @@ -8822,21 +9221,24 @@ test(1629.07, dt[0][, .SD*v1, .SDcols=v2:v3], dt[0][, .SD, .SDcols=v2:v3]) dt2 = copy(dt) test(1629.08, dt2[, c("v2", "v3") := .SD*v1, .SDcols=v2:v3], dt[, .(grp, v1, v2=v2*v1, v3=v3*v1)]) # grouping operations -test(1629.09,optimize=c(1L,2L), dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) -ans1 = dt[, base::sum(v1), by=grp] +options(datatable.optimize = 1L) # no gforce +test(1629.09, dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) +ans1 = dt[, sum(v1), by=grp] ans2 = dt[, base::max(.SD), by=grp, .SDcols=v2:v3] -test(1629.10,optimize=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) -test(1629.11,optimize=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], +test(1629.10, dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) +test(1629.11, dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp]) -test(1629.12,optimize=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) -# test 1629.12 subsumes 1629.12 and 1629.13 for testing both levels +test(1629.12, dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) # gforce +options(datatable.optimize = Inf) # Inf +test(1629.13, dt[, c(v1=max(v1), lapply(.SD, min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) # even more complex, shouldn't run any optimisation dt[, v4 := v1/2] test(1629.14, dt[, c(.(v1=v1*min(v4)), lapply(.SD, function(x) x*max(v4))), by=grp, .SDcols=v2:v3], dt[, .(v1=v1*min(v4), v2=v2*max(v4), v3=v3*max(v4)), by=grp]) test(1629.15, copy(dt)[, c("a", "b", "c") := c(min(v1), lapply(.SD, function(x) max(x)*min(v1))), by=grp, .SDcols=v3:v4], copy(dt)[, c("a", "b", "c") := .(min(v1), max(v3)*min(v1), max(v4)*min(v1)), by=grp]) +options(datatable.optimize = Inf) # by=.EACHI and operations with 'i' test(1629.16, dt[.(c(2,3)), c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=.EACHI, .SDcols=v2:v3, on="grp"], dt[grp %in% 2:3, c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=grp, .SDcols=v2:v3]) @@ -8917,7 +9319,11 @@ test(1637.3, dt[, data.table(a, .SD), by = a], data.table(a=1,a=1,b=1)) test(1637.4, dt[, data.table(b, .SD), by = cumsum(a)], data.table(cumsum=1, b=1, b=1)) test(1637.5, dt[, data.table(a, b), by = cumsum(a)], data.table(cumsum=1, a=1, b=1)) -# test 1638 moved to optimize.Rraw +# when datatable.optimize<1, no optimisation of j should take place: +options(datatable.optimize=0L) +dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) +test(1638, dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off") +options(datatable.optimize=Inf) rm_all() @@ -11520,8 +11926,8 @@ test(1774.13, as.data.table(x), error = "Argument 'value.name' should not overla ## unsupported usage of as.data.table.array test(1774.14, as.data.table.array(as.matrix(x)), error="method should only be called for arrays with 3+") test(1774.15, as.data.table(x, value.name=NA), error="'value.name' must be scalar") -test(1774.16, as.data.table(x, sorted='a'), error="'sorted' must be TRUE or FALSE") -test(1774.17, as.data.table(x, na.rm='a'), error="'na.rm' must be TRUE or FALSE") +test(1774.16, as.data.table(x, sorted='a'), error="'sorted' must be scalar") 
+test(1774.17, as.data.table(x, na.rm='a'), error="'na.rm' must be scalar") # verify print.keys works DT1 <- data.table(a = 1:3, key = "a") @@ -12456,7 +12862,7 @@ test(1888.5, fsort(x), base::sort(x, na.last = FALSE), x = runif(1e3) test(1888.6, y<-fsort(x,verbose=TRUE), output="nth=.*Top 20 MSB counts") test(1888.7, !base::is.unsorted(y)) -test(1888.8, fsort(x,verbose=1), error="'verbose' must be TRUE or FALSE") +test(1888.8, fsort(x,verbose=1), error="verbose must be TRUE or FALSE") test(1888.9, fsort(c(1L, 2L, 3L, 4L), internal = TRUE), c(1L, 2L, 3L, 4L)) rm(x, y) @@ -12835,8 +13241,9 @@ DT[ , V1:=as.ordered(V1)] test(1918.3, DT[, min(V1)], structure(1L, .Label = lev, class = c("ordered", "factor"))) test(1918.4, DT[, max(V1)], structure(5L, .Label = lev, class = c("ordered", "factor"))) ## make sure GForce is activated -test(1918.5,optimize=Inf, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) -test(1918.6,optimize=Inf, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) +options(datatable.optimize = Inf) +test(1918.5, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) +test(1918.6, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) # as.ITime.character bug for NA handling #2940 test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime")) @@ -13246,9 +13653,9 @@ test(1962.011, uniqlist(list()), list(0L)) DT1 = data.table(a = 1:3, V = 'a') DT2 = data.table(a = 2:4, V = 'b') test(1962.012, merge(DT1, DT2, sort = 1+3i), - error = "'sort' must be TRUE or FALSE") + error = 'should be logical TRUE/FALSE') test(1962.013, merge(DT1, DT2, no.dups = 1+3i), - error = "'no.dups' must be TRUE or FALSE") + error = 'should be logical TRUE/FALSE') setDF(DT2) test(1962.014, merge(DT1, DT2), data.table(a = integer(0), V = character(0))) @@ -13279,7 +13686,7 @@ test(1962.023, frankv(x, na.last = c(TRUE, FALSE)), test(1962.024, frankv(x, cols = 'y'), error = 'x is a single vector') test(1962.025, frankv(list(x), cols = integer(0L)), - error = "x is a list, 'cols' cannot be 0-length") + error = "x is a list, 'cols' can not be 0-length") f = frankv(list(x), ties.method = 'random') test(1962.026, length(f) == 9L && identical(f[c(3:4, 6L, 8:9)], c(3L, 7L, 4L, 9L, 8L)) && @@ -13402,11 +13809,11 @@ DT = data.table( ) setDF(DT) test(1962.068, rollup(DT), error=base_messages$missing_dispatch_method) -test(1962.069, rollup.data.table(DT), error = 'must be a data.table') +test(1962.069, rollup.data.table(DT), error = 'must be a data.table object') test(1962.070, cube(DT), error=base_messages$missing_dispatch_method) -test(1962.071, cube.data.table(DT), error = 'must be a data.table') +test(1962.071, cube.data.table(DT), error = 'must be a data.table object') test(1962.072, groupingsets(DT), error=base_messages$missing_dispatch_method) -test(1962.073, groupingsets.data.table(DT), error = 'must be a data.table') +test(1962.073, groupingsets.data.table(DT), error = 'must be a data.table object') setDT(DT) test(1962.074, rollup(DT, by = 3L), error = "'by' must be a character vector") test(1962.075, rollup(DT, by = 'color', id = 3L), error = "'id' must be a logical scalar") @@ -13438,7 +13845,7 @@ setDF(DT) test(1962.085, dcast.data.table(DT), error = 'must be a data.table') setDT(DT) test(1962.086, dcast(DT, a ~ a, drop = NA), - error = 
"'drop' must be logical vector with no missing entries") + error = 'must be logical TRUE/FALSE') DT = data.table(a = c(1, 1, 2, 2), b = list(1, 2, 3, 4), c = c(4, 4, 2, 2)) test(1962.087, dcast(DT, a ~ b, value.var = 'b'), error = 'Columns specified in formula can not be of type list') @@ -13598,11 +14005,11 @@ test(1967.09, foverlaps(x, y, minoverlap = NA), test(1967.10, foverlaps(x, y, minoverlap = -5), error = 'minoverlap must be a positive integer') test(1967.11, foverlaps(x, y, which = integer(0L)), - error = "'which' must be TRUE or FALSE") + error = 'which must be a logical vector') test(1967.12, foverlaps(x, y, which = c(3, 4)), - error = "'which' must be TRUE or FALSE") + error = 'which must be a logical vector') test(1967.13, foverlaps(x, y, which = NA), - error = "'which' must be TRUE or FALSE") + error = 'which must be a logical vector') test(1967.14, foverlaps(x, y, nomatch = integer(0L)), error = 'nomatch must either be NA or NULL') test(1967.15, foverlaps(x, y, nomatch = c(3, 4)), @@ -13622,9 +14029,9 @@ test(1967.22, foverlaps(x, y, by.y = c(-1L, 0L)), test(1967.23, foverlaps(x, y, by.y = c(1L, 100L)), error = "Invalid numeric value for 'by.y'") test(1967.24, foverlaps(x, y, by.x = c(1 + 3i, 2 - 1i)), - error = "non-empty vector of column names or numbers is required for 'by.x'") + error = 'non-empty vector of column names or numbers is required for by.x') test(1967.25, foverlaps(x, y, by.y = c(1 + 3i, 2 - 1i)), - error = "non-empty vector of column names or numbers is required for 'by.y'") + error = 'non-empty vector of column names or numbers is required for by.y') test(1967.26, foverlaps(x, y, by.x = c('start', 'END')), error = "Elements listed in 'by.x' must be valid names") test(1967.27, foverlaps(x, y, by.x = c('start', 'start')), @@ -13719,13 +14126,17 @@ suppressWarnings(rm(`___data.table_internal_test_1967.68___`)) test(1967.68, setDT(`___data.table_internal_test_1967.68___`), error = 'Cannot find symbol') ### [.data.table verbosity & non-equi-join tests -test(1967.69,optimize=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') -# test 1967.69 subsumes 1967.69 and 1967.70 -test(1967.71,optimize=1L, x[order(a), .N, verbose = TRUE], 5L, +options(datatable.optimize = 0L) +verbose_output = capture.output(x[order(a), .N, verbose = TRUE]) +test(1967.69, !any(grepl('forder.c', verbose_output, fixed = TRUE))) +test(1967.70, any(grepl('[1] 5', verbose_output, fixed = TRUE))) +options('datatable.optimize' = 1L) +test(1967.71, x[order(a), .N, verbose = TRUE], 5L, output = "forder.c received 5 rows and 1 column") setkey(x) -test(1967.72,optimize=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, +test(1967.72, x[x, .N, on = 'a', verbose = TRUE], 5L, output = "on= matches existing key") +options(datatable.optimize = Inf) x = data.table( i1 = c(234L, 250L, 169L, 234L, 147L, 96L, 96L, 369L, 147L, 96L), @@ -13756,7 +14167,6 @@ DT = data.table(A=1:5, B=-3i, C=2147483647L) test(1968.2, storage.mode(DT$C), "integer") test(1968.3, DT[, sum(C), by=A%%2L], data.table(A=c(1L,0L), V1=c(6442450941, 4294967294)), warning="sum.*integer column.*more than type 'integer' can hold.*coerced to 'numeric'") -test(1968.35, DT[, sum(as.numeric(C)), by=A%%2L], data.table(A=c(1L,0L), V1=c(6442450941, 4294967294))) DT[3,C:=NA] test(1968.4, DT[, sum(C), by=A%%2L], data.table(A=c(1L,0L), V1=c(NA, 4294967294)), warning="coerced to 'numeric'") test(1968.5, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294967294, 4294967294)), warning="coerced to 'numeric'") @@ -13877,10 
+14287,14 @@ x <- as.array(1:5) test(1980, names(data.table(x)), "x") # crash when n="lead", #3354 +options(datatable.optimize=0L) +DT = data.table( id = 1:5 , val = letters[1:5] ) +test(1981.1, DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) +test(1981.2, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") +options(datatable.optimize=Inf) DT = data.table( id = 1:5 , val = letters[1:5] ) -test(1981.1,optimize=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) -test(1981.2,optimize=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") -# tests 1981.1 and 1981.2 subsume 1981.1, 1981.2, 1981.3 and 1981.4 for testing different levels +test(1981.3, DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) +test(1981.4, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") # 1982 moved to benchmark.Rraw, #5517 @@ -13912,7 +14326,8 @@ test(1984.081, DT[, sum(a), by=as.raw(0)], error="Column or expression.*1.*t test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)], # better error message, 4308 error="Column or expression.*2.*type 'list'.*not.*supported") test(1984.09, DT[, sum(a), by=.(1,1:2)], error="The items in the 'by' or 'keyby' list have lengths [1, 2]. Each must be length 10; the same length as there are rows in x (after subsetting if i is provided).") -test(1984.10,optimize=Inf, DT[ , 1, by = .(a %% 2), verbose = TRUE], +options('datatable.optimize' = Inf) +test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE], data.table(a = c(1, 0), V1 = c(1, 1)), output = 'Optimization is on but left j unchanged') DT[ , f := rep(1:2, each = 5)] @@ -14202,7 +14617,7 @@ test(2002.12, rbind(DT1, DT2, idcol='id'), data.table(id=integer(), a=logica #rbindlist coverage test(2003.1, rbindlist(list(), use.names=1), error="use.names= should be TRUE, FALSE, or not used [(]\"check\" by default[)]") -test(2003.2, rbindlist(list(), fill=1), error="'fill' must be TRUE or FALSE") +test(2003.2, rbindlist(list(), fill=1), error="fill should be TRUE or FALSE") test(2003.3, rbindlist(list(data.table(a=1:2), data.table(b=3:4)), fill=TRUE, use.names=FALSE), data.table(a=c(1:4))) test(2003.4, rbindlist(list(data.table(a=1:2,c=5:6), data.table(b=3:4)), fill=TRUE, use.names=FALSE), @@ -14221,7 +14636,7 @@ test(2003.82, rbind(y, x, fill=TRUE, use.names=TRUE), ans[2:1,]) test(2003.83, rbind(x, y, fill=TRUE, use.names=FALSE), ans) test(2003.84, rbind(y, x, fill=TRUE, use.names=FALSE), ans[2:1,]) # rbindlist ignore attributes #3911 -test(2003.85, rbindlist(list(), ignore.attr=1), error="'ignore.attr' must be TRUE or FALSE") +test(2003.85, rbindlist(list(), ignore.attr=1), error="ignore.attr should be TRUE or FALSE") # chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111 x1 = "fa\xE7ile" @@ -14655,8 +15070,8 @@ test(2037.2, names(DT), 'a') test(2037.3, foo(DT), output="data.table internal attributes", notOutput="data.table internal attributes.*data.table internal attributes") # `between` invalid args, and verbose #3516 -test(2038.01, between(1:5, 2, 4, incbounds=423), error="'incbounds' must be TRUE or FALSE") -test(2038.02, between(1:5, 2, 4, incbounds=NA), error="'incbounds' must be TRUE or FALSE") +test(2038.01, between(1:5, 2, 4, incbounds=423), error="incbounds must be TRUE or FALSE") +test(2038.02, between(1:5, 2, 4, incbounds=NA), error="incbounds must be TRUE 
or FALSE") old = options(datatable.verbose=TRUE) test(2038.03, between(1:5, 2L, 4L), output="between parallel processing of integer took") test(2038.04, between(1:5, rep(2L,5L), rep(4L, 5L)), output="between parallel processing of integer took") @@ -14795,12 +15210,16 @@ test(2042.4, DT[ , round(mean(DiffTime)), by=Group, verbose=TRUE], # gforce wrongly applied to external variable; #875 DT = data.table(x=INT(1,1,1,2,2), y=1:5) z = 1:5 -opt = c(Inf,1L,0L) -test(2043.1,optimize=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) -# test 2043.1 subsumes 2043.1, 2043.2 and 2043.3 for testing different levels -test(2043.4,optimize=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) +options(datatable.optimize = Inf) +test(2043.1, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) +options(datatable.optimize = 1L) +test(2043.2, DT[, list(mean(z), mean(y)), by=x], ans) +options(datatable.optimize = 0L) +test(2043.3, DT[, list(mean(z), mean(y)), by=x], ans) +options(datatable.optimize = Inf) +test(2043.4, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) z = 1:4 -test(2043.5,optimize=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z +test(2043.5, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z # test type coercion in joins, #2592 dt1 <- data.table(int = 1L:10L, @@ -15799,7 +16218,7 @@ if (test_bit64) { test(2082.07, between(letters[1:2], c("foo","bar"), c("bar")), c(FALSE,FALSE)) test(2082.08, between(letters[1:2], c("foo","bar"), c("bar"), check=TRUE), error="Item 1 of lower ('foo') is greater than item 1 of upper ('bar')") test(2082.09, between(as.raw(1:5), as.raw(3), as.raw(2), check=TRUE), error="Some lower>upper for this non-numeric and non-character type") -test(2082.10, between(1:3, 2, 4, check=NA), error="'check' must be TRUE or FALSE") +test(2082.10, between(1:3, 2, 4, check=NA), error="check must be TRUE or FALSE") # partial instantiation of integer64 column was creating NA_REAL, not INT64_MIN if (test_bit64) { @@ -15963,10 +16382,12 @@ test(2098.1, DT[do.call(order, mget(groups)), verbose=TRUE], ans<-data.table(id= test(2098.2, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, output=out) test(2098.3, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) test(2098.4, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) -test(2098.5,optimize=0L, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") -test(2098.6,optimize=0L, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") -test(2098.7,optimize=0L, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) -test(2098.8,optimize=0L, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) +old = options(datatable.optimize=0L) +test(2098.5, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") +test(2098.6, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") +test(2098.7, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) +test(2098.8, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) +options(old) # Error in update join when joining on factor, #3559 d1 <- data.table(fac = factor(letters[1:4]), char = letters[1:4], val = c(1L, NA, 3L, NA)) 
@@ -17277,9 +17698,12 @@ test(2194.4, endsWithAny(letters, 'e'), error="Internal error.*types or lengths test(2194.5, endsWithAny(NA_character_, 'a'), FALSE) test(2194.6, endsWithAny(character(), 'a'), error="Internal error.*types or lengths incorrect") # file used in encoding tests -needed_chars = c("\u0105", "\u017E", "\u016B", "\u012F", "\u0173", "\u0117", "\u0161", "\u0119") -txt = parse(text='readLines(testDir("issue_563_fread.txt"))') -test(2194.7, requires_utf8=needed_chars, endsWithAny(eval(txt), 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5 +txt = readLines(testDir("issue_563_fread.txt")) +local(if (eval(utf8_check_expr)) { + test(2194.7, endsWithAny(txt, 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5 +} else { + cat("Test 2194.7 skipped because it needs a UTF-8 locale.\n") +}) test(2194.8, endsWith('abcd', 'd'), error="Internal error.*use endsWithAny") # uniqueN(x, by=character()) was internal error, #4594 @@ -17388,14 +17812,16 @@ test(2207, dcast(DT, x~y, value.var="z"), data.table(x=1:3, a=c(1+6i, 3+4i, 5+2i # gmin/gmax for integer64, #4444 if (test_bit64) { DT = data.table(grp=c(1L, 1L, 1L, 2L), i64=as.integer64(c(NA, 1:3))) - test(2208.1,optimize=2L, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.2,optimize=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) - test(2208.3,optimize=2L, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.4,optimize=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) + old = options(datatable.optimize=2L) + test(2208.1, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.2, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) + test(2208.3, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.4, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) # create an all-NA group DT[, i64:=rev(i64)] - test(2208.7,optimize=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) - test(2208.8,optimize=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) + test(2208.7, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) + test(2208.8, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) + options(old) } # when user supplies dec=',' don't try sep=',', #4483 @@ -17620,16 +18046,17 @@ if (test_bit64) test(2219.2, DT[3, A:=as.integer64("4611686018427387906")], data # gforce improve coverage DT = data.table(g=1:2, i=c(NA, 1:4, NA), f=factor(letters[1:6]), l=as.list(1:6)) +options(datatable.optimize = 2L) funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") testnum = 0L for (fun in funs) { testnum = testnum + 1L - test(2220.0 + testnum*0.01, optimize=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="'na.rm' must be TRUE or FALSE", context=sprintf("fun=%s [na.rm='a']", fun)) + test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE", context=sprintf("fun=%s [na.rm='a']", fun)) testnum = testnum + 1L - test(2220.0 + testnum*0.01, optimize=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun), context=sprintf("fun=%s [factor]", fun)) + test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for 
factors.", fun), context=sprintf("fun=%s [factor]", fun)) } testnum = testnum + 1L -test(2220.0 + testnum*0.01, optimize=2L, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") +test(2220.0 + testnum*0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") # tables() error when called from inside a function(...), #5197 test(2221, (function(...) tables())(), output = "No objects of class data.table exist") @@ -17645,6 +18072,7 @@ test(2223.1, DT[.(4), nomatch=FALSE], data.table(A=integer(), key="A")) test(2223.2, DT[.(4), nomatch=NA_character_], data.table(A=4L, key="A")) # gshift, #5205 +options(datatable.optimize = 2L) set.seed(123) DT = data.table(x = sample(letters[1:5], 20, TRUE), y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly @@ -17666,7 +18094,7 @@ for (col in names(DT)[-1]) { for (type in c('lag', 'lead', 'shift', 'cyclic')) { # fill is tested by group in tests 2218.*; see comments in #5205 # sapply(sapply()) changed to for(for(for())) to save 29MiB, #5517 - test(2224.1+i/10000, optimize=2L, # 192 tests here when test_bit64=TRUE; 168 when FALSE + test(2224.1+i/10000, # 192 tests here when test_bit64=TRUE; 168 when FALSE EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)), ans[[i]], context=sprintf("col=%s, n=%s, type=%s", col, paste(n, collapse=","), type)) @@ -17756,8 +18184,86 @@ test(2230.12, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x test(2230.13, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, unk1=1L, unk2=2L, 3L, 4L), merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x`/`by.y`. `by` argument will be ignored.", "2 unnamed arguments.*2 unknown keyword arguments.*\\[unk1, unk2\\]")) -# tests 2231 moved to optimize.Rraw +# weighted.mean GForce optimized, #3977 +old = options(datatable.optimize=1L) +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2231.01, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce FALSE") +test(2231.02, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce FALSE") +test(2231.03, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce FALSE") +# multiple groups +DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.04, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") +test(2231.05, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") +test(2231.06, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce FALSE") +# (only x XOR w) containing NA +DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.07, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") +test(2231.08, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce FALSE") +test(2231.09, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.10, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (only x 
XOR w) containing NaN +DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.11, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce FALSE") +test(2231.12, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") +test(2231.13, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce FALSE") +test(2231.14, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (only x XOR w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.15, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") +test(2231.16, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") +test(2231.17, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.18, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# (x and w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.19, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.20, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.21, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") +test(2231.22, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") +# same as previous test cases but now GForce optimized +options(datatable.optimize=2L) +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2231.31, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce optimized j to") +test(2231.32, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce optimized j to") +test(2231.33, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce optimized j to") +# multiple groups +DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.34, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") +test(2231.35, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") +test(2231.36, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce optimized j to") +# (only x XOR w) containing NA +DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.37, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") +test(2231.38, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce optimized j 
to") +test(2231.39, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.40, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (only x XOR w) containing NaN +DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.41, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce optimized j to") +test(2231.42, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") +test(2231.43, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce optimized j to") +test(2231.44, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (only x XOR w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.45, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") +test(2231.46, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") +test(2231.47, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.48, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# (x and w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.49, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.50, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.51, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") +test(2231.52, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") +# let wrongly named arguments get lost in ellipsis #5543 +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2231.61, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output="GForce optimized j to") +test(2231.62, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce optimized j to") +test(2231.63, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) +test(2231.64, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) +options(old) # cols argument for unique.data.table, #5243 DT = data.table(g = rep(letters, 3), v1=1:78, v2=78:1) @@ -17769,18 +18275,17 @@ test(2232.3, unique(DT[1:26], by='g', cols='v1'), DT[1:26, !'v2']) test(2232.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)") # support := with GForce #1414 +options(datatable.optimize = 2L) DT = data.table(a=1:3,b=(1:9)/10) -opt = 0:2 -out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") 
-test(2233.01,optimize=opt, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output=out) +test(2233.01, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output="GForce optimized j to") # GForce returning full length -test(2233.02,optimize=opt, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output=out) +test(2233.02, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output="GForce optimized j to") # GForce neither returning 1 per group nor full length -test(2233.03,optimize=opt, DT[, v := head(b, 2L), a], error="Supplied .* items to be assigned to .* column 'v'.") +test(2233.03, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") # compare to non GForce version DT = data.table(a=1:3,b=(1:9)/10) -test(2233.04,optimize=opt, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output=out) -test(2233.05,optimize=opt, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output=out) +test(2233.04, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output="GForce optimized j to") +test(2233.05, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to") # with key and grouping by key DT = data.table(a=1:3,b=(1:9)/10, key="a") @@ -17851,23 +18356,27 @@ test(2233.38, copy(DT)[, val:=v[1L], keyby=.(A,B), verbose=TRUE], data.table(A=I set.seed(10) n = 100 a = data.table(id1=1:n, id2=sample(1:900,n,replace=TRUE), flag=sample(c(0,0,0,1),n,replace=TRUE)) -opt = c(0,Inf) -out = c("GForce FALSE", "GForce.*gsum") -B = copy(a) -A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle -test(2233.391,optimize=opt, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out, context=sprintf("optimize=%s [I]", format(opt))) # y=A dummy just to test output= -setorder(A, id1) -test(2233.392,optimize=opt, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out, context=sprintf("optimize=%s [II]", format(opt))) -test(2233.393,optimize=opt, !any(A[,t1!=t2])) -test(2233.394,optimize=opt, !any(A[, length(unique(t1))>1, by=id2]$V1), context=sprintf("optimize=%s [III]", format(opt))) -test(2233.395,optimize=opt, !any(A[, length(unique(t2))>1, by=id2]$V1), context=sprintf("optimize=%s [IV]", format(opt))) - +for (opt in c(0, Inf)) { + options(datatable.optimize=opt) + out = if (opt) "GForce.*gsum" else "GForce FALSE" + B = copy(a) + A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle + num_bump = (opt>0)/100 + test(2233.39+num_bump+0.001, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out, context=sprintf("optimize=%s [I]", format(opt))) # y=A dummy just to test output= + setorder(A, id1) + test(2233.39+num_bump+0.002, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out, context=sprintf("optimize=%s [II]", format(opt))) + test(2233.39+num_bump+0.003, !any(A[,t1!=t2])) + test(2233.39+num_bump+0.004, !any(A[, length(unique(t1))>1, by=id2]$V1), context=sprintf("optimize=%s [III]", format(opt))) + test(2233.39+num_bump+0.005, !any(A[, length(unique(t2))>1, by=id2]$V1), context=sprintf("optimize=%s [IV]", format(opt))) +} # test from #5337 n=4; k=2 mm = data.table(a = rep(1:k,n), b=seq_len(n*k), d=rep(1:n,k)) ans = copy(mm)[, e:=INT(NA,8,NA,12,NA,8,NA,12)] -test(2233.41,optimize=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", 
"GForce.*gsum")) -# test 2233.41 subsumes 2231.41 and 2231.42 for different optimization levels +options(datatable.optimize=0) +test(2233.41, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output="GForce FALSE") +options(datatable.optimize=Inf) +test(2233.42, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output="GForce.*gsum") # test from #5345 set.seed(1) DT = data.table( @@ -17878,21 +18387,32 @@ DT = data.table( ) load(testDir("test2233-43.Rdata")) # ans setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk -test(2233.43,optimize=c(0,Inf), options = list(datatable.verbose=TRUE), +test(2233.43, + options = list(datatable.verbose=TRUE, datatable.optimize=0), copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") ][, n_idT :=dim(.SD)[[1]], by=list(t, id) ][, sum_v2_id :=sum(v2), by=.(id) ][, sum_v1_idT:=sum(v1), by=c("id", "t") ][, sum_v1_id :=sum(v1), by=c("id")], ans, - output=c("GForce FALSE", "GForce.*gsum")) -# test 2233.43 subsumes 2231.43 and 2231.44 for different optimization levels + output="GForce FALSE") +test(2233.44, + options = list(datatable.verbose=TRUE, datatable.optimize=Inf), + copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") + ][, n_idT :=dim(.SD)[[1]], by=list(t, id) + ][, sum_v2_id :=sum(v2), by=.(id) + ][, sum_v1_idT:=sum(v1), by=c("id", "t") + ][, sum_v1_id :=sum(v1), by=c("id")], + ans, + output="GForce.*gsum") # optimized := with gforce functions that can return lists #5403 +old = options(datatable.verbose=TRUE) DT = data.table(grp=1:2, x=1:4) out = "Making each group and running j (GForce TRUE)" -test(2233.45, options=c(datatable.verbose=TRUE), copy(DT)[, c("y", "z") := .(shift(x, type="lag", n=1), shift(x, type="lead", n=1)), by=grp], data.table(grp=1:2, x=1:4, y=c(NA, NA, 1:2), z=c(3:4, NA, NA)), output=out) -test(2233.46, options=c(datatable.verbose=TRUE), copy(DT)[, l := shift(x, n=c(0, 0)), by=grp], data.table(grp=1:2, x=1:4, l=list(INT(1, 1), INT(2, 2), INT(3, 3), INT(4, 4))), output=out) -test(2233.47, options=c(datatable.verbose=TRUE), copy(DT)[, c("l1", "l2") := shift(x, n=c(-1, 1)), by=grp], data.table(grp=1:2, x=1:4, l1=c(3:4,NA,NA), l2=c(NA,NA,1:2)), output=out) +test(2233.45, copy(DT)[, c("y", "z") := .(shift(x, type="lag", n=1), shift(x, type="lead", n=1)), by=grp], data.table(grp=1:2, x=1:4, y=c(NA, NA, 1:2), z=c(3:4, NA, NA)), output=out) +test(2233.46, copy(DT)[, l := shift(x, n=c(0, 0)), by=grp], data.table(grp=1:2, x=1:4, l=list(INT(1, 1), INT(2, 2), INT(3, 3), INT(4, 4))), output=out) +test(2233.47, copy(DT)[, c("l1", "l2") := shift(x, n=c(-1, 1)), by=grp], data.table(grp=1:2, x=1:4, l1=c(3:4,NA,NA), l2=c(NA,NA,1:2)), output=out) +options(old) # support by=.I; #1732 DT = data.table(V1=1:5, V2=3:7, V3=5:1) @@ -18029,16 +18549,20 @@ test(2243.38, dt[, sd(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table( dt = data.table(x = c(2,2,1,1), y = 1:4, z=letters[1:4]) i=c(1,2) j=1L -opt = c(1L,2L) -out = c("GForce FALSE", "GForce TRUE") -test(2243.41,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") -# test 2243.41 subsumes 2243.41 and 2243.51 for different optimization levels -test(2243.42,optimize=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") -# test 2243.42 subsumes 2243.42 and 2243.52 for different optimization levels -test(2243.53,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) -test(2243.54,optimize=opt, 
options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) -test(2243.55,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") -test(2243.56,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") +test(2243.41, options=c(datatable.optimize=1L), dt[, .I[TRUE], x]$V1, 1:4) +test(2243.42, options=c(datatable.optimize=1L), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA))) +test(2243.51, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") +test(2243.52, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") +test(2243.53, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[1], x]$V1, c(1L, 3L), output="GForce TRUE") +test(2243.54, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[j], x]$V1, c(1L, 3L), output="GForce TRUE") +test(2243.55, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") +test(2243.56, options=list(datatable.optimize=2L, datatable.verbose=TRUE), + dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") DT = data.table(1) test(2244.1, DT[, `:=`(a=1, )], error="`:=`.*Did you forget a trailing comma\\?") @@ -18059,9 +18583,11 @@ test(2245.3, dt[1], data.table(foo = 1L, bar = 4L)) # Default in this environmen # data.table:: doesn't turn off GForce, #5942 DT = data.table(a = rep(1:5, 2L), b = 1:10) -test(2246.1, options=list(datatable.optimize=Inf, datatable.verbose=TRUE), DT[, data.table::shift(b), by=a], DT[, shift(b), by=a], output="GForce TRUE") -test(2246.2, options=list(datatable.optimize=Inf, datatable.verbose=TRUE), DT[, data.table::first(b), by=a], DT[, first(b), by=a], output="GForce TRUE") -test(2246.3, options=list(datatable.optimize=Inf, datatable.verbose=TRUE), DT[, data.table::last(b), by=a], DT[, last(b), by=a], output="GForce TRUE") +old = options(datatable.optimize=Inf, datatable.verbose=TRUE) +test(2246.1, DT[, data.table::shift(b), by=a], DT[, shift(b), by=a], output="GForce TRUE") +test(2246.2, DT[, data.table::first(b), by=a], DT[, first(b), by=a], output="GForce TRUE") +test(2246.3, DT[, data.table::last(b), by=a], DT[, last(b), by=a], output="GForce TRUE") +options(old) # 5392 split(x,f) works with formula f dt = data.table(x=1:4, y=factor(letters[1:2])) @@ -18169,14 +18695,12 @@ test(2252.2, dt[, let(b=2L)], error = "\\[ was called on a data.table.*not data. rm(.datatable.aware) # tests for trunc.char handling wide characters #5096 -local({ -accented_a = "\u0061\u0301" -ja_ichi = "\u4E00" -ja_ni = "\u4E8C" -ja_ko = "\u3053" -ja_n = "\u3093" -nc = c(accented_a, ja_ichi, ja_ni, ja_ko, ja_n) -if (utf8_check(nc)) { +local(if (eval(utf8_check_expr)) { + accented_a = "\u0061\u0301" + ja_ichi = "\u4E00" + ja_ni = "\u4E8C" + ja_ko = "\u3053" + ja_n = "\u3093" dots = "..." 
clean_regex = "^\\d+:\\s+" # removes row numbering from beginning of output # Tests for combining character latin a and acute accent, single row @@ -18223,7 +18747,7 @@ if (utf8_check(nc)) { test(2253.20, options=list(datatable.prettyprint.char = 1L), data.table(a = c("abc", NA)), output=" a\n1: a...\n2: ") } else { cat("Tests 2253* skipped because they need a UTF-8 locale.\n") -}}) +}) # allow 1-D matrix in j for consistency, #783 DT=data.table(a = rep(1:2, 3), b = 1:6) @@ -18361,7 +18885,14 @@ test(2262.6, set(null.data.table(), j=c("a","b"), value=list(1:2, 3:4)), dt3) test(2262.7, data.table(a=1, b=2)[, c("a", "b") := list(NULL, NULL)], null.data.table()) test(2262.8, data.table(a=1, b=2)[, c("a", "b") := list(NULL)], null.data.table()) -# test 2263 moved to optimize.Rraw +# GForce retains attributes in by arguments #5567 +dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) +test(2263.1, options=list(datatable.verbose=TRUE, datatable.optimize=0L), dt[, .N, b], data.table(b=dt$b, N=1L), output="GForce FALSE") +test(2263.2, options=list(datatable.verbose=TRUE, datatable.optimize=0L), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output="GForce FALSE") +test(2263.3, options=list(datatable.verbose=TRUE, datatable.optimize=0L), names(attributes(dt[, .N, b]$b)), c("class", "att"), output="GForce FALSE") +test(2263.4, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, b], data.table(b=dt$b, N=1L), output="GForce optimized j to") +test(2263.5, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output="GForce optimized j to") +test(2263.6, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), names(attributes(dt[, .N, b]$b)), c("class", "att"), output="GForce optimized j to") # tests for printing indices alongside data.tables NN = 200 @@ -18497,7 +19028,7 @@ test(2269.2, fread("x\n?\n \n", colClasses="POSIXct", na.strings="?"), dt) # Error found by revdep in #6284: mean(a,b) is valid, expr names() can be NULL DT = data.table(a = 1, b = 2) -test(2270,optimize=1L, DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") +test(2270, options=c(datatable.optimize=1L), DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") # Missing newline in verbose output -> harder to read DT1 = data.table(a=1:2) @@ -18809,7 +19340,94 @@ test(2282.08, rowwiseDT(A=,B=,1,2,C=,4), error="Header must be the first N argum ncols = 1e6 test(2282.09, rowwiseDT(A=,ncols), data.table(A=ncols)) -# test 2283 moved tests to optimize.Rraw +# named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 + +M <- as.data.table(mtcars) +M[, " " := hp] +M[, "." 
:= hp] + +sdnames <- setdiff(names(M), "cyl") +sdlist <- vector("list", length(sdnames)) +names(sdlist) <- sdnames + +for (opt in c(0, 1, 2)) { + test(2283 + opt/10 + 0.001, options=c(datatable.optimize=opt), + names(M[, c(m=lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(m=sdlist))), + context=sprintf("optimize=%s [I]", format(opt))) + test(2283 + opt/10 + 0.002, options=c(datatable.optimize=opt), + names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", "Mpg", sdnames), + context=sprintf("optimize=%s [II]", format(opt))) + test(2283 + opt/10 + 0.003, options=c(datatable.optimize=opt), + names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), + c("cyl", "Mpg", names(c(m=sdlist))), + context=sprintf("optimize=%s [III]", format(opt))) + test(2283 + opt/10 + 0.004, options=c(datatable.optimize=opt), + names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), + c("cyl", "mpg", names(c(mpg=sdlist))), + context=sprintf("optimize=%s [IV]", format(opt))) + test(2283 + opt/10 + 0.005, options=c(datatable.optimize=opt), + names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", "V1", sdnames), + context=sprintf("optimize=%s [V]", format(opt))) + test(2283 + opt/10 + 0.006, options=c(datatable.optimize=opt), + names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), + c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L)), + context=sprintf("optimize=%s [VI]", format(opt))) + test(2283 + opt/10 + 0.007, options=c(datatable.optimize=opt), + names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), + c("cyl", sdnames, sdnames), + context=sprintf("optimize=%s [VII]", format(opt))) + test(2283 + opt/10 + 0.008, options=c(datatable.optimize=opt), + names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), + c("cyl", names(c(mean=sdlist, sum=sdlist))), + context=sprintf("optimize=%s [VIII]", format(opt))) + test(2283 + opt/10 + 0.009, options=c(datatable.optimize=opt), + names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), + c("cyl", sdnames, names(c(sum=sdlist))) , + context=sprintf("optimize=%s [IX]", format(opt))) + test(2283 + opt/10 + 0.010, options=c(datatable.optimize=opt), + names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), + c("cyl", names(c(" "=sdlist, "."=sdlist))), + context=sprintf("optimize=%s [X]", format(opt))) + test(2283 + opt/10 + 0.011, options=c(datatable.optimize=opt), + names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(a=0, b=0))), sdnames), + context=sprintf("optimize=%s [XI]", format(opt))) + test(2283 + opt/10 + 0.012, options=c(datatable.optimize=opt), + names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0, 0))), sdnames), + context=sprintf("optimize=%s [XII]", format(opt))) + test(2283 + opt/10 + 0.013, options=c(datatable.optimize=opt), + names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0, b=0, 0))), sdnames), + context=sprintf("optimize=%s [XIII]", format(opt))) + test(2283 + opt/10 + 0.014, options=c(datatable.optimize=opt), + names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0))), sdnames), + context=sprintf("optimize=%s [XIV]", format(opt))) + test(2283 + opt/10 + 0.015, options=c(datatable.optimize=opt), + names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames), + context=sprintf("optimize=%s [XV]", format(opt))) + test(2283 + opt/10 + 0.016, 
options=c(datatable.optimize=opt), + names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames), + context=sprintf("optimize=%s [XVI]", format(opt))) + test(2283 + opt/10 + 0.017, options=c(datatable.optimize=opt), + names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "b", "vs", "am"), + context=sprintf("optimize=%s [XVII]", format(opt))) + test(2283 + opt/10 + 0.018, options=c(datatable.optimize=opt), + names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "b", "vs", "am"), + context=sprintf("optimize=%s [XVIII]", format(opt))) + test(2283 + opt/10 + 0.019, options=c(datatable.optimize=opt), + names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "V2", "b", "vs", "am"), + context=sprintf("optimize=%s [XIX]", format(opt))) +} # Confusing behavior with DT[, min(var):max(var)] #2069 DT = data.table(t = c(2L, 1L, 3L), a=0, b=1) @@ -20285,20 +20903,18 @@ x = data.table(a=1, b=2L) y = data.table(c=1.5, d=1L) test(2297.31, y[x, on=.(c == a, d == a), nomatch=NULL], output="Empty data.table (0 rows and 3 cols): c,d,b") -local(if (utf8_check(c("\u00e4", "\u00f6", "\u00fc"))) { +local(if (eval(utf8_check_expr)) { # rbindlist(l, use.names=TRUE) should handle different colnames encodings #5452 x = data.table(a = 1, b = 2, c = 3) y = data.table(x = 4, y = 5, z = 6) # a-umlaut, o-umlaut, u-umlaut - eval(parse(text = ' # eval(parse()) defers parsing to runtime; see utf8_check description - setnames(x , c("\u00e4", "\u00f6", "\u00fc")) - setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")) - test(2298.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) - test(2298.2, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1))) - set(y, j="\u00e4", value=NULL) - test(2298.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) - test(2298.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(NA,1))) - ')) + setnames(x , c("\u00e4", "\u00f6", "\u00fc")) + setnames(y , iconv(c("\u00f6", "\u00fc", "\u00e4"), from = "UTF-8", to = "latin1")) + test(2298.1, rbindlist(list(x,y), use.names=TRUE), data.table("\u00e4"=c(1,6), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) + test(2298.2, rbindlist(list(y,x), use.names=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(6,1))) + set(y, j="\u00e4", value=NULL) + test(2298.3, rbindlist(list(x,y), use.names=TRUE, fill=TRUE), data.table("\u00e4"=c(1,NA), "\u00f6"=c(2,4), "\u00fc"=c(3,5))) + test(2298.4, rbindlist(list(y,x), use.names=TRUE, fill=TRUE), data.table("\u00f6"=c(4,2), "\u00fc"=c(5,3), "\u00e4"=c(NA,1))) } else { cat("Tests 2298.* skipped because they need a UTF-8 locale.\n") }) @@ -21064,13 +21680,13 @@ if (base::getRversion() >= "4.3.0") { ## follow up of #7213, see #7321 } # fwrite: allow dec=',' with single column, #7227 -test(2337.1, fwrite(data.table(1), dec=","), output = "V1\n1") +test(2337.1, fwrite(data.table(1), dec=","), NULL) if (base::getRversion() >= "4.0.0") { # rely on stopifnot(named = ...) 
for correct message test(2337.2, fwrite(data.table(0.1, 0.2), dec=",", sep=","), error = "dec and sep must be distinct") } -test(2337.3, fwrite(data.table(c(0.1, 0.2)), dec=",", sep="\t"), output = "V1\n0,1\n0,2") -test(2337.4, fwrite(data.table(a=numeric(), b=numeric()), dec=",", sep=","), output = "a,b") -test(2337.5, fwrite(data.table(a=numeric()), dec=",", sep=","), output = "a") +test(2337.3, is.null(fwrite(data.table(c(0.1, 0.2)), dec=",", sep="\t"))) +test(2337.4, is.null(fwrite(data.table(a=numeric(), b=numeric()), dec=",", sep=","))) +test(2337.5, is.null(fwrite(data.table(a=numeric()), dec=",", sep=","))) # 2864 force decimal points for whole numbers in numeric columns dd = data.table(x=c(1, 2, 3)) @@ -21407,125 +22023,3 @@ local({ test(2357.1, fread(f), DT) test(2357.2, fread(paste0("file://", f)), DT) }) - -#7571 issue for na.rm on int64 -if (test_bit64) local({ - # integer64 + GForce grouped sum with na.rm = FALSE - # Example 1 from issue: ids 1:8, 9, 9; three leading NAs then 4:10 - dt_short = data.table( - id = c(1:8, 9, 9), - value = c(rep(NA_integer64_, 3L), as.integer64(4:10)) - ) - test(2358.1, options=c(datatable.optimize=2L), - dt_short[, sum(value, na.rm = FALSE), by = id]$V1, - as.integer64(c(NA, NA, NA, 4:8, 19)) - ) - - # Example 2 from issue: ids in pairs, same values; checks multi-row groups - dt_short2 = data.table( - id = rep(1:5, each = 2L), - value = c(rep(NA_integer64_, 3L), as.integer64(4:10)) - ) - test(2358.2, options=c(datatable.optimize=2L), - dt_short2[, sum(value, na.rm = FALSE), by = id]$V1, - as.integer64(c(NA, NA, 11, 15, 19)) - ) - - # Test mean for integer64 with NA - dt_mean = data.table( - id = c(1,1,2,2,3,3), - value = as.integer64(c(NA, NA, NA, 20000000, 5, 3)) - ) - test(2358.3, options=c(datatable.optimize=2L), - dt_mean[, mean(value, na.rm=FALSE), by = id]$V1, - c(NA, NA, 4) - ) - - # GForce sum vs base::sum for integer64 - DT = data.table(id = sample(letters, 1000, TRUE), value = as.integer64(sample(c(1:100, NA), 1000, TRUE))) - gforce = DT[, .(gforce_sum = sum(value)), by=id] - base = DT[, .(true_sum = base::sum(value)), by=id] - merged = merge(gforce, base, by="id", all=TRUE) - test(2358.4, options=c(datatable.optimize=2L), - merged$gforce_sum, merged$true_sum - ) - - # GForce mean vs base::mean for integer64 - DTm = data.table(id = sample(letters, 1000, TRUE), value = as.integer64(sample(c(1:100, NA), 1000, TRUE))) - gforce_m = DTm[, .(gforce_mean = mean(value)), by=id] - base_m = DTm[, .(true_mean = base::mean(value)), by=id] - merged_m = merge(gforce_m, base_m, by="id", all=TRUE) - test(2358.5, options=c(datatable.optimize=2L), - merged$gforce_mean, merged$true_mean - ) -}) - -# 7407 Test for fread() handling \x1A (ASCII SUB) at end of input -txt = paste0("foo\n", strrep("a", 4096 * 100), "\x1A") -test(2359.1, nchar(fread(txt)$foo), 409600L) - -# rowwiseDT() valid and invalid handling of complex objects #7219 -test(2360.1, rowwiseDT(x =, y =, 1, 2, 3, 4), data.table(x = c(1, 3), y = c(2, 4))) -test(2360.2, rowwiseDT(x =, func =, - 1, list(\(x) x + 1), - 2, list(function(z) z * 2)), - data.table(x = c(1, 2), func = list(\(x) x + 1, function(z) z * 2))) -test(2360.3, rowwiseDT(x =, func =, 1, \(x) x + 1), - error = "Column 'func' is type 'function'. Non-atomic, non-list objects must be wrapped in list\\(\\)") -test(2360.4, rowwiseDT(x =, expr =, 1, quote(a + b)), - error = "Column 'expr' is type 'call'. 
Non-atomic, non-list objects must be wrapped in list\\(\\)") -test(2360.5, rowwiseDT(x =, plist =, 1, as.pairlist(list(123))), - error = "Column 'plist' is type 'pairlist'. Non-atomic, non-list objects must be wrapped in list\\(\\)") - -# setattr() must not crash for out-of-bounds factor indices when fixing duplicate levels, #7595 -test(2361.1, setattr(factor(c(1, NA), levels = 1), "levels", c("1", "1")), factor(c(1, NA))) -test(2361.2, setattr(structure(c(-999L, 999L), class = "factor", levels = "a"), "levels", c("b", "b")), factor(c(NA, NA), levels = "b")) - -# gforce should also work with Map in j #5336 -# conversions should not turn gforce off #2934 -# lapply gforce should also work without .SD #5032 -# support arithmetic in j with gforce #3815 -out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") -# unwrap type conversions -dt = data.table(a=1:4, b=1:2) -test(2362.01, optimize=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) -test(2362.02, optimize=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) -test(2362.03, optimize=0:2, dt[, max(as.integer(as.integer(as.integer(a)))), by=b, verbose=TRUE], data.table(b=1:2, V1=3:4), output=out) -# Map in j -dt = data.table(a=1:4, b=1:2) -test(2362.11, optimize=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) -test(2362.12, optimize=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") -# lapply without .SD -dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) -test(2362.13, optimize=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") -test(2362.14, optimize=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") -test(2362.15, optimize=0:2, dt[, Map(sum, .SD), by=b, .SDcols=c("a","c"), verbose=TRUE], dt[, lapply(.SD, sum), by=b, .SDcols=c("a","c")], output=out) -dt = data.table(a=1:2, b=1, c=1:4) -test(2362.21, optimize=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) -test(2362.22, optimize=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) -# support arithmetic in j -dt = data.table(a=1:4, b=1:2) -test(2362.31, optimize=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) -test(2362.32, optimize=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) -test(2362.33, optimize=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) -test(2362.34, optimize=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) -test(2362.35, optimize=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) -test(2362.36, optimize=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") -test(2362.37, optimize=0:2, dt[, sum(a) %% 2, b, verbose=TRUE], output=out) -test(2362.38, optimize=0:2, dt[, sum(a) %/% 2, b, verbose=TRUE], output=out) -test(2362.39, optimize=0:2, dt[, -sum(a), b, verbose=TRUE], output=out) -test(2362.40, optimize=0:2, dt[, .(sum(a)-sum(b)), b, verbose=TRUE], output="GForce FALSE") -# mix cases of the above -dt = data.table(a=1:4, b=1:2) -test(2362.41, optimize=0:2, dt[, sum(as.numeric(a)) + mean(as.integer(a)), by=b, verbose=TRUE], output=out) -test(2362.42, optimize=0:2, dt[, Map(sum, .SD), by=b, .SDcols="a", verbose=TRUE], output=out) -test(2362.43, optimize=0:2, dt[, 
lapply(list(as.numeric(a)), sum), by=b, verbose=TRUE], output="GForce FALSE") -test(2362.44, optimize=0:2, dt[, sum(a) + as.numeric("5"), by=b, verbose=TRUE], data.table(b=1:2, V1=c(9, 11)), output="GForce FALSE") -test(2362.45, optimize=0:2, dt[, sum(a[a > 2]), by=b, verbose=TRUE], output="GForce FALSE") -dt = data.table(a=1:4, b=1:2, c=2:5) -test(2362.46, optimize=0:2, dt[, .(sum(a) + sum(c)), by=b, verbose=TRUE], output=out) -# coverage and edge cases for lapply(.SD, ...) -dt = data.table(a=1:4, b=1:2) -test(2362.51, optimize=0:2, dt[, c(list()), b, verbose=TRUE], data.table(b=integer(0L)), output="GForce FALSE") -test(2362.52, optimize=0:2, dt[, c(lapply(.SD, sum), list()), b, verbose=TRUE], output=out) -test(2362.53, optimize=0:2, dt[, list(lapply(.SD, sum), list()), b, verbose=TRUE], output="GForce FALSE") diff --git a/man/data.table.Rd b/man/data.table.Rd index cfdcb27068..a674ecccb0 100644 --- a/man/data.table.Rd +++ b/man/data.table.Rd @@ -181,7 +181,7 @@ data.table(\dots, keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFac \item{env}{ List or an environment, passed to \code{\link{substitute2}} for substitution of parameters in \code{i}, \code{j} and \code{by} (or \code{keyby}). Use \code{verbose} to preview constructed expressions. For more details see \href{../doc/datatable-programming.html}{\code{vignette("datatable-programming")}}. } - \item{showProgress}{ \code{TRUE} (default when \code{interactive()}) shows a progress indicator with estimated time to completion for lengthy "by" operations, updating every 3 seconds. An integer value controls the update interval in seconds (minimum 3). \code{FALSE} disables the progress indicator. } + \item{showProgress}{ \code{TRUE} shows progress indicator with estimated time to completion for lengthy "by" operations. } } \details{ \code{data.table} builds on base \R functionality to reduce 2 types of time:\cr diff --git a/man/nafill.Rd b/man/nafill.Rd index e0d3397027..1304c8b67a 100644 --- a/man/nafill.Rd +++ b/man/nafill.Rd @@ -37,11 +37,6 @@ x = c(1, NA, NaN, 3, NaN, NA, 4) nafill(x, "locf") nafill(x, "locf", nan=NaN) -# works for factors -x = gl(3, 2, 10) -is.na(x) = 1:2 -nafill(x, "nocb") - # fill= applies to any leftover NA nafill(c(NA, x), "locf") nafill(c(NA, x), "locf", fill=0) diff --git a/man/setkey.Rd b/man/setkey.Rd index f15373c94e..96e293fd28 100644 --- a/man/setkey.Rd +++ b/man/setkey.Rd @@ -110,7 +110,7 @@ reference. \references{ \url{https://en.wikipedia.org/wiki/Radix_sort}\cr \url{https://en.wikipedia.org/wiki/Counting_sort}\cr - \url{https://stereopsis.com/radix.html}\cr + \url{http://stereopsis.com/radix.html}\cr \url{https://codercorner.com/RadixSortRevisited.htm}\cr \url{https://cran.r-project.org/package=bit64}\cr \url{https://github.com/Rdatatable/data.table/wiki/Presentations} diff --git a/man/setorder.Rd b/man/setorder.Rd index b4a346cf18..c810048d4e 100644 --- a/man/setorder.Rd +++ b/man/setorder.Rd @@ -113,7 +113,7 @@ If you require a copy, take a copy first (using \code{DT2 = copy(DT)}). 
See \references{ \url{https://en.wikipedia.org/wiki/Radix_sort}\cr \url{https://en.wikipedia.org/wiki/Counting_sort}\cr - \url{https://stereopsis.com/radix.html}\cr + \url{http://stereopsis.com/radix.html}\cr \url{https://codercorner.com/RadixSortRevisited.htm}\cr \url{https://medium.com/basecs/getting-to-the-root-of-sorting-with-radix-sort-f8e9240d4224} } diff --git a/man/test.Rd b/man/test.Rd index 651ef1d359..19bd9d4f9c 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -8,8 +8,7 @@ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, output = NULL, notOutput = NULL, ignore.warning = NULL, - options = NULL, env = NULL, context = NULL, - requires_utf8 = FALSE, optimize = NULL) + options = NULL, env = NULL, context = NULL) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } @@ -24,8 +23,6 @@ test(num, x, y = TRUE, \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } \item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } \item{context}{ String, default \code{NULL}. Used to provide context where this is useful, e.g. in a test run in a loop where we can't just search for the test number. } -\item{requires_utf8}{ \code{FALSE} (default), \code{TRUE}, or a character string. When set, the test is skipped if UTF-8 characters cannot be represented in the native encoding. Use \code{TRUE} for default UTF-8 test characters or provide a custom string of test characters. } -\item{optimize}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=optimize)} set accordingly. All optimization levels must pass the test for the overall test to pass. If no \code{y} is supplied, the results from the different levels are compared to each other for equality. If a \code{y} is supplied, the results from each level are compared to \code{y}. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. 
diff --git a/src/assign.c b/src/assign.c index 5901d5a152..849cb08f2a 100644 --- a/src/assign.c +++ b/src/assign.c @@ -52,7 +52,7 @@ void setselfref(SEXP x) { */ static int _selfrefok(SEXP x, Rboolean checkNames, Rboolean verbose) { - SEXP v, p, tag, prot; + SEXP v, p, tag, prot, names; v = getAttrib(x, SelfRefSymbol); if (v==R_NilValue || TYPEOF(v)!=EXTPTRSXP) { // .internal.selfref missing is expected and normal for i) a pre v1.7.8 data.table loaded @@ -70,11 +70,11 @@ static int _selfrefok(SEXP x, Rboolean checkNames, Rboolean verbose) { if (!isNull(p)) internal_error(__func__, ".internal.selfref ptr is neither NULL nor R_NilValue"); // # nocov tag = R_ExternalPtrTag(v); if (!(isNull(tag) || isString(tag))) internal_error(__func__, ".internal.selfref tag is neither NULL nor a character vector"); // # nocov + names = getAttrib(x, R_NamesSymbol); prot = R_ExternalPtrProtected(v); if (TYPEOF(prot) != EXTPTRSXP) // Very rare. Was error(_(".internal.selfref prot is not itself an extptr")). return 0; // # nocov ; see http://stackoverflow.com/questions/15342227/getting-a-random-internal-selfref-error-in-data-table-for-r - if (!checkNames) return x == R_ExternalPtrAddr(prot); - return getAttrib(x, R_NamesSymbol) == tag; + return checkNames ? names==tag : x==R_ExternalPtrAddr(prot); } static Rboolean selfrefok(SEXP x, Rboolean verbose) { // for readability @@ -180,7 +180,7 @@ SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose) SEXP names, klass; // klass not class at request of pydatatable because class is reserved word in C++, PR #3129 R_len_t l, tl; if (isNull(dt)) error(_("alloccol has been passed a NULL dt")); - if (TYPEOF(dt) != VECSXP) error(_("dt passed to %s isn't type VECSXP"), "alloccol"); + if (TYPEOF(dt) != VECSXP) error(_("dt passed to alloccol isn't type VECSXP")); klass = getAttrib(dt, R_ClassSymbol); if (isNull(klass)) error(_("dt passed to alloccol has no class attribute. Please report result of traceback() to data.table issue tracker.")); l = LENGTH(dt); @@ -222,7 +222,7 @@ int checkOverAlloc(SEXP x) SEXP alloccolwrapper(SEXP dt, SEXP overAllocArg, SEXP verbose) { if (!IS_TRUE_OR_FALSE(verbose)) - error(_("'%s' must be TRUE or FALSE"), "verbose"); + error(_("%s must be TRUE or FALSE"), "verbose"); int overAlloc = checkOverAlloc(overAllocArg); SEXP ans = PROTECT(alloccol(dt, length(dt)+overAlloc, LOGICAL(verbose)[0])); @@ -256,103 +256,6 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) { return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0])); } -struct attrib_name_ctx { - hashtab *indexNames; // stores a 1 for every CHARSXP index name in use, 0 for removed - R_xlen_t indexNamesLen; // how much memory to allocate for the hash? - SEXP index; // attr(DT, "index") - SEXP assignedNames; // STRSXP vector of variable names just assigned - bool verbose; -}; - -// Mark each CHARSXP attribute name with a 1 inside the hash, or count them to find out the allocation size. -static SEXP getOneAttribName(SEXP key, SEXP val, void *ctx_) { - (void)val; - struct attrib_name_ctx *ctx = ctx_; - if (ctx->indexNames) - hash_set(ctx->indexNames, PRINTNAME(key), 1); - else - ctx->indexNamesLen++; - return NULL; -} - -// For a given index, find out if it sorts a column that has just been assigned. If so, shorten the index (if an equivalent one doesn't already exist) or remove it altogether. 
-static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) { - const struct attrib_name_ctx *ctx = ctx_; - - hashtab *indexNames = ctx->indexNames; - SEXP index = ctx->index, assignedNames = ctx->assignedNames; - R_xlen_t indexLength = xlength(value); - bool verbose = ctx->verbose; - - const char *tc1, *c1; - tc1 = c1 = CHAR(PRINTNAME(tag)); // the index name; e.g. "__col1__col2" - - if (*tc1!='_' || *(tc1+1)!='_') { - // fix for #1396 - if (verbose) { - Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1); - } - setAttrib(index, tag, R_NilValue); - return NULL; - } - - tc1 += 2; // tc1 always marks the start of a key column - if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov - - void *vmax = vmaxget(); - // check the position of the first appearance of an assigned column in the index. - // the new index will be truncated to this position. - size_t newKeyLength = strlen(c1); - char *s4 = R_alloc(newKeyLength + 3, 1); - memcpy(s4, c1, newKeyLength); - memcpy(s4 + newKeyLength, "__", 3); - - for(int i = 0; i < xlength(assignedNames); i++){ - const char *tc2 = CHAR(STRING_ELT(assignedNames, i)); - void *vmax2 = vmaxget(); - size_t tc2_len = strlen(tc2); - char *s5 = R_alloc(tc2_len + 5, 1); //4 * '_' + \0 - memcpy(s5, "__", 2); - memcpy(s5 + 2, tc2, tc2_len); - memcpy(s5 + 2 + tc2_len, "__", 3); - tc2 = strstr(s4, s5); - if(tc2 && (tc2 - s4 < newKeyLength)){ // new column is part of key; match is before last match - newKeyLength = tc2 - s4; - } - vmaxset(vmax2); - } - - s4[newKeyLength] = '\0'; // truncate the new key to the new length - if(newKeyLength == 0){ // no valid key column remains. Drop the key - setAttrib(index, tag, R_NilValue); - hash_set(indexNames, PRINTNAME(tag), 0); - if (verbose) { - Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); - } - } else if(newKeyLength < strlen(c1)) { - SEXP s4Str = PROTECT(mkChar(s4)); - if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372) - !hash_lookup(indexNames, s4Str, 0)) { // index with shortened name not present yet - setAttrib(index, installChar(s4Str), value); - hash_set(indexNames, PRINTNAME(tag), 0); - setAttrib(index, tag, R_NilValue); - hash_set(indexNames, s4Str, 1); - if (verbose) - Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4+2); - } else { // indexLength > 0 || shortened name present already - // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) - // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. 
- setAttrib(index, tag, R_NilValue); - hash_set(indexNames, PRINTNAME(tag), 0); - if (verbose) - Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); - } - UNPROTECT(1); // s4Str - } //else: index is not affected by assign: nothing to be done - vmaxset(vmax); - return NULL; -} - int *_Last_updated = NULL; SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) @@ -361,13 +264,14 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // newcolnames : add these columns (if any) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign - R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum; - SEXP targetcol, nullint, colnam, tmp, key, index, assignedNames; + R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; + SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames, indexNames; bool verbose=GetVerbose(); int ndelete=0; // how many columns are being deleted - int *buf; + const char *c1, *tc1, *tc2; + int *buf, indexNo; if (isNull(dt)) error(_("assign has been passed a NULL dt")); - if (TYPEOF(dt) != VECSXP) error(_("dt passed to %s isn't type VECSXP"), "assign"); + if (TYPEOF(dt) != VECSXP) error(_("dt passed to assign isn't type VECSXP")); if (islocked(dt)) error(_(".SD is locked. Updating .SD by reference using := or set are reserved for future use. Use := in j directly. Or use copy(.SD) as a (slow) last resort, until shallow() is exported.")); @@ -645,17 +549,93 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) } index = getAttrib(dt, install("index")); if (index != R_NilValue) { - struct attrib_name_ctx ctx = { 0, }; - R_mapAttrib(index, getOneAttribName, &ctx); // how many attributes? - hashtab *h = hash_create(ctx.indexNamesLen); - PROTECT(h->prot); - ctx.indexNames = h; - R_mapAttrib(index, getOneAttribName, &ctx); // now remember the names - ctx.index = index; - ctx.assignedNames = assignedNames; - ctx.verbose = verbose; - R_mapAttrib(index, fixIndexAttrib, &ctx); // adjust indices as needed - UNPROTECT(1); // h + s = ATTRIB(index); + indexNo = 0; + // get a vector with all index names + PROTECT(indexNames = allocVector(STRSXP, xlength(s))); protecti++; + while(s != R_NilValue){ + SET_STRING_ELT(indexNames, indexNo, PRINTNAME(TAG(s))); + indexNo++; + s = CDR(s); + } + s = ATTRIB(index); // reset to first element + indexNo = 0; + while(s != R_NilValue) { + a = TAG(s); + indexLength = xlength(CAR(s)); + tc1 = c1 = CHAR(PRINTNAME(a)); // the index name; e.g. "__col1__col2" + if (*tc1!='_' || *(tc1+1)!='_') { + // fix for #1396 + if (verbose) { + Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1); + } + setAttrib(index, a, R_NilValue); + indexNo++; + s = CDR(s); + continue; // with next index + } + tc1 += 2; // tc1 always marks the start of a key column + if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov + // check the position of the first appearance of an assigned column in the index. + // the new index will be truncated to this position. 
+ char *s4 = malloc(strlen(c1) + 3); + if (!s4) { + internal_error(__func__, "Couldn't allocate memory for s4"); // # nocov + } + memcpy(s4, c1, strlen(c1)); + memset(s4 + strlen(c1), '\0', 1); + strcat(s4, "__"); // add trailing '__' to newKey so we can search for pattern '__colName__' also at the end of the index. + int newKeyLength = strlen(c1); + for(int i = 0; i < xlength(assignedNames); i++){ + tc2 = CHAR(STRING_ELT(assignedNames, i)); + char *s5 = malloc(strlen(tc2) + 5); //4 * '_' + \0 + if (!s5) { + free(s4); // # nocov + internal_error(__func__, "Couldn't allocate memory for s5"); // # nocov + } + memset(s5, '_', 2); + memset(s5 + 2, '\0', 1); + strcat(s5, tc2); + strcat(s5, "__"); + tc2 = strstr(s4, s5); + if(tc2 == NULL){ // column is not part of key + free(s5); + continue; + } + if(tc2 - s4 < newKeyLength){ // new column match is before last match + newKeyLength = tc2 - s4; + } + free(s5); + } + memset(s4 + newKeyLength, '\0', 1); // truncate the new key to the new length + if(newKeyLength == 0){ // no valid key column remains. Drop the key + setAttrib(index, a, R_NilValue); + SET_STRING_ELT(indexNames, indexNo, NA_STRING); + if (verbose) { + Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); + } + } else if(newKeyLength < strlen(c1)) { + SEXP s4Str = PROTECT(mkString(s4)); + if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372) + LOGICAL(chin(s4Str, indexNames))[0] == 0) {// index with shortened name not present yet + SET_TAG(s, install(s4)); + SET_STRING_ELT(indexNames, indexNo, mkChar(s4)); + if (verbose) + Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4 + 2); + } else { // indexLength > 0 || shortened name present already + // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) + // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. + setAttrib(index, a, R_NilValue); + SET_STRING_ELT(indexNames, indexNo, NA_STRING); + if (verbose) + Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); + } + UNPROTECT(1); // s4Str + } //else: index is not affected by assign: nothing to be done + free(s4); + indexNo ++; + s = CDR(s); + } } if (ndelete) { // delete any columns assigned NULL (there was a 'continue' earlier in loop above) @@ -1210,9 +1190,9 @@ SEXP allocNAVectorLike(SEXP x, R_len_t n) { SEXP setcharvec(SEXP x, SEXP which, SEXP newx) { int w; - if (!isString(x)) error(_("'%s' must be a character vector"), "x"); + if (!isString(x)) error(_("x must be a character vector")); if (!isInteger(which)) error(_("'which' must be an integer vector")); - if (!isString(newx)) error(_("'%s' must be a character vector"), "new"); + if (!isString(newx)) error(_("'new' must be a character vector")); if (LENGTH(newx)!=LENGTH(which)) error(_("'new' is length %d. 
Should be the same as length of 'which' (%d)"),LENGTH(newx),LENGTH(which)); for (int i=0; i #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT @@ -106,11 +103,6 @@ } # define R_resizeVector(x, newlen) R_resizeVector_(x, newlen) #endif -// TODO(R>=4.6.0): remove the SVN revision check -#if R_VERSION < R_Version(4, 6, 0) || R_SVN_REVISION < 89194 -# define BACKPORT_MAP_ATTRIB -# define R_mapAttrib(x, fun, ctx) R_mapAttrib_(x, fun, ctx) -#endif // init.c extern SEXP char_integer64; @@ -351,9 +343,6 @@ SEXP R_allocResizableVector_(SEXPTYPE type, R_xlen_t maxlen); SEXP R_duplicateAsResizable_(SEXP x); void R_resizeVector_(SEXP x, R_xlen_t newlen); #endif -#ifdef BACKPORT_MAP_ATTRIB -SEXP R_mapAttrib_(SEXP x, SEXP (*fun)(SEXP key, SEXP val, void *ctx), void *ctx); -#endif SEXP is_direct_child(SEXP pids); // types.c diff --git a/src/dogroups.c b/src/dogroups.c index 7fd1b956e9..5200d33f36 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -3,8 +3,6 @@ #include #include -static SEXP anySpecialAttribute(SEXP key, SEXP val, void *ctx); - static bool anySpecialStatic(SEXP x, hashtab * specials) { // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd // Static because these are like C static arrays which are the same memory for each group; e.g., dogroups @@ -41,7 +39,7 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) { // with PR#4164 started to copy input list columns too much. Hence PR#4655 in v1.13.2 moved that copy here just where it is needed. // Currently the marker is negative truelength. These specials are protected by us here and before we release them // we restore the true truelength for when R starts to use vector truelength. - SEXP list_el; + SEXP attribs, list_el; const int n = length(x); // use length() not LENGTH() because isNewList() is true for NULL if (n==0) @@ -55,29 +53,20 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) { list_el = VECTOR_ELT(x,i); if (anySpecialStatic(list_el, specials)) return true; - if (R_mapAttrib(list_el, anySpecialAttribute, specials)) - return true; // #4936 + for(attribs = ATTRIB(list_el); attribs != R_NilValue; attribs = CDR(attribs)) { + if (anySpecialStatic(CAR(attribs), specials)) + return true; // #4936 + } } } return false; } -static SEXP anySpecialAttribute(SEXP key, SEXP val, void *specials) { - (void)key; - return anySpecialStatic(val, specials) ? R_NilValue : NULL; -} - -static SEXP findRowNames(SEXP key, SEXP val, void *data) { - (void)data; - if (key == R_RowNamesSymbol) return val; - return NULL; -} - SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg) { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; int nprotect=0; - SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, RHS, target, source; + SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source; Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; const bool verbose = LOGICAL(verboseArg)[0]==1; double tstart=0, tblock[10]={0}; int nblock[10]={0}; // For verbose printing, tstart is updated each block @@ -94,13 +83,12 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX ngrpcols = length(grpcols); nrowgroups = length(VECTOR_ELT(groups,0)); // fix for longstanding FR/bug, #495. 
E.g., DT[, c(sum(v1), lapply(.SD, mean)), by=grp, .SDcols=v2:v3] resulted in error.. the idea is, 1) we create .SDall, which is normally == .SD. But if extra vars are detected in jexp other than .SD, then .SD becomes a shallow copy of .SDall with only .SDcols in .SD. Since internally, we don't make a copy, changing .SDall will reflect in .SD. Hopefully this'll workout :-). - SEXP SDall = PROTECT(R_getVar(install(".SDall"), env, false)); nprotect++; // PROTECT for rchk - SEXP SD = PROTECT(R_getVar(install(".SD"), env, false)); nprotect++; + SEXP SDall = PROTECT(findVar(install(".SDall"), env)); nprotect++; // PROTECT for rchk + SEXP SD = PROTECT(findVar(install(".SD"), env)); nprotect++; - int updateTime = INTEGER(showProgressArg)[0]; - const bool showProgress = updateTime > 0 && ngrp > 1; // showProgress only if more than 1 group + const bool showProgress = LOGICAL(showProgressArg)[0]==1 && ngrp > 1; // showProgress only if more than 1 group double startTime = (showProgress) ? wallclock() : 0; // For progress printing, startTime is set at the beginning - double nextTime = (showProgress) ? startTime + MAX(updateTime, 3) : 0; // wait at least 3 seconds before starting to print progress + double nextTime = (showProgress) ? startTime+3 : 0; // wait 3 seconds before printing progress hashtab * specials = hash_create(3 + ngrpcols + xlength(SDall)); // .I, .N, .GRP plus columns of .BY plus SDall PROTECT(specials->prot); nprotect++; @@ -125,12 +113,12 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX error("!length(bynames)[%d]==length(groups)[%d]==length(grpcols)[%d]", length(bynames), length(groups), length(grpcols)); // # notranslate // TO DO: check this check above. - N = PROTECT(R_getVar(install(".N"), env, false)); nprotect++; // PROTECT for rchk + N = PROTECT(findVar(install(".N"), env)); nprotect++; // PROTECT for rchk hash_set(specials, N, -1); // marker for anySpecialStatic(); see its comments - GRP = PROTECT(R_getVar(install(".GRP"), env, false)); nprotect++; + GRP = PROTECT(findVar(install(".GRP"), env)); nprotect++; hash_set(specials, GRP, -1); // marker for anySpecialStatic(); see its comments - iSD = PROTECT(R_getVar(install(".iSD"), env, false)); nprotect++; // 1-row and possibly no cols (if no i variables are used via JIS) - xSD = PROTECT(R_getVar(install(".xSD"), env, false)); nprotect++; + iSD = PROTECT(findVar(install(".iSD"), env)); nprotect++; // 1-row and possibly no cols (if no i variables are used via JIS) + xSD = PROTECT(findVar(install(".xSD"), env)); nprotect++; R_len_t maxGrpSize = 0; const int *ilens = INTEGER(lens), n=LENGTH(lens); for (R_len_t i=0; i