makeyourownmaker · trangdata · Aug 29, 2020 · Aug 29, 2020
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 .Rproj.user
 .Rhistory
 .RData
+.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -12,4 +12,4 @@ URL: https://github.com/makeyourownmaker/pmlblite
 Imports: utils
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 6.1.0.9000
+RoxygenNote: 7.1.0
diff --git a/R/data.R b/R/data.R
@@ -1,63 +1,39 @@
-
-#' Names of available classification data sets
-#' 
-#' A list of the names of available classification data sets
-#' 
+#' Names of all available datasets
+#'
+#' A list of the names of available datasets
+#'
 #' @source \url{https://github.com/EpistasisLab/penn-ml-benchmarks}
-"classification_dataset_names"
+"dataset_names"
 
-#' Summary statistics for the classification data sets
-#' 
-#' @format A data frame with 15 variables:
-#' \describe{
-#'   \item{MajorityClassSize:}{Number of instances in majority class of target variable}
-#'   \item{MinorityClassSize:}{Number of instances in minority class of target variable}
-#'   \item{NumberOfClasses:}{Number of classes in target variable}
-#'   \item{ImbalanceMetric:}{Imbalance metric, where zero means that the dataset is perfectly balanced and the higher the value, the more imbalanced the dataset}
-#'   \item{NumberOfFeatures:}{Total number of features (equal to number of columns)}
-#'   \item{NumberOfBinaryFeatures:}{Number of binary features}
-#'   \item{NumberOfIntegerFeatures:}{Number of integer features}
-#'   \item{NumberOfFloatFeatures:}{Number of float features}
-#'   \item{NumberOfInstances:}{Number of data observations (equal to number of rows)}
-#'   \item{NumberOfInstancesWithMissingValues:}{Number of instances with missing values (always 0)}
-#'   \item{NumberOfMissingValues:}{Number of missing values (always 0)}
-#'   \item{NumberOfNumericFeatures:}{Number of numeric features}
-#'   \item{NumberOfSymbolicFeatures:}{Number of symbolic features}
-#'   \item{name:}{Dataset name}
-#'   \item{status:}{All datasets are currently categorised as 'active'}
-#' }
-#' 
+#' Names of available classification datasets
+#'
+#' A list of the names of available classification datasets
+#'
 #' @source \url{https://github.com/EpistasisLab/penn-ml-benchmarks}
-"classification_summary"
+"classification_dataset_names"
 
-#' Names of available regression data sets
-#' 
-#' A list of the names of available regression data sets
-#' 
+#' Names of available regression datasets
+#'
+#' A list of the names of available regression datasets
+#'
 #' @source \url{https://github.com/EpistasisLab/penn-ml-benchmarks}
 "regression_dataset_names"
 
-#' Summary statistics for the regression data sets
-#' 
-#' @format A data frame with 13 variables:
+#' Summary statistics for all datasets
+#'
+#' @format A data frame with 10 variables:
 #' \describe{
-#'   \item{did:}{OpenML identifier (see \url{https://www.openml.org/)}}
-#'   \item{MajorityClassSize:}{Number of instances in majority class of target variable (always -1)}
-#'   \item{MaxNominalAttDistinctValues:}{Maximum number of distinct values among attributes of the nominal type}
-#'   \item{MinorityClassSize:}{Number of instances in minority class of target variable (always -1)}
-#'   \item{NumberOfClasses:}{Number of classes in target variable}
-#'   \item{NumberOfFeatures:}{Total number of features (equal to number of columns)}
-#'   \item{NumberOfInstances:}{Number of data observations (equal to number of rows)}
-#'   \item{NumberOfInstancesWithMissingValues:}{Number of instances with missing values (always 0)}
-#'   \item{NumberOfMissingValues:}{Number of missing values (always 0)}
-#'   \item{NumberOfNumericFeatures:}{Number of numeric features}
-#'   \item{NumberOfSymbolicFeatures:}{Number of symbolic features}
-#'   \item{name:}{Dataset name}
-#'   \item{status:}{All datasets are currently categorised as 'active'}
+#'   \item{dataset:}{Dataset name}
+#'   \item{n_instances:}{Number of data observations (equal to number of rows)}
+#'   \item{n_features:}{Total number of features (number of columns - 1)}
+#'   \item{n_binary_features:}{Number of binary features}
+#'   \item{n_categorical_features:}{Number of categorical features}
+#'   \item{n_continuous_features:}{Number of continuous features}
+#'   \item{n_classes:}{Number of classes in target variable}
+#'   \item{endpoint_type:}{Value type of endpoint/target (can be binary, categorical or continuous)}
+#'   \item{imbalance_metric:}{Imbalance metric, where zero means that the dataset is perfectly balanced and the higher the value, the more imbalanced the dataset}
+#'   \item{problem_type:}{Type of problem/task. Can be classification or regression.}
 #' }
 #'
-#' For further details, see \url{https://www.openml.org/}
-#' 
 #' @source \url{https://github.com/EpistasisLab/penn-ml-benchmarks}
-"regression_summary"
-
+"summary_stats"
diff --git a/R/pmlblite.R b/R/pmlblite.R
@@ -16,7 +16,7 @@
 #'  # Features and labels in single data frame
 #'  iris <- fetch_data('iris')
 #'  iris
-#'  
+#'
 #'  # Features and labels stored in separate data structures
 #'  iris <- fetch_data('iris', return_X_y=TRUE)
 #'  iris$x # data frame
@@ -26,22 +26,17 @@ fetch_data  <- function(dataset_name, return_X_y=FALSE, local_cache_dir=NA) {
     GITHUB_URL <- 'https://github.com/EpistasisLab/penn-ml-benchmarks/raw/master/datasets'
     SUFFIX     <- '.tsv.gz'
 
-    if ( dataset_name %in% classification_dataset_names ) {
-        data_type <- 'classification'
-    } else if ( dataset_name %in% regression_dataset_names ) {
-        data_type <- 'regression'
-    } else {
-        stop("'dataset_name', ", dataset_name, " not found in PMLB.\n * Check spelling, capitalisation etc.", call.=FALSE)
+    if (!dataset_name %in% dataset_names){
+      stop("'dataset_name' ", dataset_name, " not found in PMLB.\n * Check spelling, capitalisation etc.", call.=FALSE)
     }
 
     if ( return_X_y != TRUE && return_X_y != FALSE ) {
-        stop("'return_X_y' must be TRUE or FALSE:\n * return_X_y is ", return_X_y, ".", call.=FALSE)
+      stop("'return_X_y' must be TRUE or FALSE:\n * return_X_y is ", return_X_y, ".", call.=FALSE)
     }
 
-    dataset_url <- paste0(GITHUB_URL,   '/', 
-                          data_type,    '/',
+    dataset_url <- paste0(GITHUB_URL,   '/',
                           dataset_name, '/',
-                          dataset_name, 
+                          dataset_name,
                           SUFFIX)
 
     if ( is.na(local_cache_dir) ) {
@@ -57,7 +52,7 @@ fetch_data  <- function(dataset_name, return_X_y=FALSE, local_cache_dir=NA) {
       }
 
       dataset_path <- file.path(local_cache_dir, paste0(dataset_name, SUFFIX))
-      
+
       # read file from cache
       if ( file.exists(dataset_path) ) {
         dataset <- utils::read.csv( dataset_path,
@@ -78,32 +73,32 @@ fetch_data  <- function(dataset_name, return_X_y=FALSE, local_cache_dir=NA) {
       x <- dataset[, names(dataset) != "target"]
       y <- dataset$target
       dataset <- list(x=x, y=y)
-    } 
+    }
 
     return(dataset)
 }
 
 
 
-#' pmlblite: R interface to the Penn Machine Learning Benchmarks data repository 
+#' pmlblite: R interface to the Penn Machine Learning Benchmarks data repository
 #'
-#' The \href{https://github.com/EpistasisLab/penn-ml-benchmarks}{PMLB} repository contains a curated collection of data sets for evaluating and 
-#' comparing machine learning algorithms. 
-#' These data sets cover a range of applications, and include binary/multi-class classification problems and regression problems, 
+#' The \href{https://github.com/EpistasisLab/penn-ml-benchmarks}{PMLB} repository contains a curated collection of data sets for evaluating and
+#' comparing machine learning algorithms.
+#' These data sets cover a range of applications, and include binary/multi-class classification problems and regression problems,
 #' as well as combinations of categorical, ordinal, and continuous features.  There are approximately 290 data sets included in the PMLB repository
 #' and there are no missing values in these data sets.
-#' 
+#'
 #' This R library includes summaries of the classification and regression data sets but does NOT
 #' include any of the PMLB data sets.  The data sets can be downloaded using the \code{\link{fetch_data}} function which
-#' is similar to the corresponding PMLB python function.  
-#' 
+#' is similar to the corresponding PMLB python function.
+#'
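-#' See \code{\link{fetch_data}}, \code{\link{classification_summary}} and \code{\link{regression_summary}} for usage examples and further information.
+#' See \code{\link{fetch_data}} and \code{\link{summary_stats}} for usage examples and further information.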
 #'
 #' If you use PMLB in a scientific publication, please consider citing the following paper:
 #'
 #' Randal S. Olson, William La Cava, Patryk Orzechowski, Ryan J. Urbanowicz, and Jason H. Moore (2017).
 #'
-#' PMLB: a large benchmark suite for machine learning evaluation and comparison 
+#' PMLB: a large benchmark suite for machine learning evaluation and comparison
 #'
 #' https://biodatamining.biomedcentral.com/articles/10.1186/s13040-017-0154-4
 #'

diff --git a/data-raw/Regression_datasets_pmlb.tsv b/data-raw/Regression_datasets_pmlb.tsv