Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.Rproj.user
.Rhistory
.RData
.DS_Store
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ URL: https://github.com/makeyourownmaker/pmlblite
Imports: utils
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.0.9000
RoxygenNote: 7.1.0
80 changes: 28 additions & 52 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -1,63 +1,39 @@

#' Names of available classification data sets
#'
#' A list of the names of available classification data sets
#'
#' Names of all available datasets
#'
#' A list of the names of available datasets
#'
#' @source \url{https://github.com/EpistasisLab/penn-ml-benchmarks}
"classification_dataset_names"
"dataset_names"

#' Summary statistics for the classification data sets
#'
#' @format A data frame with 15 variables:
#' \describe{
#' \item{MajorityClassSize:}{Number of instances in majority class of target variable}
#' \item{MinorityClassSize:}{Number of instances in minority class of target variable}
#' \item{NumberOfClasses:}{Number of classes in target variable}
#' \item{ImbalanceMetric:}{Imbalance metric, where zero means that the dataset is perfectly balanced and the higher the value, the more imbalanced the dataset}
#' \item{NumberOfFeatures:}{Total number of features (equal to number of columns)}
#' \item{NumberOfBinaryFeatures:}{Number of binary features}
#' \item{NumberOfIntegerFeatures:}{Number of integer features}
#' \item{NumberOfFloatFeatures:}{Number of float features}
#' \item{NumberOfInstances:}{Number of data observations (equal to number of rows)}
#' \item{NumberOfInstancesWithMissingValues:}{Number of instances with missing values (always 0)}
#' \item{NumberOfMissingValues:}{Number of missing values (always 0)}
#' \item{NumberOfNumericFeatures:}{Number of numeric features}
#' \item{NumberOfSymbolicFeatures:}{Number of symbolic features}
#' \item{name:}{Dataset name}
#' \item{status:}{All datasets are currently categorised as 'active'}
#' }
#'
#' Names of available classification datasets
#'
#' A list of the names of available classification datasets
#'
#' @source \url{https://github.com/EpistasisLab/penn-ml-benchmarks}
"classification_summary"
"classification_dataset_names"

#' Names of available regression data sets
#'
#' A list of the names of available regression data sets
#'
#' Names of available regression datasets
#'
#' A list of the names of available regression datasets
#'
#' @source \url{https://github.com/EpistasisLab/penn-ml-benchmarks}
"regression_dataset_names"

#' Summary statistics for the regression data sets
#'
#' @format A data frame with 13 variables:
#' Summary statistics for all datasets
#'
#' @format A data frame with 10 variables:
#' \describe{
#' \item{did:}{OpenML identifier (see \url{https://www.openml.org/})}
#' \item{MajorityClassSize:}{Number of instances in majority class of target variable (always -1)}
#' \item{MaxNominalAttDistinctValues:}{Maximum number of distinct values among attributes of the nominal type}
#' \item{MinorityClassSize:}{Number of instances in minority class of target variable (always -1)}
#' \item{NumberOfClasses:}{Number of classes in target variable}
#' \item{NumberOfFeatures:}{Total number of features (equal to number of columns)}
#' \item{NumberOfInstances:}{Number of data observations (equal to number of rows)}
#' \item{NumberOfInstancesWithMissingValues:}{Number of instances with missing values (always 0)}
#' \item{NumberOfMissingValues:}{Number of missing values (always 0)}
#' \item{NumberOfNumericFeatures:}{Number of numeric features}
#' \item{NumberOfSymbolicFeatures:}{Number of symbolic features}
#' \item{name:}{Dataset name}
#' \item{status:}{All datasets are currently categorised as 'active'}
#' \item{dataset:}{Dataset name}
#' \item{n_instances:}{Number of data observations (equal to number of rows)}
#' \item{n_features:}{Total number of features (number of columns - 1)}
#' \item{n_binary_features:}{Number of binary features}
#' \item{n_categorical_features:}{Number of categorical features}
#' \item{n_continuous_features:}{Number of continuous features}
#' \item{n_classes:}{Number of classes in target variable}
#' \item{endpoint_type:}{Value type of endpoint/target (can be binary, categorical or continuous)}
#' \item{imbalance_metric:}{Imbalance metric, where zero means that the dataset is perfectly balanced and the higher the value, the more imbalanced the dataset}
#' \item{problem_type:}{Type of problem/task. Can be classification or regression.}
#' }
#'
#' For further details, see \url{https://www.openml.org/}
#'
#' @source \url{https://github.com/EpistasisLab/penn-ml-benchmarks}
"regression_summary"

"summary_stats"
37 changes: 16 additions & 21 deletions R/pmlblite.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#' # Features and labels in single data frame
#' iris <- fetch_data('iris')
#' iris
#'
#'
#' # Features and labels stored in separate data structures
#' iris <- fetch_data('iris', return_X_y=TRUE)
#' iris$x # data frame
Expand All @@ -26,22 +26,17 @@ fetch_data <- function(dataset_name, return_X_y=FALSE, local_cache_dir=NA) {
GITHUB_URL <- 'https://github.com/EpistasisLab/penn-ml-benchmarks/raw/master/datasets'
SUFFIX <- '.tsv.gz'

if ( dataset_name %in% classification_dataset_names ) {
data_type <- 'classification'
} else if ( dataset_name %in% regression_dataset_names ) {
data_type <- 'regression'
} else {
stop("'dataset_name', ", dataset_name, " not found in PMLB.\n * Check spelling, capitalisation etc.", call.=FALSE)
if (!dataset_name %in% dataset_names){
stop("'dataset_name' ", dataset_name, " not found in PMLB.\n * Check spelling, capitalisation etc.", call.=FALSE)
}

if ( return_X_y != TRUE && return_X_y != FALSE ) {
stop("'return_X_y' must be TRUE or FALSE:\n * return_X_y is ", return_X_y, ".", call.=FALSE)
stop("'return_X_y' must be TRUE or FALSE:\n * return_X_y is ", return_X_y, ".", call.=FALSE)
}

dataset_url <- paste0(GITHUB_URL, '/',
data_type, '/',
dataset_url <- paste0(GITHUB_URL, '/',
dataset_name, '/',
dataset_name,
dataset_name,
SUFFIX)

if ( is.na(local_cache_dir) ) {
Expand All @@ -57,7 +52,7 @@ fetch_data <- function(dataset_name, return_X_y=FALSE, local_cache_dir=NA) {
}

dataset_path <- file.path(local_cache_dir, paste0(dataset_name, SUFFIX))

# read file from cache
if ( file.exists(dataset_path) ) {
dataset <- utils::read.csv( dataset_path,
Expand All @@ -78,32 +73,32 @@ fetch_data <- function(dataset_name, return_X_y=FALSE, local_cache_dir=NA) {
x <- dataset[, names(dataset) != "target"]
y <- dataset$target
dataset <- list(x=x, y=y)
}
}

return(dataset)
}



#' pmlblite: R interface to the Penn Machine Learning Benchmarks data repository
#' pmlblite: R interface to the Penn Machine Learning Benchmarks data repository
#'
#' The \href{https://github.com/EpistasisLab/penn-ml-benchmarks}{PMLB} repository contains a curated collection of data sets for evaluating and
#' comparing machine learning algorithms.
#' These data sets cover a range of applications, and include binary/multi-class classification problems and regression problems,
#' The \href{https://github.com/EpistasisLab/penn-ml-benchmarks}{PMLB} repository contains a curated collection of data sets for evaluating and
#' comparing machine learning algorithms.
#' These data sets cover a range of applications, and include binary/multi-class classification problems and regression problems,
#' as well as combinations of categorical, ordinal, and continuous features. There are approximately 290 data sets included in the PMLB repository
#' and there are no missing values in these data sets.
#'
#'
#' This R library includes summaries of the classification and regression data sets but does NOT
#' include any of the PMLB data sets. The data sets can be downloaded using the \code{\link{fetch_data}} function which
#' is similar to the corresponding PMLB python function.
#'
#' is similar to the corresponding PMLB python function.
#'
#' See \code{\link{fetch_data}} and \code{\link{summary_stats}} for usage examples and further information.
#'
#' If you use PMLB in a scientific publication, please consider citing the following paper:
#'
#' Randal S. Olson, William La Cava, Patryk Orzechowski, Ryan J. Urbanowicz, and Jason H. Moore (2017).
#'
#' PMLB: a large benchmark suite for machine learning evaluation and comparison
#' PMLB: a large benchmark suite for machine learning evaluation and comparison
#'
#' https://biodatamining.biomedcentral.com/articles/10.1186/s13040-017-0154-4
#'
Expand Down
121 changes: 0 additions & 121 deletions data-raw/Regression_datasets_pmlb.tsv

This file was deleted.

Loading