diff --git a/pufferlib/gp_torch.py b/pufferlib/gp_torch.py new file mode 100644 index 0000000000..bc4a258a18 --- /dev/null +++ b/pufferlib/gp_torch.py @@ -0,0 +1,140 @@ +""" +Drop-in replacement for GPyTorch exact GP training, CUDA implementation only. +""" + +import numpy as np +import torch +import torch.nn as nn + +from pufferlib import _C + +class _MLLFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, params, backend): + ctx.backend = backend + return params.new_tensor(backend.log_marginal_likelihood) + + @staticmethod + def backward(ctx, grad_output): + grads = torch.tensor(ctx.backend.mll_grad(), + dtype=grad_output.dtype, device=grad_output.device) + return grad_output * grads, None + + +def _np32(x): + if isinstance(x, torch.Tensor): + x = x.detach().cpu().numpy() + return np.ascontiguousarray(x, dtype=np.float32) + + +class GaussianProcess(nn.Module): + def __init__(self, dim, capacity, + lengthscale=1.0, outputscale=1.0, noise=1e-2, offset=1.0, + use_cuda=True): + super().__init__() + self._backend = _C.GaussianProcess(dim=dim, capacity=capacity, + lengthscale=lengthscale, outputscale=outputscale, + noise=noise, offset=offset) + self.raw_lengthscale = nn.Parameter( + torch.tensor(np.asarray(self._backend.raw_lengthscale), dtype=torch.float64)) + self.raw_outputscale = nn.Parameter( + torch.tensor(self._backend.raw_outputscale, dtype=torch.float64)) + self.raw_noise = nn.Parameter( + torch.tensor(self._backend.raw_noise, dtype=torch.float64)) + self.raw_offset = nn.Parameter( + torch.tensor(self._backend.raw_offset, dtype=torch.float64)) + + @property + def lengthscale(self): + return np.asarray(self._backend.lengthscale) + + @property + def outputscale(self): return self._backend.outputscale + + @property + def noise(self): return self._backend.noise + + @property + def offset(self): return self._backend.offset + + @property + def log_marginal_likelihood(self): return self._backend.log_marginal_likelihood + + @property + def lengthscale_range(self): + ells = self.lengthscale + return float(np.min(ells)), float(np.max(ells)) + + @property + def n(self): return self._backend.n + + @property + def dim(self): return self._backend.dim + + @property + def capacity(self): return self._backend.capacity + + def fit(self, X, y): + self._sync() + self._backend.fit(_np32(X), _np32(y)) + + def recompute(self): + self._sync() + self._backend.recompute() + + def mll(self, recompute=True): + self._sync() + if recompute: + self._backend.recompute() + params = torch.cat([self.raw_lengthscale, + self.raw_outputscale.unsqueeze(0), + self.raw_noise.unsqueeze(0), + self.raw_offset.unsqueeze(0)]) + return _MLLFunction.apply(params, self._backend) + + def predict(self, Xs): + self._sync() + means, vars_ = self._backend.predict(_np32(Xs)) + return torch.from_numpy(means), torch.from_numpy(vars_) + + def eval(self): + result = super().eval() + if hasattr(self, '_backend'): + self._sync() + self._backend.recompute() + return result + + def save(self, path): + self._sync() + self._backend.save(path) + + @classmethod + def load(cls, path, extra_cap=0, use_cuda=True): + obj = cls.__new__(cls) + nn.Module.__init__(obj) + obj._backend = _C.GaussianProcess.load(path, extra_cap) + obj.raw_lengthscale = nn.Parameter( + torch.tensor(np.asarray(obj._backend.raw_lengthscale), dtype=torch.float64)) + obj.raw_outputscale = nn.Parameter( + torch.tensor(obj._backend.raw_outputscale, dtype=torch.float64)) + obj.raw_noise = nn.Parameter( + torch.tensor(obj._backend.raw_noise, dtype=torch.float64)) + obj.raw_offset = nn.Parameter( + torch.tensor(obj._backend.raw_offset, dtype=torch.float64)) + return obj + + def __repr__(self): + ells = self.lengthscale + if len(ells) == 1: + ell_s = f"{ells[0]:.3g}" + else: + ell_s = f"[{np.min(ells):.3g}..{np.max(ells):.3g}]" + return (f"") + + def _sync(self): + self._backend.raw_lengthscale = self.raw_lengthscale.detach().cpu().numpy().astype(np.float32) + self._backend.raw_outputscale = float(self.raw_outputscale.item()) + self._backend.raw_noise = float(self.raw_noise.item()) + self._backend.raw_offset = float(self.raw_offset.item()) diff --git a/pufferlib/sweep.py b/pufferlib/sweep.py index 36e27bf42a..ee954b18d3 100644 --- a/pufferlib/sweep.py +++ b/pufferlib/sweep.py @@ -9,18 +9,13 @@ import pufferlib import torch -import gpytorch -from gpytorch.models import ExactGP -from gpytorch.likelihoods import GaussianLikelihood -from gpytorch.kernels import MaternKernel, PolynomialKernel, ScaleKernel, AdditiveKernel -from gpytorch.means import ConstantMean -from gpytorch.mlls import ExactMarginalLogLikelihood -from gpytorch.priors import LogNormalPrior from scipy.optimize import minimize from scipy.stats.qmc import Sobol from scipy.spatial import KDTree from sklearn.linear_model import LogisticRegression +from pufferlib.gp_torch import GaussianProcess + EPSILON = 1e-6 def unroll_nested_dict(d): @@ -146,7 +141,8 @@ def _params_from_puffer_sweep(sweep_config, only_include=None): for name, param in sweep_config.items(): if name in ('method', 'metric', 'metric_distribution', 'goal', 'downsample', 'use_gpu', 'prune_pareto', - 'sweep_only', 'max_suggestion_cost', 'early_stop_quantile', 'gpus', 'max_runs'): + 'sweep_only', 'max_suggestion_cost', 'early_stop_quantile', 'gpus', 'max_runs', + 'match_enemy_model_path', 'match_num_games', 'match_enemy_hidden_size', 'match_enemy_num_layers'): continue assert isinstance(param, dict), f'Param {name} is not a dict' @@ -396,53 +392,41 @@ def early_stop(self, logs, target_key): return False -class ExactGPModel(ExactGP): - def __init__(self, train_x, train_y, likelihood, x_dim): - super(ExactGPModel, self).__init__(train_x, train_y, likelihood) - self.mean_module = ConstantMean() - # Matern 3/2 kernel (equivalent to Pyro's Matern32) - matern_kernel = MaternKernel(nu=1.5, ard_num_dims=x_dim) - - # NOTE: setting this constraint changes GP behavior, including the lengthscale - # even though the lengthscale is well within the range ... Commenting out for now. - # lengthscale_constraint = gpytorch.constraints.Interval(0.01, 10.0) - # matern_kernel = MaternKernel(nu=1.5, ard_num_dims=x_dim, lengthscale_constraint=lengthscale_constraint) +_NOISE_PRIOR_MU = math.log(1e-2) # LogNormal prior on noise, from HEBO +_NOISE_PRIOR_SIGMA = 0.5 - linear_kernel = PolynomialKernel(power=1) - self.covar_module = ScaleKernel(AdditiveKernel(linear_kernel, matern_kernel)) +def _noise_log_prior(raw_noise_param): + # LogNormal(mu, sigma) prior in constrained space, with softplus change-of-variables. + # log p(raw) = log p_lognormal(softplus(raw) + lb) + log softplus_grad(raw) + noise = torch.nn.functional.softplus(raw_noise_param) + 1e-4 + log_noise = torch.log(noise) + log_p = -0.5 * ((log_noise - _NOISE_PRIOR_MU) / _NOISE_PRIOR_SIGMA) ** 2 \ + - log_noise - math.log(_NOISE_PRIOR_SIGMA) + log_jac = -torch.log1p(torch.exp(-raw_noise_param)) # log sigmoid = log softplus_grad + return log_p + log_jac - def forward(self, x): - mean_x = self.mean_module(x) - covar_x = self.covar_module(x) - return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) - @property - def lengthscale_range(self): - # Get lengthscale from MaternKernel - lengthscale = self.covar_module.base_kernel.kernels[1].lengthscale.tolist()[0] - return min(lengthscale), max(lengthscale) +def _fit_gp(gp_model, optimizer, train_x, train_y, training_iter=50): + if isinstance(train_x, torch.Tensor): + train_x = train_x.cpu().numpy() + if isinstance(train_y, torch.Tensor): + train_y = train_y.cpu().numpy() -def train_gp_model(model, likelihood, mll, optimizer, train_x, train_y, training_iter=50): - model.train() - likelihood.train() - model.set_train_data(inputs=train_x, targets=train_y, strict=False) + gp_model.train() + gp_model.fit(train_x, train_y) loss = None for _ in range(training_iter): try: optimizer.zero_grad() - output = model(train_x) - loss = -mll(output, train_y) + loss = -gp_model.mll() - _noise_log_prior(gp_model.raw_noise) loss.backward() optimizer.step() loss = loss.detach() - - except gpytorch.utils.errors.NotPSDError: - # It's rare but it does happen. Hope it's a transient issue. + except Exception: break - model.eval() - likelihood.eval() + gp_model.eval() return loss.item() if loss is not None else 0 @@ -592,36 +576,40 @@ def __init__(self, self.stop_threshold_model = RobustLogCostModel(quantile=sweep_config['early_stop_quantile']) self.upper_cost_threshold = -np.inf - # Use 64 bit for GP regression - with default_tensor_dtype(torch.float64): - # Params taken from HEBO: https://arxiv.org/abs/2012.03826 - noise_prior = LogNormalPrior(math.log(1e-2), 0.5) - - # Create dummy data for model initialization on the selected device - dummy_x = torch.ones((1, self.hyperparameters.num), device=self.device) - dummy_y = torch.zeros(1, device=self.device) - # Score GP - self.likelihood_score = GaussianLikelihood(noise_prior=deepcopy(noise_prior)).to(self.device) - self.gp_score = ExactGPModel(dummy_x, dummy_y, self.likelihood_score, self.hyperparameters.num).to(self.device) - self.mll_score = ExactMarginalLogLikelihood(self.likelihood_score, self.gp_score).to(self.device) - self.score_opt = torch.optim.Adam(self.gp_score.parameters(), lr=self.gp_learning_rate, amsgrad=True) + # Score GP + self.gp_score = GaussianProcess(dim=self.hyperparameters.num, capacity=self.gp_max_obs, use_cuda=_use_gpu) + self.score_opt = torch.optim.Adam(self.gp_score.parameters(), lr=self.gp_learning_rate, amsgrad=True) - # Cost GP - self.likelihood_cost = GaussianLikelihood(noise_prior=deepcopy(noise_prior)).to(self.device) - self.gp_cost = ExactGPModel(dummy_x, dummy_y, self.likelihood_cost, self.hyperparameters.num).to(self.device) - self.mll_cost = ExactMarginalLogLikelihood(self.likelihood_cost, self.gp_cost).to(self.device) - self.cost_opt = torch.optim.Adam(self.gp_cost.parameters(), lr=self.gp_learning_rate, amsgrad=True) + # Cost GP + self.gp_cost = GaussianProcess(dim=self.hyperparameters.num, capacity=self.gp_max_obs, use_cuda=_use_gpu) + self.cost_opt = torch.optim.Adam(self.gp_cost.parameters(), lr=self.gp_learning_rate, amsgrad=True) + + # Buffers for GP training and inference + self.gp_params_buffer = torch.empty(self.gp_max_obs, self.hyperparameters.num, dtype=torch.float64, device=self.device) + self.gp_score_buffer = torch.empty(self.gp_max_obs, dtype=torch.float64, device=self.device) + self.gp_cost_buffer = torch.empty(self.gp_max_obs, dtype=torch.float64, device=self.device) + self.infer_batch_buffer = torch.empty(self.infer_batch_size, self.hyperparameters.num, dtype=torch.float64, device=self.device) - # Buffers for GP training and inference - self.gp_params_buffer = torch.empty(self.gp_max_obs, self.hyperparameters.num, device=self.device) - self.gp_score_buffer = torch.empty(self.gp_max_obs, device=self.device) - self.gp_cost_buffer = torch.empty(self.gp_max_obs, device=self.device) - self.infer_batch_buffer = torch.empty(self.infer_batch_size, self.hyperparameters.num, device=self.device) + _CUDA_ATTRS = ('gp_score', 'gp_cost', 'score_opt', 'cost_opt', + 'gp_params_buffer', 'gp_score_buffer', + 'gp_cost_buffer', 'infer_batch_buffer') + + def __getstate__(self): + state = self.__dict__.copy() + for attr in self._CUDA_ATTRS: + state.pop(attr, None) + return state + + def __setstate__(self, state): + self.__dict__.update(state) + for attr in self._CUDA_ATTRS: + if attr not in self.__dict__: + self.__dict__[attr] = None def to(self, device): self.device = torch.device(device) - for attr in ('gp_score', 'gp_cost', 'likelihood_score', 'likelihood_cost', - 'mll_score', 'mll_cost', 'gp_params_buffer', 'gp_score_buffer', + for attr in ('gp_score', 'gp_cost', + 'gp_params_buffer', 'gp_score_buffer', 'gp_cost_buffer', 'infer_batch_buffer'): setattr(self, attr, getattr(self, attr).to(self.device)) for opt in (self.score_opt, self.cost_opt): @@ -713,10 +701,8 @@ def _train_gp_models(self): log_c_norm_tensor = self.gp_cost_buffer[:num_sampled] log_c_norm_tensor.copy_(torch.from_numpy(log_c_norm)) - with warnings.catch_warnings(): - warnings.simplefilter("ignore", gpytorch.utils.warnings.NumericalWarning) - score_loss = train_gp_model(self.gp_score, self.likelihood_score, self.mll_score, self.score_opt, params_tensor, y_norm_tensor, training_iter=self.gp_training_iter) - cost_loss = train_gp_model(self.gp_cost, self.likelihood_cost, self.mll_cost, self.cost_opt, params_tensor, log_c_norm_tensor, training_iter=self.gp_training_iter) + score_loss = _fit_gp(self.gp_score, self.score_opt, params_tensor, y_norm_tensor, training_iter=self.gp_training_iter) + cost_loss = _fit_gp(self.gp_cost, self.cost_opt, params_tensor, log_c_norm_tensor, training_iter=self.gp_training_iter) return score_loss, cost_loss @@ -818,30 +804,25 @@ def suggest(self, fill, fixed_total_timesteps=None): # Batch predictions to avoid GPU OOM for large number of suggestions gp_y_norm_list, gp_log_c_norm_list = [], [] - with torch.no_grad(), gpytorch.settings.fast_pred_var(), warnings.catch_warnings(): - warnings.simplefilter("ignore", gpytorch.utils.warnings.NumericalWarning) - - # Create a reusable buffer on the device to avoid allocating a huge tensor + with torch.no_grad(): for i in range(0, len(suggestions), self.infer_batch_size): batch_numpy = suggestions[i:i+self.infer_batch_size] current_batch_size = len(batch_numpy) - # Use a slice of the buffer if the current batch is smaller batch_tensor = self.infer_batch_buffer[:current_batch_size] batch_tensor.copy_(torch.from_numpy(batch_numpy)) try: # Score and cost prediction - pred_y_mean = self.likelihood_score(self.gp_score(batch_tensor)).mean.cpu() - pred_c_mean = self.likelihood_cost(self.gp_cost(batch_tensor)).mean.cpu() - + pred_y_mean, _ = self.gp_score.predict(batch_tensor) + pred_c_mean, _ = self.gp_cost.predict(batch_tensor) except RuntimeError: # Handle numerical errors during GP prediction - pred_y_mean, pred_c_mean = torch.zeros(current_batch_size) - - gp_y_norm_list.append(pred_y_mean.cpu()) - gp_log_c_norm_list.append(pred_c_mean.cpu()) + pred_y_mean = torch.zeros(current_batch_size, dtype=torch.float64) + pred_c_mean = torch.zeros(current_batch_size, dtype=torch.float64) + gp_y_norm_list.append(pred_y_mean) + gp_log_c_norm_list.append(pred_c_mean) del pred_y_mean, pred_c_mean # Concatenate results from all batches diff --git a/src/bindings.cu b/src/bindings.cu index 64be61194d..d0fe08b1e7 100644 --- a/src/bindings.cu +++ b/src/bindings.cu @@ -4,6 +4,7 @@ #include #include #include "pufferlib.cu" +#include "gp_cuda.cu" #define _PUFFER_STRINGIFY(x) #x #define PUFFER_STRINGIFY(x) _PUFFER_STRINGIFY(x) @@ -463,6 +464,157 @@ std::unique_ptr create_pufferl(py::dict args) { return pufferl; } +// --------------------------------------------------------------------------- +// CUDA GP wrapper +// --------------------------------------------------------------------------- + +using gp_arr = py::array_t; + +class PyGP { + GaussianProcess *gp_; + + void check() const { + if (!gp_) throw std::runtime_error("GaussianProcess object is invalid"); + } + + static int check_X(py::buffer_info &buf, int dim) { + if (buf.ndim == 1) { + if (dim != 1) throw std::invalid_argument("1-D X requires dim=1"); + return (int)buf.shape[0]; + } + if (buf.ndim == 2) { + if ((int)buf.shape[1] != dim) + throw std::invalid_argument( + "X has " + std::to_string(buf.shape[1]) + + " cols, expected " + std::to_string(dim)); + return (int)buf.shape[0]; + } + throw std::invalid_argument("X must be 1-D or 2-D"); + } + +public: + PyGP(int dim, int capacity, + float lengthscale, float outputscale, float noise, float offset) + : gp_(gp_create(dim, capacity, + gp_kernel_matern32_linear(dim, lengthscale, outputscale, offset), + noise)) + { + if (!gp_) throw std::runtime_error("gp_create failed"); + } + explicit PyGP(GaussianProcess *raw) : gp_(raw) {} + ~PyGP() { if (gp_) gp_destroy(gp_); } + PyGP(const PyGP &) = delete; + PyGP &operator=(const PyGP &) = delete; + PyGP(PyGP &&o) noexcept : gp_(o.gp_) { o.gp_ = nullptr; } + PyGP &operator=(PyGP &&o) noexcept { + if (gp_) gp_destroy(gp_); gp_ = o.gp_; o.gp_ = nullptr; return *this; + } + + void fit(gp_arr X, gp_arr y) { + check(); + auto xbuf = X.request(), ybuf = y.request(); + int n = check_X(xbuf, gp_->dim); + if ((int)ybuf.shape[0] != n) throw std::invalid_argument("X and y length mismatch"); + int rc = gp_fit(gp_, (const float *)xbuf.ptr, (const float *)ybuf.ptr, n, 0); + if (rc == -1) throw std::runtime_error("n exceeds capacity"); + if (rc == -2) throw std::runtime_error("Cholesky factorisation failed"); + } + void recompute() { check(); gp_recompute(gp_, 0); } + + py::tuple predict(gp_arr X, bool noise = false) { + check(); + auto buf = X.request(); + int m = check_X(buf, gp_->dim); + auto means = py::array_t(m); + auto vars = py::array_t(m); + gp_predict(gp_, (const float *)buf.ptr, + (float *)means.request().ptr, + (float *)vars.request().ptr, m, 0); + if (noise) { + float sn = gp_get_noise(gp_); + float *vp = (float *)vars.request().ptr; + for (int i = 0; i < m; i++) vp[i] += sn; + } + return py::make_tuple(means, vars); + } + + float log_marginal_likelihood() const { check(); return gp_marginal_log_likelihood(gp_); } + + py::tuple mll_grad() const { + check(); + int np = gp_->kernel->n_params, d = gp_->dim; + float d_raw_noise; + std::vector kg((size_t)np); + gp_mll_grad(gp_, &d_raw_noise, kg.data(), 0); + py::list lst; + for (int i = 0; i < d; i++) lst.append(kg[i]); + lst.append(kg[np - 2]); + lst.append(d_raw_noise); + lst.append(kg[np - 1]); + return py::tuple(lst); + } + + py::array_t lengthscale() const { + check(); int d = gp_->dim; + py::array_t res(d); + auto buf = res.mutable_unchecked<1>(); + for (int i = 0; i < d; i++) buf(i) = gp_kernel_get_lengthscale(gp_->kernel, i); + return res; + } + float outputscale() const { check(); return gp_kernel_get_outputscale(gp_->kernel); } + float gp_noise() const { check(); return gp_get_noise(gp_); } + float offset() const { check(); return gp_kernel_get_offset(gp_->kernel); } + + void set_lengthscale(gp_arr v) { + check(); auto buf = v.request(); + if ((int)buf.shape[0] != gp_->dim) + throw std::invalid_argument("wrong lengthscale size: expected " + std::to_string(gp_->dim)); + const float *p = (const float *)buf.ptr; + for (int i = 0; i < gp_->dim; i++) gp_kernel_set_lengthscale(gp_->kernel, i, p[i]); + } + void set_outputscale(float v) { check(); gp_kernel_set_outputscale(gp_->kernel, v); } + void set_gp_noise (float v) { check(); gp_set_noise(gp_, v); } + void set_offset (float v) { check(); gp_kernel_set_offset(gp_->kernel, v); } + + py::array_t raw_lengthscale() const { + check(); int d = gp_->dim; + py::array_t res(d); + auto buf = res.mutable_unchecked<1>(); + for (int i = 0; i < d; i++) buf(i) = gp_->kernel->raw_params[i]; + return res; + } + float raw_outputscale() const { check(); return gp_->kernel->raw_params[gp_->kernel->n_params - 2]; } + float raw_noise() const { check(); return gp_->raw_noise; } + float raw_offset() const { check(); return gp_->kernel->raw_params[gp_->kernel->n_params - 1]; } + + void set_raw_lengthscale(gp_arr v) { + check(); auto buf = v.request(); + if ((int)buf.shape[0] != gp_->dim) + throw std::invalid_argument("wrong lengthscale size: expected " + std::to_string(gp_->dim)); + const float *p = (const float *)buf.ptr; + for (int i = 0; i < gp_->dim; i++) gp_->kernel->raw_params[i] = p[i]; + } + void set_raw_outputscale(float v) { check(); gp_->kernel->raw_params[gp_->kernel->n_params - 2] = v; } + void set_raw_noise (float v) { check(); gp_->raw_noise = v; } + void set_raw_offset (float v) { check(); gp_->kernel->raw_params[gp_->kernel->n_params - 1] = v; } + + int n() const { check(); return gp_->n; } + int dim() const { check(); return gp_->dim; } + int capacity() const { check(); return gp_->cap; } + float dedup_threshold() const { check(); return gp_->dedup_threshold; } + void set_dedup_threshold(float v) { check(); gp_->dedup_threshold = v; } + + void save(const std::string &path) const { + check(); + if (gp_save(gp_, path.c_str()) != 0) throw std::runtime_error("save failed: " + path); + } + static PyGP load(const std::string &path, int extra_cap = 0) { + GaussianProcess *raw = gp_load(path.c_str(), extra_cap); + if (!raw) throw std::runtime_error("load failed: " + path); + return PyGP(raw); + } +}; + PYBIND11_MODULE(_C, m) { // Multi-GPU: generate NCCL unique ID (call on rank 0, pass bytes to all ranks) m.def("get_nccl_id", []() { @@ -629,4 +781,91 @@ PYBIND11_MODULE(_C, m) { .def("num_params", [](PuffeRL& self) -> int64_t { return numel(self.master_weights.shape); }); + + // CUDA GP regression + py::class_(m, "GaussianProcess") + .def(py::init(), + py::arg("dim"), py::arg("capacity"), + py::arg("lengthscale") = 1.0, + py::arg("outputscale") = 1.0, + py::arg("noise") = 1e-2, + py::arg("offset") = 1.0) + .def("fit", &PyGP::fit, py::arg("X"), py::arg("y")) + .def("recompute", &PyGP::recompute) + .def("predict", &PyGP::predict, py::arg("X"), py::arg("noise") = false) + .def("mll_grad", &PyGP::mll_grad) + .def_property("lengthscale", &PyGP::lengthscale, &PyGP::set_lengthscale) + .def_property("outputscale", &PyGP::outputscale, &PyGP::set_outputscale) + .def_property("noise", &PyGP::gp_noise, &PyGP::set_gp_noise) + .def_property("offset", &PyGP::offset, &PyGP::set_offset) + .def_property("raw_lengthscale", &PyGP::raw_lengthscale, &PyGP::set_raw_lengthscale) + .def_property("raw_outputscale", &PyGP::raw_outputscale, &PyGP::set_raw_outputscale) + .def_property("raw_noise", &PyGP::raw_noise, &PyGP::set_raw_noise) + .def_property("raw_offset", &PyGP::raw_offset, &PyGP::set_raw_offset) + .def_property_readonly("log_marginal_likelihood", &PyGP::log_marginal_likelihood) + .def_property_readonly("n", &PyGP::n) + .def_property_readonly("dim", &PyGP::dim) + .def_property_readonly("capacity", &PyGP::capacity) + .def_property("dedup_threshold", &PyGP::dedup_threshold, &PyGP::set_dedup_threshold) + .def("save", &PyGP::save, py::arg("path")) + .def_static("load", &PyGP::load, py::arg("path"), py::arg("extra_cap") = 0) + .def(py::pickle( + [](const PyGP &g) { + py::dict state; + state["dim"] = g.dim(); + state["capacity"] = g.capacity(); + state["dedup_threshold"] = g.dedup_threshold(); + state["raw_noise"] = g.raw_noise(); + state["raw_lengthscale"] = g.raw_lengthscale(); + state["raw_outputscale"] = g.raw_outputscale(); + state["raw_offset"] = g.raw_offset(); + if (g.n() > 0) { + char tmppath[] = "/tmp/pufferlib_gp_XXXXXX"; + int fd = mkstemp(tmppath); + if (fd < 0) throw std::runtime_error("pickle: mkstemp failed"); + close(fd); + try { g.save(std::string(tmppath)); } + catch (...) { remove(tmppath); throw; } + FILE *fp = fopen(tmppath, "rb"); + if (!fp) { remove(tmppath); throw std::runtime_error("pickle: fopen failed"); } + fseek(fp, 0, SEEK_END); long sz = ftell(fp); fseek(fp, 0, SEEK_SET); + std::string buf((size_t)sz, '\0'); + fread(&buf[0], 1, (size_t)sz, fp); fclose(fp); remove(tmppath); + state["data"] = py::bytes(buf.data(), (size_t)sz); + } + return state; + }, + [](py::dict state) { + int dim = state["dim"].cast(); + int cap = state["capacity"].cast(); + float dedup = state["dedup_threshold"].cast(); + if (state.contains("data")) { + py::bytes blob = state["data"].cast(); + const char *ptr = PyBytes_AS_STRING(blob.ptr()); + Py_ssize_t len = PyBytes_GET_SIZE(blob.ptr()); + char tmppath[] = "/tmp/pufferlib_gp_XXXXXX"; + int fd = mkstemp(tmppath); + if (fd < 0) throw std::runtime_error("unpickle: mkstemp failed"); + ssize_t written = write(fd, ptr, (size_t)len); close(fd); + if (written != len) { remove(tmppath); throw std::runtime_error("unpickle: write failed"); } + int n; memcpy(&n, ptr + 8, sizeof(int)); + int extra_cap = cap > n ? cap - n : 0; + PyGP gp(nullptr); + try { gp = PyGP::load(std::string(tmppath), extra_cap); } + catch (...) { remove(tmppath); throw; } + remove(tmppath); gp.set_dedup_threshold(dedup); return gp; + } + PyGP gp(dim, cap, 1.0f, 1.0f, 1e-2f, 1.0f); + gp.set_raw_noise(state["raw_noise"].cast()); + gp.set_raw_lengthscale(state["raw_lengthscale"].cast()); + gp.set_raw_outputscale(state["raw_outputscale"].cast()); + gp.set_raw_offset(state["raw_offset"].cast()); + gp.set_dedup_threshold(dedup); return gp; + } + )) + .def("__repr__", [](const PyGP &g) { + return ""; + }); } diff --git a/src/gp_cuda.cu b/src/gp_cuda.cu new file mode 100644 index 0000000000..885782aa18 --- /dev/null +++ b/src/gp_cuda.cu @@ -0,0 +1,593 @@ +// Exact GP regression, CUDA (cuBLAS/cuSOLVER). + +#ifndef GP_CUDA_CU +#define GP_CUDA_CU + +#define _USE_MATH_DEFINES +#include +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif +#include +#include +#include +#include +#include +#include + +#include "gp_cuda_kernel.cu" + +#define CUSOLVER_CHECK(call) do { \ + cusolverStatus_t _s = (call); \ + if (_s != CUSOLVER_STATUS_SUCCESS) { \ + fprintf(stderr, "cuSOLVER error %s:%d: %d\n", __FILE__, __LINE__, (int)_s); \ + abort(); \ + } \ +} while (0) + +// -- GaussianProcess struct ------------------------------------------------------------- + +typedef struct { + int dim, n, cap; + float *d_X; + float *d_y; + float *d_L; + float *d_alpha; + float *d_work; + int lwork; + int *d_info; + float raw_noise; + float dedup_threshold; + cublasHandle_t cublas; + cusolverDnHandle_t cusolver; + GPKernel *kernel; +} GaussianProcess; + +GaussianProcess *gp_create (int dim, int cap, GPKernel *kernel, float noise); +void gp_destroy (GaussianProcess *gp); +float gp_get_noise(const GaussianProcess *gp); +void gp_set_noise(GaussianProcess *gp, float v); +int gp_fit (GaussianProcess *gp, const float *X, const float *y, int n, cudaStream_t stream); +int gp_recompute(GaussianProcess *gp, cudaStream_t stream); +void gp_predict(const GaussianProcess *gp, const float *Xs, + float *means, float *vars, int m, cudaStream_t stream); +float gp_marginal_log_likelihood(const GaussianProcess *gp); +void gp_mll_grad(const GaussianProcess *gp, float *d_raw_noise, float *kernel_grads, + cudaStream_t stream); +int gp_save(const GaussianProcess *gp, const char *path); +GaussianProcess *gp_load(const char *path, int extra_cap); + +// -- utility device kernels -------------------------------------------------- + +__global__ void gp_k_add_diag(float *A, int n, float val) +{ + int i = blockIdx.x * 256 + threadIdx.x; + if (i < n) A[(size_t)i * (n + 1)] += val; +} + +__global__ void gp_k_extract_diag(const float *A, float *d, int n) +{ + int i = blockIdx.x * 256 + threadIdx.x; + if (i < n) d[i] = A[(size_t)i * (n + 1)]; +} + +__global__ void gp_k_col_sqnorms(const float *V, float *sqnorms, int n, int m) +{ + int j = blockIdx.x * 256 + threadIdx.x; + if (j >= m) return; + float sq = 0.0; + const float *col = V + (size_t)j * n; + for (int i = 0; i < n; i++) sq += col[i] * col[i]; + sqnorms[j] = sq; +} + +__global__ void gp_k_var_subtract(float *vars, const float *sqnorms, int m) +{ + int j = blockIdx.x * 256 + threadIdx.x; + if (j >= m) return; + float v = vars[j] - sqnorms[j]; + vars[j] = v > 0.0 ? v : 0.0; +} + +// -- host-side near-duplicate filter ----------------------------------------- + +#define KD_LEAF 16 + +typedef struct { int lo, hi, left, right, split_dim; float split_val; } KDNode; +typedef struct { const float *X; int n, dim, n_nodes; int *idx; KDNode *nodes; } KDTree; + +static int g_kd_split_dim, g_kd_d; +static const float *g_kd_X; + +static int kd_cmp_fn(const void *a, const void *b) +{ + float va = g_kd_X[*(const int *)a * g_kd_d + g_kd_split_dim]; + float vb = g_kd_X[*(const int *)b * g_kd_d + g_kd_split_dim]; + return (va > vb) - (va < vb); +} + +static void kd_build(KDTree *t, int node, int lo, int hi) +{ + KDNode *nd = &t->nodes[node]; + nd->lo = lo; nd->hi = hi; nd->left = nd->right = -1; nd->split_dim = -1; + if (hi - lo <= KD_LEAF) return; + + int best = 0; float spread = -1.0; + for (int k = 0; k < t->dim; k++) { + float mn = t->X[t->idx[lo] * t->dim + k], mx = mn; + for (int i = lo + 1; i < hi; i++) { + float v = t->X[t->idx[i] * t->dim + k]; + if (v < mn) mn = v; + if (v > mx) mx = v; + } + if (mx - mn > spread) { spread = mx - mn; best = k; } + } + int mid = (lo + hi) / 2; + g_kd_split_dim = best; g_kd_d = t->dim; g_kd_X = t->X; + qsort(t->idx + lo, hi - lo, sizeof(int), kd_cmp_fn); + nd->split_dim = best; + nd->split_val = t->X[t->idx[mid] * t->dim + best]; + int ln = t->n_nodes++, rn = t->n_nodes++; + nd->left = ln; nd->right = rn; + kd_build(t, ln, lo, mid); + kd_build(t, rn, mid, hi); +} + +static KDTree kd_create(const float *X, int n, int dim) +{ + KDTree t; + t.X = X; t.n = n; t.dim = dim; t.n_nodes = 1; + t.idx = (int *)malloc(n * sizeof(int)); + t.nodes = (KDNode *)malloc(2 * n * sizeof(KDNode)); + for (int i = 0; i < n; i++) t.idx[i] = i; + kd_build(&t, 0, 0, n); + return t; +} + +static void kd_destroy(KDTree *t) { free(t->idx); free(t->nodes); } + +static void kd_query_ball(const KDTree *t, int node, const float *q, + float r, float r2, int *out, int *cnt) +{ + const KDNode *nd = &t->nodes[node]; + if (nd->split_dim < 0) { + for (int i = nd->lo; i < nd->hi; i++) { + int p = t->idx[i]; float sq = 0.0; + for (int k = 0; k < t->dim; k++) { + float df = q[k] - t->X[p * t->dim + k]; sq += df * df; + } + if (sq <= r2) out[(*cnt)++] = p; + } + return; + } + float dv = q[nd->split_dim] - nd->split_val; + if (nd->left >= 0 && dv <= r) kd_query_ball(t, nd->left, q, r, r2, out, cnt); + if (nd->right >= 0 && dv >= -r) kd_query_ball(t, nd->right, q, r, r2, out, cnt); +} + +static int gp_filter_near_duplicates(const float *X, int n, int dim, + float threshold, int *kept_indices) +{ + if (n <= 0) return 0; + if (n == 1) { kept_indices[0] = 0; return 1; } + + KDTree tree = kd_create(X, n, dim); + int *keep = (int *)malloc(n * sizeof(int)); + int *nearby = (int *)malloc(n * sizeof(int)); + float r2 = threshold * threshold; + for (int i = 0; i < n; i++) keep[i] = 1; + + for (int i = n - 1; i >= 0; i--) { + if (!keep[i]) continue; + int cnt = 0; + kd_query_ball(&tree, 0, &X[(size_t)i * dim], threshold, r2, nearby, &cnt); + for (int j = 0; j < cnt; j++) + if (nearby[j] != i) keep[nearby[j]] = 0; + } + + int count = 0; + for (int i = 0; i < n; i++) + if (keep[i]) kept_indices[count++] = i; + + free(nearby); free(keep); kd_destroy(&tree); + return count; +} + +// -- hyperparameter access --------------------------------------------------- + +float gp_get_noise(const GaussianProcess *gp) { return softplus(gp->raw_noise); } +void gp_set_noise(GaussianProcess *gp, float v) { gp->raw_noise = inv_softplus(v); } + +// -- lifecycle --------------------------------------------------------------- + +GaussianProcess *gp_create(int dim, int cap, GPKernel *kernel, float noise) +{ + GaussianProcess *gp = (GaussianProcess *)calloc(1, sizeof(GaussianProcess)); + gp->dim = dim; + gp->cap = cap; + gp->kernel = kernel; + gp_set_noise(gp, noise); + CUDA_CHECK(cudaMalloc(&gp->d_X, (size_t)cap * dim * sizeof(float))); + CUDA_CHECK(cudaMalloc(&gp->d_y, (size_t)cap * sizeof(float))); + CUDA_CHECK(cudaMalloc(&gp->d_L, (size_t)cap * cap * sizeof(float))); + CUDA_CHECK(cudaMalloc(&gp->d_alpha, (size_t)cap * sizeof(float))); + CUDA_CHECK(cudaMalloc(&gp->d_info, sizeof(int))); + CUBLAS_CHECK (cublasCreate (&gp->cublas)); + CUSOLVER_CHECK(cusolverDnCreate(&gp->cusolver)); + return gp; +} + +void gp_destroy(GaussianProcess *gp) +{ + if (!gp) return; + cudaFree(gp->d_X); cudaFree(gp->d_y); + cudaFree(gp->d_L); cudaFree(gp->d_alpha); + cudaFree(gp->d_work); cudaFree(gp->d_info); + cublasDestroy (gp->cublas); + cusolverDnDestroy(gp->cusolver); + if (gp->kernel && gp->kernel->destroy) gp->kernel->destroy(gp->kernel); + free(gp); +} + +// -- internal helpers -------------------------------------------------------- + +static int run_potrf(GaussianProcess *gp, float *d_K, int n, cudaStream_t stream) +{ + CUSOLVER_CHECK(cusolverDnSetStream(gp->cusolver, stream)); + int lwork = 0; + CUSOLVER_CHECK(cusolverDnSpotrf_bufferSize( + gp->cusolver, CUBLAS_FILL_MODE_LOWER, n, d_K, n, &lwork)); + if (lwork > gp->lwork) { + cudaFree(gp->d_work); + CUDA_CHECK(cudaMalloc(&gp->d_work, (size_t)lwork * sizeof(float))); + gp->lwork = lwork; + } + CUSOLVER_CHECK(cusolverDnSpotrf( + gp->cusolver, CUBLAS_FILL_MODE_LOWER, n, d_K, n, + gp->d_work, gp->lwork, gp->d_info)); + + int h_info; + CUDA_CHECK(cudaMemcpyAsync(&h_info, gp->d_info, sizeof(int), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + return h_info; +} + +static void solve_alpha(GaussianProcess *gp, const float *d_L, int n, float *d_alpha, + cudaStream_t stream) +{ + CUBLAS_CHECK(cublasSetStream(gp->cublas, stream)); + CUBLAS_CHECK(cublasStrsv(gp->cublas, + CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, + n, d_L, n, d_alpha, 1)); + CUBLAS_CHECK(cublasStrsv(gp->cublas, + CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, + n, d_L, n, d_alpha, 1)); +} + +// -- training ---------------------------------------------------------------- + +int gp_recompute(GaussianProcess *gp, cudaStream_t stream) +{ + int n = gp->n; + if (n == 0) return 0; + + gp->kernel->build_K(gp->kernel, gp->d_X, n, gp->dim, gp_get_noise(gp), gp->d_L, stream); + + int info = run_potrf(gp, gp->d_L, n, stream); + if (info != 0) { + gp->kernel->build_K(gp->kernel, gp->d_X, n, gp->dim, gp_get_noise(gp), gp->d_L, stream); + gp_k_add_diag<<<(n + 255) / 256, 256, 0, stream>>>(gp->d_L, n, 1e-8); + CUDA_CHECK(cudaGetLastError()); + info = run_potrf(gp, gp->d_L, n, stream); + if (info != 0) return -2; + } + + CUDA_CHECK(cudaMemcpyAsync(gp->d_alpha, gp->d_y, + (size_t)n * sizeof(float), cudaMemcpyDeviceToDevice, stream)); + solve_alpha(gp, gp->d_L, n, gp->d_alpha, stream); + return 0; +} + +int gp_fit(GaussianProcess *gp, const float *X, const float *y, int n, cudaStream_t stream) +{ + if (n > gp->cap) return -1; + + if (gp->dedup_threshold > 0.0 && n > 1) { + int *idx = (int *)malloc(n * sizeof(int)); + int n_kept = gp_filter_near_duplicates(X, n, gp->dim, + gp->dedup_threshold, idx); + float *X_c = (float *)malloc((size_t)n_kept * gp->dim * sizeof(float)); + float *y_c = (float *)malloc((size_t)n_kept * sizeof(float)); + for (int i = 0; i < n_kept; i++) { + memcpy(&X_c[i * gp->dim], &X[idx[i] * gp->dim], + gp->dim * sizeof(float)); + y_c[i] = y[idx[i]]; + } + free(idx); + CUDA_CHECK(cudaMemcpyAsync(gp->d_X, X_c, + (size_t)n_kept * gp->dim * sizeof(float), + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(gp->d_y, y_c, + (size_t)n_kept * sizeof(float), + cudaMemcpyHostToDevice, stream)); + free(X_c); free(y_c); + gp->n = n_kept; + } else { + CUDA_CHECK(cudaMemcpyAsync(gp->d_X, X, + (size_t)n * gp->dim * sizeof(float), + cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(gp->d_y, y, + (size_t)n * sizeof(float), + cudaMemcpyHostToDevice, stream)); + gp->n = n; + } + return gp_recompute(gp, stream); +} + +// -- prediction -------------------------------------------------------------- + +static void predict_dev(const GaussianProcess *gp, const float *d_Xte, + float *d_means, float *d_vars, int m, cudaStream_t stream) +{ + int n = gp->n, d = gp->dim; + const float one = 1.0, zero = 0.0; + CUBLAS_CHECK(cublasSetStream(gp->cublas, stream)); + + float *d_Ks; + CUDA_CHECK(cudaMallocAsync(&d_Ks, (size_t)n * m * sizeof(float), stream)); + gp->kernel->build_Ks(gp->kernel, gp->d_X, d_Xte, n, m, d, d_Ks, stream); + + CUBLAS_CHECK(cublasSgemv(gp->cublas, CUBLAS_OP_T, n, m, + &one, d_Ks, n, gp->d_alpha, 1, &zero, d_means, 1)); + + if (d_vars) { + gp->kernel->build_kself_batch(gp->kernel, d_Xte, m, d, d_vars, stream); + + CUBLAS_CHECK(cublasStrsm(gp->cublas, + CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, n, m, &one, gp->d_L, n, d_Ks, n)); + float *d_sqnorms; + CUDA_CHECK(cudaMallocAsync(&d_sqnorms, (size_t)m * sizeof(float), stream)); + gp_k_col_sqnorms<<<(m + 255) / 256, 256, 0, stream>>>(d_Ks, d_sqnorms, n, m); + CUDA_CHECK(cudaGetLastError()); + gp_k_var_subtract<<<(m + 255) / 256, 256, 0, stream>>>(d_vars, d_sqnorms, m); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaFreeAsync(d_sqnorms, stream)); + } + + CUDA_CHECK(cudaFreeAsync(d_Ks, stream)); +} + +void gp_predict(const GaussianProcess *gp, const float *Xs, float *means, float *vars, + int m, cudaStream_t stream) +{ + int n = gp->n; + if (n == 0) { + for (int j = 0; j < m; j++) { + means[j] = 0.0; + if (vars) vars[j] = gp->kernel->k_self(gp->kernel, &Xs[j * gp->dim], gp->dim); + } + return; + } + + float *d_Xte; + CUDA_CHECK(cudaMallocAsync(&d_Xte, (size_t)m * gp->dim * sizeof(float), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_Xte, Xs, (size_t)m * gp->dim * sizeof(float), + cudaMemcpyHostToDevice, stream)); + + float *d_means, *d_vars = NULL; + CUDA_CHECK(cudaMallocAsync(&d_means, (size_t)m * sizeof(float), stream)); + if (vars) CUDA_CHECK(cudaMallocAsync(&d_vars, (size_t)m * sizeof(float), stream)); + + predict_dev(gp, d_Xte, d_means, d_vars, m, stream); + CUDA_CHECK(cudaFreeAsync(d_Xte, stream)); + + CUDA_CHECK(cudaMemcpyAsync(means, d_means, (size_t)m * sizeof(float), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaFreeAsync(d_means, stream)); + if (vars) { + CUDA_CHECK(cudaMemcpyAsync(vars, d_vars, (size_t)m * sizeof(float), + cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaFreeAsync(d_vars, stream)); + } + CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +// -- likelihood and gradients ------------------------------------------------ + +float gp_marginal_log_likelihood(const GaussianProcess *gp) +{ + int n = gp->n; + if (n == 0) return 0.0; + + CUBLAS_CHECK(cublasSetStream(gp->cublas, 0)); + float data_fit; + CUBLAS_CHECK(cublasSdot(gp->cublas, n, gp->d_y, 1, gp->d_alpha, 1, &data_fit)); + + float *d_diag; + CUDA_CHECK(cudaMalloc(&d_diag, (size_t)n * sizeof(float))); + gp_k_extract_diag<<<(n + 255) / 256, 256>>>(gp->d_L, d_diag, n); + CUDA_CHECK(cudaGetLastError()); + + float *h_diag = (float *)malloc(n * sizeof(float)); + CUDA_CHECK(cudaMemcpy(h_diag, d_diag, (size_t)n * sizeof(float), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaFree(d_diag)); + + float log_det = 0.0; + for (int i = 0; i < n; i++) log_det += log(h_diag[i]); + free(h_diag); + + return (-0.5 * data_fit - log_det - 0.5 * n * log(2.0 * M_PI)) / n; +} + +void gp_mll_grad(const GaussianProcess *gp, float *d_raw_noise, float *kernel_grads, + cudaStream_t stream) +{ + int n = gp->n; + if (n == 0) { + if (d_raw_noise) *d_raw_noise = 0.0; + if (kernel_grads) + for (int i = 0; i < gp->kernel->n_params; i++) kernel_grads[i] = 0.0; + return; + } + + CUSOLVER_CHECK(cusolverDnSetStream(gp->cusolver, stream)); + CUBLAS_CHECK(cublasSetStream(gp->cublas, stream)); + + float *d_Kinv; + CUDA_CHECK(cudaMallocAsync(&d_Kinv, (size_t)n * n * sizeof(float), stream)); + CUDA_CHECK(cudaMemsetAsync(d_Kinv, 0, (size_t)n * n * sizeof(float), stream)); + gp_k_add_diag<<<(n + 255) / 256, 256, 0, stream>>>(d_Kinv, n, 1.0); + CUDA_CHECK(cudaGetLastError()); + + CUSOLVER_CHECK(cusolverDnSpotrs(gp->cusolver, CUBLAS_FILL_MODE_LOWER, + n, n, gp->d_L, n, d_Kinv, n, gp->d_info)); + + if (d_raw_noise) { + float term1, term2; + CUBLAS_CHECK(cublasSdot (gp->cublas, n, gp->d_alpha, 1, gp->d_alpha, 1, &term1)); + CUBLAS_CHECK(cublasSasum(gp->cublas, n, d_Kinv, n + 1, &term2)); + *d_raw_noise = 0.5 * (term1 - term2) * softplus_grad(gp->raw_noise) / n; + } + + if (kernel_grads) { + for (int i = 0; i < gp->kernel->n_params; i++) kernel_grads[i] = 0.0; + gp->kernel->mll_grad(gp->kernel, gp->d_X, n, gp->dim, + gp->cublas, stream, gp->d_alpha, d_Kinv, kernel_grads); + for (int i = 0; i < gp->kernel->n_params; i++) kernel_grads[i] /= n; + } + + CUDA_CHECK(cudaFreeAsync(d_Kinv, stream)); +} + +// -- kernel registry for gp_load --------------------------------------------- + +typedef struct { const char *tag; GPKernel *(*make)(float *, int); } KernelFactory; + +static GPKernel *kf_matern32lin(float *rp, int np) +{ + int dim = np - 2; + GPKernel *k = gp_kernel_matern32_linear(dim, 1.0, 1.0, 1.0); + memcpy(k->raw_params, rp, (size_t)np * sizeof(float)); + return k; +} + +static const KernelFactory kernel_registry[] = { + { GP_KERNEL_TAG_MATERN32_LINEAR, kf_matern32lin }, + { NULL, NULL } +}; + +static GPKernel *kernel_from_tag(const char *tag, float *rp, int np) +{ + for (int i = 0; kernel_registry[i].tag; i++) + if (memcmp(tag, kernel_registry[i].tag, 4) == 0) + return kernel_registry[i].make(rp, np); + return NULL; +} + +// -- save / load ------------------------------------------------------------- + +int gp_save(const GaussianProcess *gp, const char *path) +{ + if (gp->n == 0) return -1; + CUDA_CHECK(cudaDeviceSynchronize()); + FILE *f = fopen(path, "wb"); + if (!f) return -1; + + int n = gp->n, dim = gp->dim, np = gp->kernel->n_params; + fwrite("GC04", 1, 4, f); + fwrite(&dim, sizeof(int), 1, f); + fwrite(&n, sizeof(int), 1, f); + fwrite(&gp->raw_noise, sizeof(float), 1, f); + fwrite(gp->kernel->tag, 1, 4, f); + fwrite(&np, sizeof(int), 1, f); + fwrite(gp->kernel->raw_params, sizeof(float), np, f); + + float *h_X = (float *)malloc((size_t)n * dim * sizeof(float)); + float *h_y = (float *)malloc((size_t)n * sizeof(float)); + float *h_L = (float *)malloc((size_t)n * n * sizeof(float)); + float *h_alpha = (float *)malloc((size_t)n * sizeof(float)); + CUDA_CHECK(cudaMemcpy(h_X, gp->d_X, (size_t)n * dim * sizeof(float), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_y, gp->d_y, (size_t)n * sizeof(float), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_L, gp->d_L, (size_t)n * n * sizeof(float), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_alpha, gp->d_alpha, (size_t)n * sizeof(float), cudaMemcpyDeviceToHost)); + + fwrite(h_X, sizeof(float), (size_t)n * dim, f); + fwrite(h_y, sizeof(float), n, f); + fwrite(h_L, sizeof(float), (size_t)n * n, f); + fwrite(h_alpha, sizeof(float), n, f); + + free(h_X); free(h_y); free(h_L); free(h_alpha); + fclose(f); + return 0; +} + +GaussianProcess *gp_load(const char *path, int extra_cap) +{ + GaussianProcess *gp = NULL; + float *h_X = NULL, *h_y = NULL; + float *h_L = NULL, *h_alpha = NULL; + + FILE *f = fopen(path, "rb"); + if (!f) return NULL; + +#define RD(ptr, sz, cnt) \ + if (fread(ptr, sz, cnt, f) != (size_t)(cnt)) break + + do { + char magic[4], ktag[4]; + int dim, n, np; + float rn; + + RD(magic, 1, 4); + if (memcmp(magic, "GC04", 4) != 0) break; + + RD(&dim, sizeof(int), 1); + RD(&n, sizeof(int), 1); + RD(&rn, sizeof(float), 1); + RD(ktag, 1, 4); + RD(&np, sizeof(int), 1); + + float *rp = (float *)malloc((size_t)np * sizeof(float)); + if (!rp) break; + if (fread(rp, sizeof(float), np, f) != (size_t)np) { free(rp); break; } + + GPKernel *k = kernel_from_tag(ktag, rp, np); + free(rp); + if (!k) break; + + int cap = n + (extra_cap > 0 ? extra_cap : 0); + gp = gp_create(dim, cap, k, 1.0); + gp->raw_noise = rn; + gp->n = n; + + h_X = (float *)malloc((size_t)n * dim * sizeof(float)); + h_y = (float *)malloc((size_t)n * sizeof(float)); + h_L = (float *)malloc((size_t)n * n * sizeof(float)); + h_alpha = (float *)malloc((size_t)n * sizeof(float)); + + RD(h_X, sizeof(float), (size_t)n * dim); + RD(h_y, sizeof(float), n); + RD(h_L, sizeof(float), (size_t)n * n); + RD(h_alpha, sizeof(float), n); + + CUDA_CHECK(cudaMemcpy(gp->d_X, h_X, (size_t)n * dim * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(gp->d_y, h_y, (size_t)n * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(gp->d_L, h_L, (size_t)n * n * sizeof(float), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(gp->d_alpha, h_alpha, (size_t)n * sizeof(float), cudaMemcpyHostToDevice)); + + free(h_X); free(h_y); free(h_L); free(h_alpha); + fclose(f); + return gp; + } while (0); + +#undef RD + free(h_X); free(h_y); free(h_L); free(h_alpha); + if (gp) gp_destroy(gp); + fclose(f); + return NULL; +} + +#endif // GP_CUDA_CU diff --git a/src/gp_cuda_kernel.cu b/src/gp_cuda_kernel.cu new file mode 100644 index 0000000000..3f7ba9378f --- /dev/null +++ b/src/gp_cuda_kernel.cu @@ -0,0 +1,348 @@ +// gp_cuda_kernel.cu -- Matern32+Linear GP covariance kernel (CUDA), ARD. +// + +#ifndef GP_CUDA_KERNEL_CU +#define GP_CUDA_KERNEL_CU + +#include +#include +#include +#include +#include +#include + +#define CUDA_CHECK(call) do { \ + cudaError_t _e = (call); \ + if (_e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \ + cudaGetErrorString(_e)); \ + abort(); \ + } \ +} while (0) + +#define CUBLAS_CHECK(call) do { \ + cublasStatus_t _s = (call); \ + if (_s != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "cuBLAS error %s:%d: %d\n", __FILE__, __LINE__, (int)_s); \ + abort(); \ + } \ +} while (0) + +#define SP_LB 1e-4 +__host__ __device__ __forceinline__ float softplus(float x) { return (x > 20.0 ? x : log1p(exp(x))) + SP_LB; } +__host__ __device__ __forceinline__ float inv_softplus(float x) { float v = x - SP_LB; return v > 20.0 ? v : log(expm1(v)); } +__host__ __device__ __forceinline__ float softplus_grad(float x) { return 1.0 / (1.0 + exp(-x)); } + +// -- kernel struct ----------------------------------------------------------- + +typedef struct GPKernel GPKernel; +struct GPKernel { + int n_params; + float *raw_params; + char tag[4]; + + void (*build_K)(const GPKernel *k, const float *d_X, int n, int d, + float sigma_n, float *d_K, cudaStream_t stream); + void (*build_Ks)(const GPKernel *k, const float *d_Xtr, const float *d_Xte, + int n, int m, int d, float *d_Ks, cudaStream_t stream); + void (*build_kself_batch)(const GPKernel *k, const float *d_X, int m, int d, + float *d_out, cudaStream_t stream); + float (*k_self)(const GPKernel *k, const float *x, int d); + void (*mll_grad)(const GPKernel *k, const float *d_X, int n, int d, + cublasHandle_t cublas, cudaStream_t stream, + const float *d_alpha, const float *d_Kinv, + float *kernel_grads); + void (*destroy)(GPKernel *k); +}; + +#define GP_KERNEL_TAG_MATERN32_LINEAR "M32L" + +GPKernel *gp_kernel_matern32_linear(int dim, float lengthscale, float outputscale, + float offset); + +float gp_kernel_get_lengthscale(const GPKernel *k, int d); +float gp_kernel_get_outputscale(const GPKernel *k); +float gp_kernel_get_offset (const GPKernel *k); +void gp_kernel_set_lengthscale(GPKernel *k, int d, float v); +void gp_kernel_set_outputscale(GPKernel *k, float v); +void gp_kernel_set_offset (GPKernel *k, float v); + +// -- kernel implementations -------------------------------------------------- + +#define SF_IDX(np) ((np) - 2) +#define OFF_IDX(np) ((np) - 1) + +#define GP_BLOCK 16 +inline int gp_grid(int n) { return (n + GP_BLOCK - 1) / GP_BLOCK; } + +__global__ void matern32lin_k_build_K( + const float * __restrict__ X, + float * __restrict__ K, + const float * __restrict__ inv_ells, + int n, int d, float sigma_f, float sigma_n, float offset) +{ + int col = blockIdx.x * GP_BLOCK + threadIdx.x; + int row = blockIdx.y * GP_BLOCK + threadIdx.y; + if (row >= n || col >= n) return; + float dot = 0.0, r2 = 0.0; + for (int k = 0; k < d; k++) { + float xr = X[row * d + k], xc = X[col * d + k]; + dot += xr * xc; + float diff = (xr - xc) * inv_ells[k]; + r2 += diff * diff; + } + if (r2 < 0.0) r2 = 0.0; + float u = sqrt(3.0) * sqrt(r2); + float val = sigma_f * (dot + offset + (1.0 + u) * exp(-u)); + if (row == col) val += sigma_n; + K[col * n + row] = val; +} + +__global__ void matern32lin_k_build_Ks( + const float * __restrict__ Xtr, + const float * __restrict__ Xte, + float * __restrict__ Ks, + const float * __restrict__ inv_ells, + int n, int m, int d, float sigma_f, float offset) +{ + int col = blockIdx.x * GP_BLOCK + threadIdx.x; + int row = blockIdx.y * GP_BLOCK + threadIdx.y; + if (row >= n || col >= m) return; + float dot = 0.0, r2 = 0.0; + for (int k = 0; k < d; k++) { + float xr = Xtr[row * d + k], xc = Xte[col * d + k]; + dot += xr * xc; + float diff = (xr - xc) * inv_ells[k]; + r2 += diff * diff; + } + if (r2 < 0.0) r2 = 0.0; + float u = sqrt(3.0) * sqrt(r2); + Ks[col * n + row] = sigma_f * (dot + offset + (1.0 + u) * exp(-u)); +} + +__global__ void matern32lin_k_build_D_ard( + const float * __restrict__ X, + float * __restrict__ D, + const float * __restrict__ inv_ells, + int n, int d) +{ + int col = blockIdx.x * GP_BLOCK + threadIdx.x; + int row = blockIdx.y * GP_BLOCK + threadIdx.y; + if (row >= n || col >= n) return; + float sq = 0.0; + for (int k = 0; k < d; k++) { + float diff = (X[row * d + k] - X[col * d + k]) * inv_ells[k]; + sq += diff * diff; + } + D[col * n + row] = sq < 0.0 ? 0.0 : sq; +} + +__global__ void matern32lin_k_compute_W( + const float * __restrict__ alpha, + const float * __restrict__ Kinv, + float * __restrict__ W, + int n) +{ + int col = blockIdx.x * GP_BLOCK + threadIdx.x; + int row = blockIdx.y * GP_BLOCK + threadIdx.y; + if (row >= n || col >= n) return; + W[col * n + row] = alpha[row] * alpha[col] - Kinv[col * n + row]; +} + +__global__ void matern32lin_k_dk_dell_d( + const float * __restrict__ X, + const float * __restrict__ D_ard, + float * __restrict__ out, + int n, int d, int dd, float sigma_f, float inv_ell_d3) +{ + int col = blockIdx.x * GP_BLOCK + threadIdx.x; + int row = blockIdx.y * GP_BLOCK + threadIdx.y; + if (row >= n || col >= n) return; + float r2 = D_ard[col * n + row]; + float diff = X[row * d + dd] - X[col * d + dd]; + out[col * n + row] = sigma_f * 3.0 * diff * diff * inv_ell_d3 + * exp(-sqrt(3.0) * sqrt(r2)); +} + +__global__ void matern32lin_k_fill(float *v, int n, float val) +{ + int i = blockIdx.x * 256 + threadIdx.x; + if (i < n) v[i] = val; +} + +__global__ void matern32lin_k_kself_batch( + const float * __restrict__ X, float * __restrict__ out, + int m, int d, float sigma_f, float offset) +{ + int row = blockIdx.x * 256 + threadIdx.x; + if (row >= m) return; + float norm2 = 0.0; + for (int k = 0; k < d; k++) { float v = X[row * d + k]; norm2 += v * v; } + out[row] = sigma_f * (norm2 + offset + 1.0); +} + +// -- helpers ----------------------------------------------------------------- + +static float *h2d(const float *h, int n, cudaStream_t stream) +{ + float *d; + CUDA_CHECK(cudaMallocAsync(&d, (size_t)n * sizeof(float), stream)); + CUDA_CHECK(cudaMemcpyAsync(d, h, (size_t)n * sizeof(float), + cudaMemcpyHostToDevice, stream)); + return d; +} + +static float *device_inv_ells(const GPKernel *k, int d, cudaStream_t stream) +{ + float *h = (float *)malloc(d * sizeof(float)); + for (int i = 0; i < d; i++) h[i] = 1.0 / softplus(k->raw_params[i]); + float *dev = h2d(h, d, stream); + free(h); + return dev; +} + +// -- launcher functions ------------------------------------------------------ + +static void matern32lin_build_K(const GPKernel *k, const float *d_X, int n, int d, + float sigma_n, float *d_K, cudaStream_t stream) +{ + int np = k->n_params; + float sigma_f = softplus(k->raw_params[SF_IDX(np)]); + float offset = softplus(k->raw_params[OFF_IDX(np)]); + float *d_inv = device_inv_ells(k, d, stream); + dim3 block(GP_BLOCK, GP_BLOCK), grid(gp_grid(n), gp_grid(n)); + matern32lin_k_build_K<<>>(d_X, d_K, d_inv, n, d, sigma_f, sigma_n, offset); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaFreeAsync(d_inv, stream)); +} + +static void matern32lin_build_Ks(const GPKernel *k, const float *d_Xtr, const float *d_Xte, + int n, int m, int d, float *d_Ks, cudaStream_t stream) +{ + int np = k->n_params; + float sigma_f = softplus(k->raw_params[SF_IDX(np)]); + float offset = softplus(k->raw_params[OFF_IDX(np)]); + float *d_inv = device_inv_ells(k, d, stream); + dim3 block(GP_BLOCK, GP_BLOCK), grid(gp_grid(m), gp_grid(n)); + matern32lin_k_build_Ks<<>>(d_Xtr, d_Xte, d_Ks, d_inv, n, m, d, sigma_f, offset); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaFreeAsync(d_inv, stream)); +} + +static float matern32lin_k_self(const GPKernel *k, const float *x, int d) +{ + int np = k->n_params; + float sigma_f = softplus(k->raw_params[SF_IDX(np)]); + float offset = softplus(k->raw_params[OFF_IDX(np)]); + float norm2 = 0.0; + for (int i = 0; i < d; i++) norm2 += x[i] * x[i]; + return sigma_f * (norm2 + offset + 1.0); +} + +static void matern32lin_build_kself_batch(const GPKernel *k, const float *d_X, + int m, int d, float *d_out, cudaStream_t stream) +{ + int np = k->n_params; + float sigma_f = softplus(k->raw_params[SF_IDX(np)]); + float offset = softplus(k->raw_params[OFF_IDX(np)]); + matern32lin_k_kself_batch<<<(m + 255) / 256, 256, 0, stream>>>(d_X, d_out, m, d, sigma_f, offset); + CUDA_CHECK(cudaGetLastError()); +} + +static void matern32lin_mll_grad(const GPKernel *k, const float *d_X, int n, int d, + cublasHandle_t cublas, cudaStream_t stream, + const float *d_alpha, const float *d_Kinv, + float *kernel_grads) +{ + int np = k->n_params; + float sigma_f = softplus(k->raw_params[SF_IDX(np)]); + float offset = softplus(k->raw_params[OFF_IDX(np)]); + const float one = 1.0, zero = 0.0; + + CUBLAS_CHECK(cublasSetStream(cublas, stream)); + + float *d_inv = device_inv_ells(k, d, stream); + float *d_D, *d_W, *d_temp; + CUDA_CHECK(cudaMallocAsync(&d_D, (size_t)n * n * sizeof(float), stream)); + CUDA_CHECK(cudaMallocAsync(&d_W, (size_t)n * n * sizeof(float), stream)); + CUDA_CHECK(cudaMallocAsync(&d_temp, (size_t)n * n * sizeof(float), stream)); + + dim3 block(GP_BLOCK, GP_BLOCK), grid(gp_grid(n), gp_grid(n)); + + matern32lin_k_build_D_ard<<>>(d_X, d_D, d_inv, n, d); + CUDA_CHECK(cudaGetLastError()); + matern32lin_k_compute_W<<>>(d_alpha, d_Kinv, d_W, n); + CUDA_CHECK(cudaGetLastError()); + + for (int dd = 0; dd < d; dd++) { + float ell_d = softplus(k->raw_params[dd]); + float inv_ell3 = 1.0 / (ell_d * ell_d * ell_d); + matern32lin_k_dk_dell_d<<>>(d_X, d_D, d_temp, n, d, dd, sigma_f, inv_ell3); + CUDA_CHECK(cudaGetLastError()); + float dot_val; + CUBLAS_CHECK(cublasSdot(cublas, n * n, d_W, 1, d_temp, 1, &dot_val)); + kernel_grads[dd] = 0.5 * dot_val * softplus_grad(k->raw_params[dd]); + } + + matern32lin_k_build_K<<>>(d_X, d_temp, d_inv, n, d, sigma_f, 0.0, offset); + CUDA_CHECK(cudaGetLastError()); + { + float dot_val; + CUBLAS_CHECK(cublasSdot(cublas, n * n, d_W, 1, d_temp, 1, &dot_val)); + kernel_grads[SF_IDX(np)] = 0.5 * dot_val / sigma_f + * softplus_grad(k->raw_params[SF_IDX(np)]); + } + + { + float *d_ones, *d_wrow; + CUDA_CHECK(cudaMallocAsync(&d_ones, (size_t)n * sizeof(float), stream)); + CUDA_CHECK(cudaMallocAsync(&d_wrow, (size_t)n * sizeof(float), stream)); + matern32lin_k_fill<<<(n + 255) / 256, 256, 0, stream>>>(d_ones, n, 1.0); + CUDA_CHECK(cudaGetLastError()); + float w_sum; + CUBLAS_CHECK(cublasSgemv(cublas, CUBLAS_OP_N, n, n, + &one, d_W, n, d_ones, 1, &zero, d_wrow, 1)); + CUBLAS_CHECK(cublasSdot(cublas, n, d_wrow, 1, d_ones, 1, &w_sum)); + kernel_grads[OFF_IDX(np)] = 0.5 * sigma_f * w_sum + * softplus_grad(k->raw_params[OFF_IDX(np)]); + CUDA_CHECK(cudaFreeAsync(d_ones, stream)); + CUDA_CHECK(cudaFreeAsync(d_wrow, stream)); + } + + CUDA_CHECK(cudaFreeAsync(d_inv, stream)); + CUDA_CHECK(cudaFreeAsync(d_D, stream)); + CUDA_CHECK(cudaFreeAsync(d_W, stream)); + CUDA_CHECK(cudaFreeAsync(d_temp, stream)); +} + +static void matern32lin_destroy(GPKernel *k) { free(k); } + +GPKernel *gp_kernel_matern32_linear(int dim, float lengthscale, float outputscale, + float offset) +{ + int n_params = dim + 2; + GPKernel *k = (GPKernel *)malloc(sizeof(GPKernel) + (size_t)n_params * sizeof(float)); + k->n_params = n_params; + k->raw_params = (float *)(k + 1); + memcpy(k->tag, GP_KERNEL_TAG_MATERN32_LINEAR, 4); + for (int i = 0; i < dim; i++) + k->raw_params[i] = inv_softplus(lengthscale); + k->raw_params[SF_IDX(n_params)] = inv_softplus(outputscale); + k->raw_params[OFF_IDX(n_params)] = inv_softplus(offset); + k->build_K = matern32lin_build_K; + k->build_Ks = matern32lin_build_Ks; + k->build_kself_batch = matern32lin_build_kself_batch; + k->k_self = matern32lin_k_self; + k->mll_grad = matern32lin_mll_grad; + k->destroy = matern32lin_destroy; + return k; +} + +float gp_kernel_get_lengthscale(const GPKernel *k, int d) { return softplus(k->raw_params[d]); } +float gp_kernel_get_outputscale(const GPKernel *k) { return softplus(k->raw_params[SF_IDX(k->n_params)]); } +float gp_kernel_get_offset (const GPKernel *k) { return softplus(k->raw_params[OFF_IDX(k->n_params)]); } +void gp_kernel_set_lengthscale(GPKernel *k, int d, float v) { k->raw_params[d] = inv_softplus(v); } +void gp_kernel_set_outputscale(GPKernel *k, float v) { k->raw_params[SF_IDX(k->n_params)] = inv_softplus(v); } +void gp_kernel_set_offset (GPKernel *k, float v) { k->raw_params[OFF_IDX(k->n_params)] = inv_softplus(v); } + +#endif // GP_CUDA_KERNEL_CU diff --git a/tests/test_custom_gp_numerics.py b/tests/test_custom_gp_numerics.py new file mode 100644 index 0000000000..0b967a85dc --- /dev/null +++ b/tests/test_custom_gp_numerics.py @@ -0,0 +1,137 @@ +""" +Custom CUDA GP vs GPyTorch + +Hyperparameters are fixed identically, just comparing the raw numerical results and timings for training and prediction. +""" + +import time, sys, math +import numpy as np + +def make_data(n, dim=1, seed=42): + rng = np.random.RandomState(seed) + X = rng.uniform(-5, 5, size=(n, dim)) + y = np.sin(X[:, 0]) + 0.1 * rng.randn(n) + return X.astype(np.float64), y.astype(np.float64) + +def make_test(m=200, dim=1): + xs = np.linspace(-6, 6, m).reshape(-1, 1) + if dim > 1: + xs = np.hstack([xs, np.zeros((m, dim - 1))]) + return xs.astype(np.float64) + +def bench_custom(X, y, Xs, hp): + from pufferlib.gp_torch import GaussianProcess + import torch # ensure CUDA context is warm before timing + + # Warm-up: a tiny fit so the first CUDA call overhead isn't charged + # to the benchmark (driver init, JIT, etc.) + _gp = GaussianProcess(dim=X.shape[1], capacity=1, + lengthscale=hp["lengthscale"], + outputscale=hp["outputscale"], + noise=hp["noise"]) + + t0 = time.perf_counter() + gp = GaussianProcess(dim=X.shape[1], capacity=len(X), + lengthscale=hp["lengthscale"], + outputscale=hp["outputscale"], + noise=hp["noise"]) + + print(gp) + gp.fit(X, y) + t_train = time.perf_counter() - t0 + + t0 = time.perf_counter() + means, vars_ = gp.predict(Xs) + t_pred = time.perf_counter() - t0 + + return means.numpy(), vars_.numpy(), gp.log_marginal_likelihood, t_train, t_pred + +def bench_gpytorch(X, y, Xs, hp): + import torch, gpytorch + + class ExactGP(gpytorch.models.ExactGP): + def __init__(self, tx, ty, lik): + super().__init__(tx, ty, lik) + from gpytorch.kernels import (MaternKernel, PolynomialKernel, + AdditiveKernel, ScaleKernel) + self.mean_module = gpytorch.means.ZeroMean() + matern = MaternKernel(nu=1.5, ard_num_dims=tx.shape[-1]) + linear = PolynomialKernel(power=1) + self.covar_module = ScaleKernel(AdditiveKernel(linear, matern)) + def forward(self, x): + return gpytorch.distributions.MultivariateNormal( + self.mean_module(x), self.covar_module(x)) + + Xt = torch.from_numpy(X).double() + yt = torch.from_numpy(y).double() + Xst = torch.from_numpy(Xs).double() + + t0 = time.perf_counter() + lik = gpytorch.likelihoods.GaussianLikelihood() + model = ExactGP(Xt, yt, lik) + model.double() + model.covar_module.base_kernel.kernels[1].lengthscale = hp["lengthscale"] + model.covar_module.outputscale = hp["outputscale"] + lik.noise = hp["noise"] + model.covar_module.base_kernel.kernels[0].raw_offset.data.fill_( + math.log(math.expm1(hp["offset"]))) + t_train = time.perf_counter() - t0 + + # Force exact Cholesky (default CG cutoff is 800) + chol_size = max(len(yt) + 1, 801) + model.eval(); lik.eval() + t0 = time.perf_counter() + with torch.no_grad(), \ + gpytorch.settings.fast_pred_var(), \ + gpytorch.settings.max_cholesky_size(chol_size): + latent = model(Xst) + means = latent.mean.numpy() + vars_ = latent.variance.numpy() + t_pred = time.perf_counter() - t0 + + model.train(); lik.train() + with torch.no_grad(), gpytorch.settings.max_cholesky_size(chol_size): + mll = gpytorch.mlls.ExactMarginalLogLikelihood(lik, model) + mll = mll(model(Xt), yt).item() + + return means, vars_, mll, t_train, t_pred + +def run(n, dim=1): + hp = dict(lengthscale=1.0, outputscale=1.0, noise=0.01, offset=math.log(2)) + X, y = make_data(n, dim) + Xs = make_test(200, dim) + + rows = {} + + try: + rows["CUDA"] = bench_custom(X, y, Xs, hp) + except ImportError: + pass + + rows["GPyTorch"] = bench_gpytorch(X, y, Xs, hp) + + ref_means = rows["GPyTorch"][0] + ref_vars = rows["GPyTorch"][1] + + print(f"\n{'='*84}") + print(f" n={n} dim={dim}") + print(f"{'='*84}") + print(f" {'':22s} {'train':>8s} {'predict':>8s} {'total':>8s}" + f" {'MLL':>12s} {'|err_mean|':>10s} {'|err_var|':>10s}") + + for label, (m_, v_, mll, tt, tp) in rows.items(): + me = np.max(np.abs(m_ - ref_means)) + ve = np.max(np.abs(v_ - ref_vars)) + print(f" {label:22s} {tt:7.4f}s {tp:7.4f}s {tt+tp:7.4f}s" + f" {mll:12.4f} {me:9.2e} {ve:9.2e}") + + print(f" {'':22s} (GPyTorch: exact Cholesky forced for n>800, errors vs GPyTorch)") + +if __name__ == "__main__": + sizes = [100, 200, 1000, 2000] + dims = [1, 5] + if len(sys.argv) > 1: + sizes = [int(s) for s in sys.argv[1:]] + for n in sizes: + for d in dims: + run(n, dim=d) diff --git a/tests/test_custom_gp_optim.py b/tests/test_custom_gp_optim.py new file mode 100644 index 0000000000..494b758dec --- /dev/null +++ b/tests/test_custom_gp_optim.py @@ -0,0 +1,171 @@ +""" +mini-batch hyperparameter optimization: custom CUDA GP vs GPyTorch + +Runs Adam on the same batches from the same starting point +""" + +import math, sys +import numpy as np +import torch +import gpytorch +from gpytorch.kernels import MaternKernel, PolynomialKernel, AdditiveKernel, ScaleKernel + +sys.path.insert(0, ".") +from pufferlib.gp_torch import GaussianProcess + +N = 300 +BATCH = 40 +STEPS = 60 +LR = 0.05 +SEED = 42 + + +def make_data(n, seed): + rng = np.random.RandomState(seed) + X = rng.uniform(-5, 5, (n, 1)).astype(np.float64) + y = (np.sin(X[:, 0]) + 0.1 * rng.randn(n)).astype(np.float64) + return X, y + + +class ExactGP(gpytorch.models.ExactGP): + def __init__(self, tx, ty, lik): + super().__init__(tx, ty, lik) + self.mean_module = gpytorch.means.ZeroMean() + matern = MaternKernel(nu=1.5, ard_num_dims=1) + linear = PolynomialKernel(power=1) + self.covar_module = ScaleKernel(AdditiveKernel(linear, matern)) + + def forward(self, x): + return gpytorch.distributions.MultivariateNormal( + self.mean_module(x), self.covar_module(x)) + + +def main(): + X, y = make_data(N, SEED) + Xt, yt = torch.from_numpy(X).double(), torch.from_numpy(y).double() + + rng = np.random.RandomState(SEED + 1) + batches = [rng.choice(N, BATCH, replace=False) for _ in range(STEPS)] + + # shared initial constrained hyperparameters + ell0, sf0, noise0, off0 = 1.0, 1.0, 0.01, 1.0 + + # -- our model --------------------------------------------------------- + gp = GaussianProcess(dim=1, capacity=BATCH, + lengthscale=ell0, outputscale=sf0, noise=noise0, offset=off0, + use_cuda=False) + opt_ours = torch.optim.Adam(gp.parameters(), lr=LR) + gp.train() + + # -- GPyTorch model ---------------------------------------------------- + lik = gpytorch.likelihoods.GaussianLikelihood() + gpyt = ExactGP(Xt, yt, lik) + gpyt.double() + # Match initial constrained hyperparameters + gpyt.covar_module.base_kernel.kernels[1].lengthscale = ell0 + gpyt.covar_module.outputscale = sf0 + lik.noise = noise0 + gpyt.covar_module.base_kernel.kernels[0].raw_offset.data.fill_( + math.log(math.expm1(off0))) + + mll_fn = gpytorch.mlls.ExactMarginalLogLikelihood(lik, gpyt) + opt_gpyt = torch.optim.Adam(gpyt.parameters(), lr=LR) + gpyt.train(); lik.train() + + # -- step-by-step comparison ------------------------------------------- + print(f"\n{'step':>4} {'MLL (ours)':>11} {'MLL (gpyt)':>11} {'|dMLL|':>8}" + f" {'ell_ours':>9} {'ell_gpyt':>9} {'noise_ours':>11} {'noise_gpyt':>11}") + print("-" * 102) + + for step, idx in enumerate(batches): + X_b, y_b = Xt[idx], yt[idx] + + # ours: fit loads batch + Cholesky; mll(recompute=False) skips a second one + gp.fit(X_b.numpy(), y_b.numpy()) + opt_ours.zero_grad() + loss_ours = -gp.mll(recompute=False) + loss_ours.backward() + mll_ours = -loss_ours.item() + opt_ours.step() + + # gpytorch: update stored training data to the batch, then compute MLL + gpyt.set_train_data(inputs=X_b, targets=y_b, strict=False) + opt_gpyt.zero_grad() + with gpytorch.settings.max_cholesky_size(BATCH + 1): + loss_gpyt = -mll_fn(gpyt(X_b), y_b) + loss_gpyt.backward() + mll_gpyt = -loss_gpyt.item() + opt_gpyt.step() + + if step % 10 == 0 or step == STEPS - 1: + ell_ours = float(gp.lengthscale[0]) + ell_g = gpyt.covar_module.base_kernel.kernels[1].lengthscale.flatten()[0].item() + print(f"{step:>4} {mll_ours:>11.5f} {mll_gpyt:>11.5f} " + f"{abs(mll_ours - mll_gpyt):>8.2e}" + f" {ell_ours:>9.4f} {ell_g:>9.4f}" + f" {gp.noise:>11.6f} {lik.noise.item():>11.6f}") + + # -- final hyperparameter comparison ----------------------------------- + ell_ours = float(gp.lengthscale[0]) + ell_gpyt = gpyt.covar_module.base_kernel.kernels[1].lengthscale.flatten()[0].item() + ours_vals = [ell_ours, gp.outputscale, gp.noise, gp.offset] + gpyt_vals = [ + ell_gpyt, + gpyt.covar_module.outputscale.item(), + lik.noise.item(), + gpyt.covar_module.base_kernel.kernels[0].offset.item(), + ] + print() + print("Final hyperparameters:") + for label, ov, gv in zip(["lengthscale", "outputscale", "noise", "offset"], + ours_vals, gpyt_vals): + print(f" {label:12s} ours={ov:.6f} gpyt={gv:.6f} |d|={abs(ov - gv):.2e}") + + # -- gradient agreement at a fixed point (sanity check) ---------------- + print() + print("Gradient agreement check (fresh models, same batch, same params):") + idx0 = batches[0] + X_b0, y_b0 = Xt[idx0], yt[idx0] + + # fresh ours + gp2 = GaussianProcess(dim=1, capacity=BATCH, + lengthscale=ell0, outputscale=sf0, noise=noise0, offset=off0, + use_cuda=False) + gp2.train() + gp2.fit(X_b0.numpy(), y_b0.numpy()) + (-gp2.mll(recompute=False)).backward() + # raw_lengthscale is (1,); flatten all param grads into a list + grads_ours = [] + for p in gp2.parameters(): + grads_ours.extend(p.grad.detach().flatten().tolist()) + + # fresh gpytorch + lik2 = gpytorch.likelihoods.GaussianLikelihood() + gpyt2 = ExactGP(Xt, yt, lik2) + gpyt2.double() + gpyt2.covar_module.base_kernel.kernels[1].lengthscale = ell0 + gpyt2.covar_module.outputscale = sf0 + lik2.noise = noise0 + gpyt2.covar_module.base_kernel.kernels[0].raw_offset.data.fill_( + math.log(math.expm1(off0))) + gpyt2.train(); lik2.train() + mll_fn2 = gpytorch.mlls.ExactMarginalLogLikelihood(lik2, gpyt2) + gpyt2.set_train_data(inputs=X_b0, targets=y_b0, strict=False) + with gpytorch.settings.max_cholesky_size(BATCH + 1): + (-mll_fn2(gpyt2(X_b0), y_b0)).backward() + + # collect gpytorch gradients in the same param order as ours + gpyt_grads = [ + gpyt2.covar_module.base_kernel.kernels[1].raw_lengthscale.grad.flatten()[0].item(), + gpyt2.covar_module.raw_outputscale.grad.item(), + lik2.noise_covar.raw_noise.grad.item(), + gpyt2.covar_module.base_kernel.kernels[0].raw_offset.grad.item(), + ] + + labels = ["raw_ell_0", "raw_outputscale", "raw_noise", "raw_offset"] + for label, og, gg in zip(labels, grads_ours, gpyt_grads): + print(f" {label:16s} ours={og:+.6f} gpyt={gg:+.6f} |d|={abs(og - gg):.2e}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_sweep.py b/tests/test_sweep.py index 982d9d91cb..f02ddadf55 100644 --- a/tests/test_sweep.py +++ b/tests/test_sweep.py @@ -76,7 +76,7 @@ def test_sweep(args): target_metric = args['sweep']['metric'] scores, costs = [], [] - for i in range(args['max_runs']): + for i in range(args['sweep']['max_runs']): seed = time.time_ns() & 0xFFFFFFFF random.seed(seed) np.random.seed(seed) @@ -164,19 +164,26 @@ def visualize(args): if __name__ == '__main__': + import argparse + import sys from pufferlib import pufferl - parser = pufferl.make_parser() - parser.add_argument('--task', type=str, default='linear', help='Task to optimize') - parser.add_argument('--vis-path', type=str, default='', + custom_parser = argparse.ArgumentParser(add_help=False) + custom_parser.add_argument('--task', type=str, default='linear', help='Task to optimize') + custom_parser.add_argument('--vis-path', type=str, default='', help='Set to visualize a saved sweep') - parser.add_argument('--data-path', type=str, default='sweep', + custom_parser.add_argument('--data-path', type=str, default='sweep', help='Used for testing hparam algorithms') + custom_args, remaining_argv = custom_parser.parse_known_args() + sys.argv = [sys.argv[0]] + remaining_argv - args = pufferl.load_config('default', parser=parser) + args = pufferl.load_config('default') + args['task'] = custom_args.task + args['vis_path'] = custom_args.vis_path + args['data_path'] = custom_args.data_path + + test_sweep(args) if args['vis_path']: visualize(args) exit(0) - - test_sweep(args)