diff --git a/doc/changes/dev/13909.other.rst b/doc/changes/dev/13909.other.rst new file mode 100644 index 00000000000..a2131a6ade9 --- /dev/null +++ b/doc/changes/dev/13909.other.rst @@ -0,0 +1,4 @@ +Batch and vectorise classifier estimation and scoring in +:meth:`mne.decoding.GeneralizingEstimator.score` for ``scoring=None``, +``"accuracy"``, ``"balanced_accuracy"`` and ``"roc_auc"``, by +:newcontrib:`Mathias Sablé-Meyer`. diff --git a/doc/changes/names.inc b/doc/changes/names.inc index ba0c60e79e5..a8a0726e9d7 100644 --- a/doc/changes/names.inc +++ b/doc/changes/names.inc @@ -213,6 +213,7 @@ .. _Martin Luessi: https://github.com/mluessi .. _Martin Oberg: https://github.com/obergmartin .. _Martin Schulz: https://github.com/marsipu +.. _Mathias Sablé-Meyer: https://s-m.ac/ .. _Mathieu Scheltienne: https://github.com/mscheltienne .. _Mathurin Massias: https://mathurinm.github.io/ .. _Mats van Es: https://github.com/matsvanes diff --git a/mne/decoding/search_light.py b/mne/decoding/search_light.py index 81699ecd5ba..fa7f5e2f882 100644 --- a/mne/decoding/search_light.py +++ b/mne/decoding/search_light.py @@ -5,6 +5,7 @@ import logging import numpy as np +from scipy.stats import rankdata from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone from sklearn.metrics import check_scoring from sklearn.preprocessing import LabelEncoder @@ -680,14 +681,15 @@ def _gl_transform(estimators, X, method, pb): Returns ------- - Xt : array, shape (n_samples, n_slices) - The transformed values generated by each estimator. - """ + Xt : array, shape (n_samples, n_estimators, n_slices[, n_classes]) + Predictions for each (estimator, slice) pair. The trailing axis is + present when ``method`` returns multi-output (e.g. ``predict_proba``). + """ # noqa: E501 n_sample, n_iter = X.shape[0], X.shape[-1] + # stack generalized data for faster prediction + X_stack = X.transpose(np.r_[0, X.ndim - 1, range(1, X.ndim - 1)]) + X_stack = X_stack.reshape((n_sample * n_iter,) + X_stack.shape[2:]) for ii, est in enumerate(estimators): - # stack generalized data for faster prediction - X_stack = X.transpose(np.r_[0, X.ndim - 1, range(1, X.ndim - 1)]) - X_stack = X_stack.reshape(np.r_[n_sample * n_iter, X_stack.shape[2:]]) transform = getattr(est, method) _y_pred = transform(X_stack) # unstack generalizations @@ -715,6 +717,100 @@ def _gl_init_pred(y_pred, X, n_train): return y_pred +def _resolve_scoring_for_classifier(scoring, estimators): + """Promote scoring=None to 'accuracy' for default estimator. + + scoring=None goes through sklearn's ``_PassthroughScorer``, which delegates + to ``estimator.score(X, y)``. For a classifier that inherits + ``ClassifierMixin.score`` unchanged, that's accuracy — which we can batch. + We compare ``type(est).score.__qualname__`` rather than ``.__name__`` + because the bare name is "score" regardless of the defining class. A bare + method has qualname "ClassifierMixin.score"; any override resolves to + ".score", which we leave untouched. + """ + if len(estimators) and getattr(scoring, "_score_func", None) is None: + qname = getattr(type(estimators[0]).score, "__qualname__", "") + if qname == "ClassifierMixin.score": + scoring = check_scoring(estimators[0], "accuracy") + return scoring + + +def _detect_response_method(scoring): + """Return (response_method, can_batch) for ``scoring``. + + If we can batch the estimator (one of predict, predict_proba, + decision_function, or a tuple of those) then we return that, and + can_batch is True if additionally _score_func is not None. + Otherwise we return None (for the response_method) and False + """ + score_func = getattr(scoring, "_score_func", None) + rm = getattr(scoring, "_response_method", None) + valid = {"predict", "predict_proba", "decision_function"} + if rm == "default": + response_method = "predict" + elif isinstance(rm, str) and rm in valid: + response_method = rm + elif isinstance(rm, tuple) and all(m in valid for m in rm): + response_method = rm + else: + response_method = None + can_batch = score_func is not None and response_method is not None + return response_method, can_batch + + +def _make_batched_score(score_func, response_method, method, y, sign, kwargs): + """Return a callable ``y_pred``->score per task, None if not recognised. + + The returned callable expects ``y_pred`` of shape + ``(n_sample, n_train, n_iter)`` and returns shape ``(n_train, n_iter)``. + Falls back to None for any scorer with non-default ``kwargs`` or + multi-target ``y``, both of which require a slice-by-slice loop. + """ + if kwargs or y.ndim != 1: + return None + name = getattr(score_func, "__name__", "") + + if name == "accuracy_score" and response_method == "predict": + + def batched_score(y_pred): + return sign * (y_pred == y[:, None, None]).mean(axis=0) + + return batched_score + + if name == "balanced_accuracy_score" and response_method == "predict": + classes = np.unique(y) + + def batched_score(y_pred): + return sign * np.stack( + [(y_pred[y == c] == c).mean(axis=0) for c in classes] + ).mean(axis=0) + + return batched_score + + if name == "roc_auc_score" and method in ("predict_proba", "decision_function"): + classes = np.unique(y) + if len(classes) != 2: # multi-class needs ovr/ovo; defer + return None + pos = y == classes[1] + n_pos, n_neg = int(pos.sum()), int((~pos).sum()) + if not (n_pos and n_neg): # degenerate folds raise downstream in sklearn + return None + + def batched_score(y_pred): + # Mann-Whitney U identity with average-rank tie correction. + # Equivalent to sklearn's roc_auc within floating point precision, + # but different computation. + ranks = rankdata(y_pred, method="average", axis=0) + return ( + sign + * (ranks[pos].sum(axis=0) - n_pos * (n_pos + 1) / 2.0) + / (n_pos * n_neg) + ) + + return batched_score + return None + + def _gl_score(estimators, scoring, X, y, pb): """Score GeneralizingEstimator in parallel. @@ -743,17 +839,63 @@ def _gl_score(estimators, scoring, X, y, pb): """ # FIXME: The level parallelization may be a bit high, and might be memory # consuming. Perhaps need to lower it down to the loop across X slices. - score_shape = [len(estimators), X.shape[-1]] - for jj in range(X.shape[-1]): - for ii, est in enumerate(estimators): - _score = scoring(est, X[..., jj], y) - # Initialize array of predictions on the first score iteration - if (ii == 0) and (jj == 0): - dtype = type(_score) - score = np.zeros(score_shape, dtype) - score[ii, jj, ...] = _score - - pb.update(jj * len(estimators) + ii + 1) + n_iter = X.shape[-1] + n_train = len(estimators) + score_shape = [n_train, n_iter] + + scoring = _resolve_scoring_for_classifier(scoring, estimators) + response_method, can_batch = _detect_response_method(scoring) + + # If we can't batch, fall back to a simple nested loop for scoring + if not can_batch: + for jj in range(n_iter): + for ii, est in enumerate(estimators): + _score = scoring(est, X[..., jj], y) + if (ii == 0) and (jj == 0): + score = np.zeros(score_shape, type(_score)) + score[ii, jj, ...] = _score + pb.update(jj * n_train + ii + 1) + return score + + # Resolve a single method name for _gl_transform: pick the first available + # if response_method is a tuple (e.g. roc_auc). + if isinstance(response_method, str): + method = response_method + else: + for m in response_method: + if hasattr(estimators[0], m): + method = m + break + + # Batch all predictions through _gl_transform. y_pred shape: + # (n_sample, n_train, n_iter) or (n_sample, n_train, n_iter, n_classes). + y_pred = _gl_transform(estimators, X, method, pb) + + # Binary predict_proba: take the positive-class column to match sklearn + # scorer expectations for binary problems. + if method == "predict_proba" and y_pred.ndim == 4 and y_pred.shape[-1] == 2: + y_pred = y_pred[..., 1] + + # `scoring._kwargs or {}` also guards score_func(..., **kwargs) against + # scoring._kwargs being None. + score_func = scoring._score_func + sign = scoring._sign + kwargs = scoring._kwargs or {} + batched_score = _make_batched_score( + score_func, response_method, method, y, sign, kwargs + ) + + # Reduce predictions to scores. Vectorised if we recognised the scorer, + # otherwise nested loops over (estimator, slice). + if batched_score is not None: + score = batched_score(y_pred) + else: + for ii in range(n_train): + for jj in range(n_iter): + _score = sign * score_func(y, y_pred[:, ii, jj], **kwargs) + if (ii == 0) and (jj == 0): + score = np.zeros(score_shape, type(_score)) + score[ii, jj, ...] = _score return score diff --git a/mne/decoding/tests/test_search_light.py b/mne/decoding/tests/test_search_light.py index 56d239c1bcc..1ed96c034d1 100644 --- a/mne/decoding/tests/test_search_light.py +++ b/mne/decoding/tests/test_search_light.py @@ -3,11 +3,12 @@ # Copyright the MNE-Python contributors. import platform +from functools import wraps from inspect import signature import numpy as np import pytest -from numpy.testing import assert_array_equal, assert_equal +from numpy.testing import assert_allclose, assert_array_equal, assert_equal sklearn = pytest.importorskip("sklearn") @@ -15,7 +16,7 @@ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.ensemble import BaggingClassifier from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge -from sklearn.metrics import make_scorer, roc_auc_score +from sklearn.metrics import check_scoring, make_scorer, roc_auc_score from sklearn.model_selection import cross_val_predict from sklearn.multiclass import OneVsRestClassifier from sklearn.pipeline import make_pipeline @@ -246,7 +247,11 @@ def test_generalization_light(metadata_routing): gl.fit(X, y) score = gl.score(X, y) auc = roc_auc_score(y, gl.estimators_[0].predict_proba(X[..., 0])[..., 1]) - assert_equal(score[0, 0], auc) + + # The rank identity implemented when batching gives the same AUC as sklearn + # within floating point precision, but implements it with different + # operations. A bit-exact match would need a loop, defeating the batching. + assert_allclose(score[0, 0], auc) for scoring in ["foo", 999]: gl = GeneralizingEstimator(logreg, scoring=scoring) @@ -267,7 +272,8 @@ def test_generalization_light(metadata_routing): [roc_auc_score(y - 1, _y_pred) for _y_pred in _y_preds] for _y_preds in gl.decision_function(X).transpose(1, 2, 0) ] - assert_array_equal(score, manual_score) + # allclose instead of equal: see above, batching roc_auc forces this. + assert_allclose(score, manual_score) # n_jobs gl = GeneralizingEstimator(logreg, n_jobs=2) @@ -293,6 +299,78 @@ def test_generalization_light(metadata_routing): assert_array_equal(y_preds[0], y_preds[1]) +@pytest.mark.parametrize( + "scoring, est_name, method", + [ + (None, "logreg", "predict"), + ("accuracy", "logreg", "predict"), + ("balanced_accuracy", "logreg", "predict"), + ("roc_auc", "logreg", "decision_function"), + ("neg_log_loss", "logreg", "predict_proba"), + (None, "ridge", "predict"), + ("roc_auc_multiclass", "logreg", "predict_proba"), + ("accuracy_kwargs", "logreg", "predict"), + ], +) +def test_gl_score_branches(scoring, est_name, method): + """Test _gl_score against its own can_batch=False nested-loop fallback.""" + n_trials, n_sensors, n_iter = 12, 3, 4 + rng = np.random.RandomState(0) + X = rng.randn(n_trials, n_sensors, n_iter) + y = rng.randint(0, 3 if scoring == "roc_auc_multiclass" else 2, n_trials) + per_slice = scoring in ("neg_log_loss", "roc_auc_multiclass", "accuracy_kwargs") + # liblinear is binary-only, switch to lbfgs for the multi-class case. + solver = "lbfgs" if scoring == "roc_auc_multiclass" else "liblinear" + if scoring == "roc_auc_multiclass": + scoring = make_scorer( + roc_auc_score, response_method="predict_proba", multi_class="ovr" + ) + elif scoring == "accuracy_kwargs": + # start from the default scorer but add a kwarg to prevent batching + acc_func = check_scoring(LogisticRegression(), "accuracy")._score_func + scoring = make_scorer(acc_func, normalize=False) + est = Ridge() if est_name == "ridge" else LogisticRegression(solver=solver) + gl = GeneralizingEstimator(est, scoring=scoring).fit(X, y) + + # Measure batching: count pred/call scores. Wraps `fn` calls so they append + # to a bucket; @wraps preserves __name__ (needed as _gl_score matches it) + def counting(fn, bucket): + @wraps(fn) + def wrapped(*a, **k): + bucket.append(1) + return fn(*a, **k) + + return wrapped + + # First we count calls to scorer + score_calls = [] + scorer = check_scoring(est, scoring) + if getattr(scorer, "_score_func", None) is not None: + scorer._score_func = counting(scorer._score_func, score_calls) + + # Now we count calls to the estimator that _gl_score will call (hardcoded) + pred_calls = [] + for e in gl.estimators_: + setattr(e, method, counting(getattr(e, method), pred_calls)) + + # Batched path: assert call counts immediately so the buckets only reflect + # this run (the reference run below would otherwise add to them). + gl.scoring = scorer + actual = gl.score(X, y) + assert len(pred_calls) == (n_iter if est_name != "ridge" else n_iter**2) + assert len(score_calls) == (n_iter**2 if per_slice else 0) + + # Reference: force can_batch=False. _score_func set (non-None) bypasses the + # qname coercion; missing _response_method makes can_batch False. + def force_fallback(e, X, y): + return scorer(e, X, y) + + force_fallback._score_func = id + gl.scoring = force_fallback + expected = gl.score(X, y) + assert_allclose(actual, expected) + + @pytest.mark.parametrize( "n_jobs, verbose", [(1, False), (2, False), (1, True), (2, "info")] )