diff --git a/.all-contributorsrc b/.all-contributorsrc index 9821a70c4..a9b5f0657 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -245,7 +245,7 @@ "name": "Khush Agrawal", "profile": "https://github.com/Khushmagrawal", "contributions": [ - "code", + "code" ] }, { diff --git a/docs/source/api_reference/regression.rst b/docs/source/api_reference/regression.rst index 95fa14a42..137c319a6 100644 --- a/docs/source/api_reference/regression.rst +++ b/docs/source/api_reference/regression.rst @@ -141,6 +141,7 @@ Formally, these algorithms are reduction algorithms, to tabular regression. BaggingRegressor NGBoostRegressor + VotingProbaRegressor .. currentmodule:: skpro.regression.cyclic_boosting diff --git a/skpro/regression/ensemble/__init__.py b/skpro/regression/ensemble/__init__.py index e93fa1a84..e981ff93e 100644 --- a/skpro/regression/ensemble/__init__.py +++ b/skpro/regression/ensemble/__init__.py @@ -1,7 +1,8 @@ -"""Natural Gradient Boosting Regressor models.""" +"""Ensemble probabilistic regressors.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) from skpro.regression.ensemble._bagging import BaggingRegressor from skpro.regression.ensemble._ngboost import NGBoostRegressor +from skpro.regression.ensemble._voting import VotingProbaRegressor -__all__ = ["BaggingRegressor", "NGBoostRegressor"] +__all__ = ["BaggingRegressor", "NGBoostRegressor", "VotingProbaRegressor"] diff --git a/skpro/regression/ensemble/_voting.py b/skpro/regression/ensemble/_voting.py new file mode 100644 index 000000000..339d432da --- /dev/null +++ b/skpro/regression/ensemble/_voting.py @@ -0,0 +1,173 @@ +"""Voting ensemble of heterogeneous probabilistic regressors.""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) + +__author__ = ["Ashish-Kumar-Dash"] +__all__ = ["VotingProbaRegressor"] + +from skpro.base import BaseMetaEstimator +from skpro.distributions.mixture import Mixture +from skpro.regression.base import BaseProbaRegressor + + 
class VotingProbaRegressor(BaseMetaEstimator, BaseProbaRegressor):
    """Voting ensemble of heterogeneous probabilistic regressors.

    Fits multiple probabilistic regressors on the same training data.
    On ``predict_proba``, returns a ``Mixture`` distribution of the
    component predictions, with user-specified or uniform weights.

    Generalizes ``sklearn``'s ``VotingRegressor`` to the probabilistic
    regression setting, where predictions are full distributions
    rather than point predictions.

    Parameters
    ----------
    estimators : list of (str, estimator) tuples or list of estimators
        The ensemble members. Each estimator must be a descendant of
        ``BaseProbaRegressor``.
        If a plain list of estimators is passed, names are generated
        automatically from class names.
    weights : array-like of float, optional, default=None
        Mixture weights for the component predictions.
        If None, uniform weights are used.
        If given, must contain exactly one weight per estimator,
        otherwise ``fit`` raises ``ValueError``.
        Weights are normalized to sum to 1 internally by ``Mixture``.

    Attributes
    ----------
    estimators_ : list of (str, estimator) tuples
        Fitted clones of the estimators passed in ``estimators``.

    Examples
    --------
    >>> from skpro.regression.ensemble import VotingProbaRegressor
    >>> from skpro.regression.residual import ResidualDouble
    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn.datasets import load_diabetes
    >>> from sklearn.model_selection import train_test_split
    >>>
    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)
    >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
    >>>
    >>> reg1 = ResidualDouble(LinearRegression())
    >>> reg2 = ResidualDouble(LinearRegression())
    >>>
    >>> voter = VotingProbaRegressor(
    ...     estimators=[("r1", reg1), ("r2", reg2)],
    ... )
    >>> voter.fit(X_train, y_train)
    VotingProbaRegressor(...)
    >>> y_pred = voter.predict_proba(X_test)
    """

    _tags = {
        "object_type": "regressor_proba",
        "estimator_type": "regressor_proba",
        "named_object_parameters": "_estimators",
        "fitted_named_object_parameters": "estimators_",
        # capability defaults; narrowed per instance in __init__
        "capability:missing": True,
        "capability:survival": True,
    }

    def __init__(self, estimators, weights=None):
        self.estimators = estimators
        self.weights = weights

        super().__init__()

        # set capability tags as AND over all component estimators
        # ensemble can only handle missing/survival if ALL components can
        tags_to_and = ["capability:missing", "capability:survival"]
        est_list = self._estimators
        for tag in tags_to_and:
            tag_val = all(est.get_tag(tag, False) for _, est in est_list)
            self.set_tags(**{tag: tag_val})

    @property
    def _estimators(self):
        """Return ``estimators`` coerced to a list of (name, estimator) tuples."""
        return self._coerce_to_named_object_tuples(self.estimators, clone=False)

    @_estimators.setter
    def _estimators(self, value):
        """Write through to the public ``estimators`` parameter."""
        self.estimators = value

    def _fit(self, X, y, C=None):
        """Fit all component regressors to training data.

        Parameters
        ----------
        X : pandas DataFrame
            feature instances to fit regressor to
        y : pandas DataFrame, must be same length as X
            labels to fit regressor to
        C : pandas DataFrame, optional (default=None)
            censoring information for survival analysis

        Returns
        -------
        self : reference to self

        Raises
        ------
        ValueError
            If ``weights`` is not None and its length differs from the
            number of estimators.
        """
        named_estimators = self._estimators

        # fail fast, before any (potentially expensive) component fit:
        # a weight vector of the wrong length cannot be matched to components
        if self.weights is not None:
            n_weights = len(self.weights)
            n_estimators = len(named_estimators)
            if n_weights != n_estimators:
                raise ValueError(
                    "Number of `estimators` and weights must be equal; "
                    f"got {n_weights} weights, {n_estimators} estimators"
                )

        # each component is cloned so the estimators passed in stay unfitted
        self.estimators_ = [
            (name, est.clone().fit(X, y, C=C)) for name, est in named_estimators
        ]

        return self

    def _predict_proba(self, X):
        """Predict distribution over labels for data from features.

        Returns a ``Mixture`` of predictions from all fitted regressors.

        Parameters
        ----------
        X : pandas DataFrame, must have same columns as X in ``fit``
            data to predict labels for

        Returns
        -------
        y_pred : skpro ``Mixture`` distribution, same length as ``X``
            mixture of probabilistic predictions from all component regressors
        """
        y_probas = [(name, est.predict_proba(X)) for name, est in self.estimators_]

        return Mixture(distributions=y_probas, weights=self.weights)

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return ``"default"`` set.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            ``MyClass(**params)`` or ``MyClass(**params[i])`` creates a valid test
            instance.
            ``create_test_instance`` uses the first (or only) dictionary in ``params``
        """
        from sklearn.linear_model import LinearRegression

        from skpro.regression.residual import ResidualDouble

        reg1 = ResidualDouble(LinearRegression())
        reg2 = ResidualDouble(LinearRegression())

        # named estimators, uniform weights
        params1 = {
            "estimators": [("r1", reg1), ("r2", reg2)],
        }
        # named estimators, explicit non-uniform weights
        params2 = {
            "estimators": [("r1", reg1), ("r2", reg2)],
            "weights": [0.7, 0.3],
        }
        # plain list of estimators, names auto-generated
        params3 = {
            "estimators": [reg1, reg2],
        }

        return [params1, params2, params3]