automl · mfeurer · Aug 4, 2022 · Jun 17, 2022 · Jun 17, 2022 · Jun 17, 2022
diff --git a/autosklearn/automl.py b/autosklearn/automl.py
@@ -79,7 +79,7 @@
     AbstractMultiObjectiveEnsemble,
 )
 from autosklearn.ensembles.ensemble_selection import EnsembleSelection
-from autosklearn.ensembles.singlebest_ensemble import SingleBest
+from autosklearn.ensembles.singlebest_ensemble import SingleBestFromRunhistory
 from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash
 from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings
 from autosklearn.evaluation.train_evaluator import TrainEvaluator, _fit_with_budget
@@ -1666,7 +1666,7 @@ def _load_best_individual_model(self):
             return None
 
         # SingleBest contains the best model found by AutoML
-        ensemble = SingleBest(
+        ensemble = SingleBestFromRunhistory(
             metrics=self._metrics,
             task_type=self._task,
             seed=self._seed,
@@ -1693,13 +1693,12 @@ def _load_pareto_set(self) -> Sequence[VotingClassifier | VotingRegressor]:
             raise ValueError("Pareto set only available if ensemble can be loaded.")
 
         if isinstance(self.ensemble_, AbstractMultiObjectiveEnsemble):
-            pareto_set = self.ensemble_.get_pareto_set()
+            pareto_set = self.ensemble_.pareto_set
         else:
             self._logger.warning(
                 "Pareto set not available for single objective ensemble "
                 "method. The Pareto set will only include the single ensemble "
-                "constructed by %s",
-                type(self.ensemble_),
+                f"constructed by {type(self.ensemble_)},"
             )
             pareto_set = [self.ensemble_]
 
@@ -2148,7 +2147,7 @@ def show_models(self) -> dict[int, Any]:
 
         ensemble_dict = {}
 
-        if self._ensemble_class is not None:
+        if self._ensemble_class is None:
             warnings.warn(
                 "No models in the ensemble. Kindly provide an ensemble class."
             )

diff --git a/autosklearn/ensembles/__init__.py b/autosklearn/ensembles/__init__.py
@@ -1,10 +1,18 @@
 from .abstract_ensemble import AbstractEnsemble, AbstractMultiObjectiveEnsemble
 from .ensemble_selection import EnsembleSelection
-from .singlebest_ensemble import SingleBest
+from .multiobjective_dummy_ensemble import MultiObjectiveDummyEnsemble
+from .singlebest_ensemble import (
+    SingleBest,
+    SingleBestFromRunhistory,
+    SingleModelEnsemble,
+)
 
 __all__ = [
     "AbstractEnsemble",
     "AbstractMultiObjectiveEnsemble",
     "EnsembleSelection",
+    "SingleBestFromRunhistory",
     "SingleBest",
+    "SingleModelEnsemble",
+    "MultiObjectiveDummyEnsemble",
 ]
diff --git a/autosklearn/ensembles/abstract_ensemble.py b/autosklearn/ensembles/abstract_ensemble.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Dict, List, Sequence, Tuple, Union
+from typing import Any, Dict, List, Sequence, Tuple, Union
 
 import numpy as np
 
@@ -18,19 +18,29 @@ def __init__(
         self,
         task_type: int,
         metrics: Sequence[Scorer] | Scorer,
-        random_state: int | np.random.RandomState | None,
         backend: Backend,
+        random_state: int | np.random.RandomState | None = None,
     ):
         pass
 
+    def __getstate__(self) -> Dict[str, Any]:
+        # A user-defined metric cannot be
+        # serialized reliably: after a pickle
+        # dump, the metric will not be the same
+        # object as the one defined in __main__.
+        # The metric is not used in the
+        # EnsembleSelection, so dropping it from
+        # the pickled state should be fine.
+        return {key: value for key, value in self.__dict__.items() if key != "metrics"}
+
     @abstractmethod
     def fit(
         self,
         base_models_predictions: np.ndarray | List[np.ndarray],
-        X_data: SUPPORTED_FEAT_TYPES,
         true_targets: np.ndarray,
         model_identifiers: List[Tuple[int, int, float]],
         runs: Sequence[Run],
+        X_data: SUPPORTED_FEAT_TYPES | None = None,
     ) -> "AbstractEnsemble":
         """Fit an ensemble given predictions of base models and targets.
 
@@ -79,7 +89,7 @@ def predict(
 
         Returns
         -------
-        array : [n_data_points]
+        np.ndarray
         """
         pass
 
@@ -97,7 +107,7 @@ def get_models_with_weights(
 
         Returns
         -------
-        array : [(weight_1, model_1), ..., (weight_n, model_n)]
+        List[Tuple[float, BasePipeline]]
         """
 
     @abstractmethod
@@ -115,7 +125,7 @@ def get_identifiers_with_weights(
 
         Returns
         -------
-        array : [(identifier_1, weight_1), ..., (identifier_n, weight_n)]
+        List[Tuple[Tuple[int, int, float], float]]
         """
 
     @abstractmethod
@@ -133,12 +143,25 @@ def get_selected_model_identifiers(self) -> List[Tuple[int, int, float]]:
     def get_validation_performance(self) -> float:
         """Return validation performance of ensemble.
 
-        Return
-        ------
+        Returns
+        -------
         float
         """
 
 
 class AbstractMultiObjectiveEnsemble(AbstractEnsemble):
-    def get_pareto_set(self) -> Sequence[AbstractEnsemble]:
-        pass
+    @property
+    @abstractmethod
+    def pareto_set(self) -> Sequence[AbstractEnsemble]:
+        """Get a sequence of ensembles that are on the pareto front
+
+        Raises
+        ------
+        SklearnNotFittedError
+            If ``fit`` has not been called and the pareto set does not exist yet
+
+        Returns
+        -------
+        Sequence[AbstractEnsemble]
+        """
+        ...
diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Sequence, Tuple, Union
 
 import random
 import warnings
@@ -23,11 +23,11 @@ def __init__(
         self,
         task_type: int,
         metrics: Sequence[Scorer] | Scorer,
-        random_state: Optional[Union[int, np.random.RandomState]],
         backend: Backend,
         ensemble_size: int = 50,
         bagging: bool = False,
         mode: str = "fast",
+        random_state: int | np.random.RandomState | None = None,
     ) -> None:
         """An ensemble of selected algorithms
 
@@ -43,14 +43,6 @@ def __init__(
             The metric used to evaluate the models. If multiple metrics are passed,
             ensemble selection only optimizes for the first
 
-        random_state: Optional[int | RandomState] = None
-            The random_state used for ensemble selection.
-
-            * None - Uses numpy's default RandomState object
-            * int - Successive calls to fit will produce the same results
-            * RandomState - Truly random, each call to fit will produce
-              different results, even with the same object.
-
         backend : Backend
             Gives access to the backend of Auto-sklearn. Not used by Ensemble Selection.
 
@@ -62,6 +54,14 @@ def __init__(
             * 'slow' - The original method used in Rich Caruana's ensemble selection.
             * 'fast' - A faster version of Rich Caruanas' ensemble selection.
 
+        random_state: int | RandomState | None = None
+            The random_state used for ensemble selection.
+
+            * None - Uses numpy's default RandomState object
+            * int - Successive calls to fit will produce the same results
+            * RandomState - Truly random, each call to fit will produce
+              different results, even with the same object.
+
         References
         ----------
         | Ensemble selection from libraries of models
@@ -92,23 +92,13 @@ def __init__(
         # https://scikit-learn.org/stable/common_pitfalls.html#controlling-randomness
         self.random_state = random_state
 
-    def __getstate__(self) -> Dict[str, Any]:
-        # Cannot serialize a metric if
-        # it is user defined.
-        # That is, if doing pickle dump
-        # the metric won't be the same as the
-        # one in __main__. we don't use the metric
-        # in the EnsembleSelection so this should
-        # be fine
-        return {key: value for key, value in self.__dict__.items() if key != "metrics"}
-
     def fit(
         self,
         base_models_predictions: List[np.ndarray],
-        X_data: SUPPORTED_FEAT_TYPES,
         true_targets: np.ndarray,
         model_identifiers: List[Tuple[int, int, float]],
         runs: Sequence[Run],
+        X_data: SUPPORTED_FEAT_TYPES | None = None,
     ) -> EnsembleSelection:
         self.ensemble_size = int(self.ensemble_size)
         if self.ensemble_size < 1:
@@ -141,20 +131,22 @@ def fit(
     def _fit(
         self,
         predictions: List[np.ndarray],
-        X_data: SUPPORTED_FEAT_TYPES,
         labels: np.ndarray,
+        *,
+        X_data: SUPPORTED_FEAT_TYPES | None = None,
     ) -> EnsembleSelection:
         if self.mode == "fast":
-            self._fast(predictions, X_data, labels)
+            self._fast(predictions=predictions, X_data=X_data, labels=labels)
         else:
-            self._slow(predictions, X_data, labels)
+            self._slow(predictions=predictions, X_data=X_data, labels=labels)
         return self
 
     def _fast(
         self,
         predictions: List[np.ndarray],
-        X_data: SUPPORTED_FEAT_TYPES,
         labels: np.ndarray,
+        *,
+        X_data: SUPPORTED_FEAT_TYPES | None = None,
     ) -> None:
         """Fast version of Rich Caruana's ensemble selection method."""
         self.num_input_models_ = len(predictions)
@@ -231,8 +223,9 @@ def _fast(
     def _slow(
         self,
         predictions: List[np.ndarray],
-        X_data: SUPPORTED_FEAT_TYPES,
         labels: np.ndarray,
+        *,
+        X_data: SUPPORTED_FEAT_TYPES | None = None,
     ) -> None:
         """Rich Caruana's ensemble selection method."""
         self.num_input_models_ = len(predictions)
@@ -311,7 +304,7 @@ def _bagging(
             # Bagging a set of models
             indices = sorted(random.sample(range(0, n_models), bag_size))
             bag = predictions[indices, :, :]
-            order, _ = self._fit(bag, labels)
+            order, _ = self._fit(predictions=bag, labels=labels)
             order_of_each_bag.append(order)
 
         return np.array(