Allow argument to specify how auto-sklearn handles compressing dataset size #1341

Merged: 21 commits, Dec 17, 2021

105 changes: 68 additions & 37 deletions autosklearn/automl.py
@@ -8,7 +8,7 @@
import os
import sys
import time
from typing import Any, Dict, Optional, List, Tuple, Union
from typing import Any, Dict, Mapping, Optional, List, Tuple, Union, cast
import uuid
import unittest.mock
import tempfile
@@ -50,7 +50,13 @@
from autosklearn.evaluation.train_evaluator import TrainEvaluator, _fit_with_budget
from autosklearn.metrics import calculate_metric
from autosklearn.util.backend import Backend, create
from autosklearn.util.data import reduce_dataset_size_if_too_large, supported_precision_reductions
from autosklearn.util.data import (
reduce_dataset_size_if_too_large,
supported_precision_reductions,
validate_dataset_compression_arg,
default_dataset_compression_arg,
DatasetCompressionSpec,
)
from autosklearn.util.stopwatch import StopWatch
from autosklearn.util.logging_ import (
setup_logger,
@@ -159,34 +165,36 @@ def _model_predict(

class AutoML(BaseEstimator):

def __init__(self,
time_left_for_this_task,
per_run_time_limit,
temporary_directory: Optional[str] = None,
delete_tmp_folder_after_terminate: bool = True,
initial_configurations_via_metalearning=25,
ensemble_size=1,
ensemble_nbest=1,
max_models_on_disc=1,
seed=1,
memory_limit=3072,
metadata_directory=None,
debug_mode=False,
include=None,
exclude=None,
resampling_strategy='holdout-iterative-fit',
resampling_strategy_arguments=None,
n_jobs=None,
dask_client: Optional[dask.distributed.Client] = None,
precision=32,
disable_evaluator_output=False,
get_smac_object_callback=None,
smac_scenario_args=None,
logging_config=None,
metric=None,
scoring_functions=None,
get_trials_callback=None
):
def __init__(
self,
time_left_for_this_task,
per_run_time_limit,
temporary_directory: Optional[str] = None,
delete_tmp_folder_after_terminate: bool = True,
initial_configurations_via_metalearning=25,
ensemble_size=1,
ensemble_nbest=1,
max_models_on_disc=1,
seed=1,
memory_limit=3072,
metadata_directory=None,
debug_mode=False,
include=None,
exclude=None,
resampling_strategy='holdout-iterative-fit',
resampling_strategy_arguments=None,
n_jobs=None,
dask_client: Optional[dask.distributed.Client] = None,
precision=32,
disable_evaluator_output=False,
get_smac_object_callback=None,
smac_scenario_args=None,
logging_config=None,
metric=None,
scoring_functions=None,
get_trials_callback=None,
dataset_compression: Union[bool, Mapping[str, Any]] = True
):
super(AutoML, self).__init__()
self.configuration_space = None
self._backend: Optional[Backend] = None
@@ -231,6 +239,18 @@ def __init__(self,
self._smac_scenario_args = smac_scenario_args
self.logging_config = logging_config

# Validate dataset_compression and set its values
self._dataset_compression: Optional[DatasetCompressionSpec]
if isinstance(dataset_compression, bool):
if dataset_compression is True:
self._dataset_compression = default_dataset_compression_arg
else:
self._dataset_compression = None
else:
self._dataset_compression = validate_dataset_compression_arg(
dataset_compression, memory_limit=self._memory_limit
)

self._datamanager = None
self._dataset_name = None
self._feat_type = None
@@ -641,20 +661,31 @@ def fit(
if X_test is not None and y_test is not None:
X_test, y_test = self.InputValidator.transform(X_test, y_test)

# We don't support size reduction on pandas dataframes yet
if not isinstance(X, pd.DataFrame):
operations = ['subsample']
if X.dtype in supported_precision_reductions:
operations.append('precision')
# We don't support size reduction on pandas objects (DataFrame/Series) yet
if (
self._dataset_compression is not None
and not isinstance(X, pd.DataFrame)
and not (isinstance(y, pd.Series) or isinstance(y, pd.DataFrame))
):
methods = self._dataset_compression["methods"]
memory_allocation = self._dataset_compression["memory_allocation"]

# Remove precision reduction if we can't perform it
if (
X.dtype not in supported_precision_reductions
and "precision" in cast(List[str], methods) # Removable with TypedDict
):
methods = [method for method in methods if method != "precision"]

with warnings_to(self._logger):
X, y = reduce_dataset_size_if_too_large(
X=X,
y=y,
memory_limit=self._memory_limit,
is_classification=is_classification,
operations=operations,
random_state=self._seed,
memory_limit=self._memory_limit
operations=methods,
memory_allocation=memory_allocation
)

# Check the re-sampling strategy
127 changes: 96 additions & 31 deletions autosklearn/estimators.py
@@ -1,5 +1,5 @@
# -*- encoding: utf-8 -*-
from typing import Optional, Dict, List, Tuple, Union, Iterable
from typing import Any, Optional, Dict, List, Mapping, Tuple, Union, Iterable
from typing_extensions import Literal

from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
@@ -50,7 +50,8 @@ def __init__(
metric=None,
scoring_functions: Optional[List[Scorer]] = None,
load_models: bool = True,
get_trials_callback=None
get_trials_callback=None,
dataset_compression: Union[bool, Mapping[str, Any]] = True
):
"""
Parameters
@@ -103,7 +104,7 @@ def __init__(

include : dict, optional (None)
If None, all possible algorithms are used. Otherwise specifies
set of algorithms for each added component is used. Include and
set of algorithms for each added component is used. Include and
exclude are incompatible if used together on the same component

exclude : dict, optional (None)
@@ -112,22 +113,37 @@
Incompatible with include. Include and exclude are incompatible
if used together on the same component

resampling_strategy : string or object, optional ('holdout')
how to to handle overfitting, might need 'resampling_strategy_arguments'

* 'holdout': 67:33 (train:test) split
* 'holdout-iterative-fit': 67:33 (train:test) split, calls iterative
fit where possible
* 'cv': crossvalidation, requires 'folds'
* 'cv-iterative-fit': crossvalidation, calls iterative fit where possible
* 'partial-cv': crossvalidation with intensification, requires
'folds'
* BaseCrossValidator object: any BaseCrossValidator class found
in scikit-learn model_selection module
* _RepeatedSplits object: any _RepeatedSplits class found
in scikit-learn model_selection module
* BaseShuffleSplit object: any BaseShuffleSplit class found
in scikit-learn model_selection module
resampling_strategy : Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit] = "holdout"
How to handle overfitting; might need to use ``resampling_strategy_arguments``
if using a ``"cv"`` based method or a Splitter object.

If using a Splitter object that relies on the dataset retaining its current
size and order, you will need to look at the ``dataset_compression`` argument
and ensure that ``"subsample"`` is not included in the applied compression
``"methods"``, or disable compression entirely with ``False`` (see the sketch
after the options below).

**Options**

* ``"holdout"``:
67:33 (train:test) split
* ``"holdout-iterative-fit"``:
67:33 (train:test) split, iterative fit where possible
* ``"cv"``:
crossvalidation,
requires ``"folds"`` in ``resampling_strategy_arguments``
* ``"cv-iterative-fit"``:
crossvalidation,
calls iterative fit where possible,
requires ``"folds"`` in ``resampling_strategy_arguments``
* ``"partial-cv"``:
crossvalidation with intensification,
requires ``"folds"`` in ``resampling_strategy_arguments``
* ``BaseCrossValidator`` subclass:
any BaseCrossValidator subclass (found in scikit-learn model_selection module)
* ``_RepeatedSplits`` subclass:
any _RepeatedSplits subclass (found in scikit-learn model_selection module)
* ``BaseShuffleSplit`` subclass:
any BaseShuffleSplit subclass (found in scikit-learn model_selection module)
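
For instance, a minimal sketch (hypothetical values) of pairing a
scikit-learn splitter with subsampling disabled, using the
``AutoSklearnClassifier`` API from this diff:

.. code-block:: python

    from sklearn.model_selection import KFold

    from autosklearn.classification import AutoSklearnClassifier

    # KFold assumes the dataset keeps its size and order, so drop
    # "subsample" and keep only "precision" reduction.
    automl = AutoSklearnClassifier(
        resampling_strategy=KFold(n_splits=5),
        dataset_compression={"memory_allocation": 0.1, "methods": ["precision"]},
    )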

resampling_strategy_arguments : dict, optional if 'holdout' (train_size default=0.67)
Additional arguments for resampling_strategy:
@@ -218,16 +234,71 @@ def __init__(

load_models : bool, optional (True)
Whether to load the models after fitting Auto-sklearn.

get_trials_callback: callable
Callback function to create an object of subclass defined in module
`smac.callbacks <https://automl.github.io/SMAC3/master/apidoc/smac.callbacks.html>`_.
This is an advanced feature. Use only if you are familiar with
`SMAC <https://automl.github.io/SMAC3/master/index.html>`_.

dataset_compression: Union[bool, Mapping[str, Any]] = True
We compress datasets so that they fit into some predefined amount of memory.
Currently this does not apply to dataframes or sparse arrays, only to raw numpy arrays.

**NOTE**

If using a custom ``resampling_strategy`` that relies on specific
size or ordering of data, this must be disabled to preserve these properties.

You can disable this entirely by passing ``False``.

Default configuration when left as ``True``:

.. code-block:: python

{
"memory_allocation": 0.1,
"methods": ["precision", "subsample"]
}

You can also pass your own configuration with the same keys, choosing
from the available ``"methods"`` (see the sketch at the end of this section).

The available options are described here:

**memory_allocation**

By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This
float value can be set with ``"memory_allocation": 0.1``. We also allow for
specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``.

The memory used by the dataset is checked after each reduction method is
performed. If the dataset fits into the allocated memory, any further methods
listed in ``"methods"`` will not be performed.

For example, if ``"methods": ["precision", "subsample"]`` and the
``"precision"`` reduction step was enough to make the dataset fit into memory,
then the ``"subsample"`` reduction step will not be performed.

**methods**

We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order given.

* ``"precision"`` - We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``

* ``"subsample"`` - We subsample data such that it **fits directly into the
memory allocation** ``memory_allocation * memory_limit``. Therefore, this
should likely be the last method listed in ``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each label is
included in the sampled set.
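
As a rough usage sketch (hypothetical numbers), an absolute 50 MB allocation
combined with both methods would look like:

.. code-block:: python

    from autosklearn.classification import AutoSklearnClassifier

    automl = AutoSklearnClassifier(
        memory_limit=3072,
        # Try to fit the training data into at most 50 MB: reduce float
        # precision first, then subsample only if that was not enough.
        dataset_compression={
            "memory_allocation": 50,
            "methods": ["precision", "subsample"],
        },
    )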

Attributes
----------

cv_results\_ : dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas ``DataFrame``.
@@ -269,6 +340,7 @@ def __init__(
self.scoring_functions = scoring_functions
self.load_models = load_models
self.get_trials_callback = get_trials_callback
self.dataset_compression = dataset_compression

self.automl_ = None # type: Optional[AutoML]

@@ -314,7 +386,8 @@ def build_automl(self):
metadata_directory=self.metadata_directory,
metric=self.metric,
scoring_functions=self.scoring_functions,
get_trials_callback=self.get_trials_callback
get_trials_callback=self.get_trials_callback,
dataset_compression=self.dataset_compression
)

return automl
@@ -862,10 +935,7 @@ def get_configuration_space(


class AutoSklearnClassifier(AutoSklearnEstimator, ClassifierMixin):
"""
This class implements the classification task.

"""
"""This class implements the classification task. """

def fit(self, X, y,
X_test=None,
@@ -879,7 +949,6 @@ def fit(self, X, y,

Parameters
----------

X : array-like or sparse matrix of shape = [n_samples, n_features]
The training input samples.

@@ -911,7 +980,6 @@ def fit(self, X, y,
Returns
-------
self

"""
# AutoSklearn does not handle sparse y for now
y = convert_if_sparse(y)
@@ -963,12 +1031,10 @@ def predict(self, X, batch_size=None, n_jobs=1):
-------
y : array of shape = [n_samples] or [n_samples, n_labels]
The predicted classes.

"""
return super().predict(X, batch_size=batch_size, n_jobs=n_jobs)

def predict_proba(self, X, batch_size=None, n_jobs=1):

"""Predict probabilities of classes for all samples X.

Parameters
Expand All @@ -984,7 +1050,6 @@ def predict_proba(self, X, batch_size=None, n_jobs=1):
-------
y : array of shape = [n_samples, n_classes] or [n_samples, n_labels]
The predicted class probabilities.

"""
pred_proba = super().predict_proba(
X, batch_size=batch_size, n_jobs=n_jobs)
3 changes: 2 additions & 1 deletion autosklearn/experimental/askl2.py
@@ -3,7 +3,7 @@
import os
import pathlib
import pickle
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union, Mapping

import dask.distributed
import scipy.sparse
@@ -198,6 +198,7 @@ def __init__(
metric: Optional[Scorer] = None,
scoring_functions: Optional[List[Scorer]] = None,
load_models: bool = True,
dataset_compression: Union[bool, Mapping[str, Any]] = True
):

"""