Merged
Changes from all commits · 99 commits
9369343
Moved to new splitter, moved to util file
eddiebergman Sep 15, 2021
c2be383
flake8'd
eddiebergman Sep 15, 2021
6e6a607
Fixed errors, added test specifically for CustomStratifiedShuffleSplit
eddiebergman Sep 16, 2021
786e508
flake8'd
eddiebergman Sep 16, 2021
58dc49b
Updated docstring
eddiebergman Sep 16, 2021
4bce38f
Updated types in docstring
eddiebergman Sep 16, 2021
206c3df
reduce_dataset_size_if_too_large supports more types
eddiebergman Oct 5, 2021
d6f018f
flake8'd
eddiebergman Oct 5, 2021
6ed3e2c
flake8'd
eddiebergman Oct 5, 2021
5981fad
Updated docstring
eddiebergman Oct 5, 2021
65c8667
Seperated out the data subsampling into individual functions
eddiebergman Oct 6, 2021
f130424
Improved typing from Automl.fit to reduce_dataset_size_if_too_large
eddiebergman Oct 6, 2021
9b6f613
flak8'd
eddiebergman Oct 7, 2021
a12cf33
subsample tested
eddiebergman Oct 7, 2021
077cb2c
Finished testing and flake8'd
eddiebergman Oct 7, 2021
9af22a7
Cleaned up transform function that was touched
eddiebergman Oct 7, 2021
8057766
^
eddiebergman Oct 7, 2021
e1cce3f
Removed double typing
eddiebergman Oct 7, 2021
c8693a9
Cleaned up typing of convert_if_sparse
eddiebergman Oct 7, 2021
2591cc2
Cleaned up splitters and added size test
eddiebergman Oct 7, 2021
a6cc39f
Cleanup doc in data
eddiebergman Oct 7, 2021
f987c65
rogue line added was removed
eddiebergman Oct 7, 2021
3c4964a
Test fix
eddiebergman Oct 7, 2021
a53b1e5
flake8'd
eddiebergman Oct 7, 2021
5343ab6
Typo fix
eddiebergman Oct 7, 2021
84ee347
Fixed ordering of things
eddiebergman Oct 7, 2021
019a06e
Fixed typing and tests of target_validator fit, transform, inv_transform
eddiebergman Oct 7, 2021
8aea9b9
Updated doc
eddiebergman Oct 8, 2021
065fbe1
Updated Type return
eddiebergman Oct 8, 2021
1abe8f0
Removed elif gaurd
eddiebergman Oct 8, 2021
0288a70
removed extraneuous overload
eddiebergman Oct 8, 2021
d55c687
Updated return type of feature validator
eddiebergman Oct 8, 2021
1e7e2a9
Type fixes for target validator fit
eddiebergman Oct 8, 2021
234ae5e
flake8'd
eddiebergman Oct 8, 2021
04f6d46
Moved to new splitter, moved to util file
eddiebergman Sep 15, 2021
ea82405
flake8'd
eddiebergman Sep 15, 2021
d3cd1cf
Fixed errors, added test specifically for CustomStratifiedShuffleSplit
eddiebergman Sep 16, 2021
f04d65a
flake8'd
eddiebergman Sep 16, 2021
a1038b1
Updated docstring
eddiebergman Sep 16, 2021
de475dd
Updated types in docstring
eddiebergman Sep 16, 2021
9021edc
reduce_dataset_size_if_too_large supports more types
eddiebergman Oct 5, 2021
3b7e49c
flake8'd
eddiebergman Oct 5, 2021
b835f48
flake8'd
eddiebergman Oct 5, 2021
8369a17
Updated docstring
eddiebergman Oct 5, 2021
86f5b65
Seperated out the data subsampling into individual functions
eddiebergman Oct 6, 2021
445b0ba
Improved typing from Automl.fit to reduce_dataset_size_if_too_large
eddiebergman Oct 6, 2021
658a244
flak8'd
eddiebergman Oct 7, 2021
b12a5f5
subsample tested
eddiebergman Oct 7, 2021
8ea2575
Finished testing and flake8'd
eddiebergman Oct 7, 2021
49a37bf
Cleaned up transform function that was touched
eddiebergman Oct 7, 2021
401a049
^
eddiebergman Oct 7, 2021
b46be44
Removed double typing
eddiebergman Oct 7, 2021
24a19cf
Cleaned up typing of convert_if_sparse
eddiebergman Oct 7, 2021
8922143
Cleaned up splitters and added size test
eddiebergman Oct 7, 2021
a071950
Cleanup doc in data
eddiebergman Oct 7, 2021
5c9b012
rogue line added was removed
eddiebergman Oct 7, 2021
cc5dcba
Test fix
eddiebergman Oct 7, 2021
fe15c14
flake8'd
eddiebergman Oct 7, 2021
54c4f2a
Typo fix
eddiebergman Oct 7, 2021
99c02a9
Fixed ordering of things
eddiebergman Oct 7, 2021
0e28bb3
Fixed typing and tests of target_validator fit, transform, inv_transform
eddiebergman Oct 7, 2021
b34e169
Updated doc
eddiebergman Oct 8, 2021
972f65e
Updated Type return
eddiebergman Oct 8, 2021
33ef1fd
Removed elif gaurd
eddiebergman Oct 8, 2021
1136573
removed extraneuous overload
eddiebergman Oct 8, 2021
b1f419b
Updated return type of feature validator
eddiebergman Oct 8, 2021
8585be7
Type fixes for target validator fit
eddiebergman Oct 8, 2021
aac7b26
flake8'd
eddiebergman Oct 8, 2021
e4c3426
Fixed err message str and automl sparse y tests
eddiebergman Oct 8, 2021
5585532
merged
eddiebergman Oct 8, 2021
75a974b
Flak8'd
eddiebergman Oct 8, 2021
5bf53a2
Fix sort indices
eddiebergman Nov 2, 2021
7ae1d87
list type to List
eddiebergman Nov 2, 2021
4ebfdc2
Remove uneeded comment
eddiebergman Nov 2, 2021
1e87a52
Updated comment to make it more clear
eddiebergman Nov 2, 2021
06196d3
Comment update
eddiebergman Nov 2, 2021
c7a47cb
Fixed warning message for reduce_dataset_if_too_large
eddiebergman Nov 2, 2021
c0305f9
Fix test
eddiebergman Nov 2, 2021
c109d93
Added check for error message in tests
eddiebergman Nov 2, 2021
1c2fe7e
Test Updates
eddiebergman Nov 2, 2021
377e260
Fix error msg
eddiebergman Nov 2, 2021
f909edc
reinclude csr y to test
eddiebergman Nov 2, 2021
f170fcc
Reintroduced explicit subsample values test
eddiebergman Nov 2, 2021
b4958e8
flaked
eddiebergman Nov 2, 2021
6861e34
Missed an uncomment
eddiebergman Nov 3, 2021
37f6948
Update the comment for test of splitters
eddiebergman Nov 3, 2021
a20291d
Updated warning message in CustomSplitter
eddiebergman Nov 3, 2021
4b0f8a0
Update comment in test
eddiebergman Nov 4, 2021
536c4c6
Update tests
eddiebergman Nov 4, 2021
f35102d
Removed overloads
eddiebergman Nov 4, 2021
ec0ed55
Narrowed type of subsample
eddiebergman Nov 4, 2021
5439235
Removed overload import
eddiebergman Nov 4, 2021
3d21282
Fix `todense` giving np.matrix, using `toarray`
eddiebergman Nov 5, 2021
e1317b1
Merge branch 'development' into use_new_splitter
eddiebergman Nov 5, 2021
f56356d
Made subsampling a little less aggresive
eddiebergman Nov 14, 2021
42e4397
Changed multiplier back to 10
eddiebergman Nov 15, 2021
9bcb210
Allow argument to specfiy how auto-sklearn handles compressing datase…
eddiebergman Dec 17, 2021
2cd1d48
Merge branch 'development' into use_new_splitter
eddiebergman Dec 17, 2021
a1cc277
Fixed bad merge
eddiebergman Dec 18, 2021
246 changes: 88 additions & 158 deletions autosklearn/automl.py
@@ -9,7 +9,7 @@
import os
import sys
import time
from typing import Any, Dict, Optional, List, Tuple, Union
from typing import Any, Dict, Mapping, Optional, List, Tuple, Union, cast
import uuid
import unittest.mock
import tempfile
@@ -52,6 +52,13 @@
from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings
from autosklearn.evaluation.train_evaluator import TrainEvaluator, _fit_with_budget
from autosklearn.metrics import calculate_metric
from autosklearn.util.data import (
reduce_dataset_size_if_too_large,
supported_precision_reductions,
validate_dataset_compression_arg,
default_dataset_compression_arg,
DatasetCompressionSpec,
)
from autosklearn.util.stopwatch import StopWatch
from autosklearn.util.logging_ import (
setup_logger,
@@ -118,7 +125,7 @@ def _model_predict(
The predictions produced by the model
"""
# Copy the array and ensure it has the attr 'shape'
X_ = np.asarray(X) if isinstance(X, list) else X.copy()
X_ = np.asarray(X) if isinstance(X, List) else X.copy()

assert X_.shape[0] >= 1, f"X must have more than 1 sample but has {X_.shape[0]}"

@@ -160,34 +167,36 @@ def _model_predict(

class AutoML(BaseEstimator):

def __init__(self,
time_left_for_this_task,
per_run_time_limit,
temporary_directory: Optional[str] = None,
delete_tmp_folder_after_terminate: bool = True,
initial_configurations_via_metalearning=25,
ensemble_size=1,
ensemble_nbest=1,
max_models_on_disc=1,
seed=1,
memory_limit=3072,
metadata_directory=None,
debug_mode=False,
include: Optional[Dict[str, List[str]]] = None,
exclude: Optional[Dict[str, List[str]]] = None,
resampling_strategy='holdout-iterative-fit',
resampling_strategy_arguments=None,
n_jobs=None,
dask_client: Optional[dask.distributed.Client] = None,
precision=32,
disable_evaluator_output=False,
get_smac_object_callback=None,
smac_scenario_args=None,
logging_config=None,
metric=None,
scoring_functions=None,
get_trials_callback=None
):
def __init__(
self,
time_left_for_this_task,
per_run_time_limit,
temporary_directory: Optional[str] = None,
delete_tmp_folder_after_terminate: bool = True,
initial_configurations_via_metalearning=25,
ensemble_size=1,
ensemble_nbest=1,
max_models_on_disc=1,
seed=1,
memory_limit=3072,
metadata_directory=None,
debug_mode=False,
include=None,
exclude=None,
resampling_strategy='holdout-iterative-fit',
resampling_strategy_arguments=None,
n_jobs=None,
dask_client: Optional[dask.distributed.Client] = None,
precision=32,
disable_evaluator_output=False,
get_smac_object_callback=None,
smac_scenario_args=None,
logging_config=None,
metric=None,
scoring_functions=None,
get_trials_callback=None,
dataset_compression: Union[bool, Mapping[str, Any]] = True
):
super(AutoML, self).__init__()
self.configuration_space = None
self._backend: Optional[Backend] = None
@@ -217,10 +226,10 @@ def __init__(self,
self.precision = precision
self._disable_evaluator_output = disable_evaluator_output
# Check arguments prior to doing anything!
if not isinstance(self._disable_evaluator_output, (bool, list)):
if not isinstance(self._disable_evaluator_output, (bool, List)):
raise ValueError('disable_evaluator_output must be of type bool '
'or list.')
if isinstance(self._disable_evaluator_output, list):
if isinstance(self._disable_evaluator_output, List):
allowed_elements = ['model', 'cv_model', 'y_optimization', 'y_test', 'y_valid']
for element in self._disable_evaluator_output:
if element not in allowed_elements:
@@ -232,6 +241,18 @@
self._smac_scenario_args = smac_scenario_args
self.logging_config = logging_config

# Validate dataset_compression and set its values
self._dataset_compression: Optional[DatasetCompressionSpec]
if isinstance(dataset_compression, bool):
if dataset_compression is True:
self._dataset_compression = default_dataset_compression_arg
else:
self._dataset_compression = None
else:
self._dataset_compression = validate_dataset_compression_arg(
dataset_compression, memory_limit=self._memory_limit
)

self._datamanager = None
self._dataset_name = None
self._feat_type = None
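The `dataset_compression` argument introduced above accepts `True` (use the default spec), `False` (disable reduction entirely), or a mapping with `"memory_allocation"` and `"methods"` keys that is validated against `memory_limit`. A minimal sketch of that normalisation, with illustrative default values (the real ones live in `autosklearn/util/data.py` behind `default_dataset_compression_arg` and `validate_dataset_compression_arg`):

```python
from typing import Any, Mapping, Optional, Union

# Illustrative defaults; the real values live in autosklearn/util/data.py.
DEFAULT_SPEC = {"memory_allocation": 0.1, "methods": ["precision", "subsample"]}

def resolve_dataset_compression(
    arg: Union[bool, Mapping[str, Any]],
    memory_limit: int,
) -> Optional[Mapping[str, Any]]:
    """Return a validated compression spec, or None when compression is off."""
    if isinstance(arg, bool):
        return dict(DEFAULT_SPEC) if arg else None

    spec = {**DEFAULT_SPEC, **arg}
    unknown = set(spec) - set(DEFAULT_SPEC)
    if unknown:
        raise ValueError(f"Unknown dataset_compression keys: {unknown}")

    allocation = spec["memory_allocation"]
    # A float is read as a fraction of memory_limit, an int as absolute MB.
    if isinstance(allocation, float) and not 0.0 < allocation < 1.0:
        raise ValueError("memory_allocation as a fraction must lie in (0, 1)")
    if isinstance(allocation, int) and not 0 < allocation < memory_limit:
        raise ValueError("memory_allocation in MB must stay below memory_limit")
    return spec
```

With `dataset_compression=False` the guard added later in `fit` (`self._dataset_compression is not None`) simply skips any reduction.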
@@ -490,10 +511,10 @@ def _supports_task_type(cls, task_type: str) -> bool:
def fit(
self,
X: SUPPORTED_FEAT_TYPES,
y: Union[SUPPORTED_TARGET_TYPES, spmatrix],
y: SUPPORTED_TARGET_TYPES,
task: Optional[int] = None,
X_test: Optional[SUPPORTED_FEAT_TYPES] = None,
y_test: Optional[Union[SUPPORTED_TARGET_TYPES, spmatrix]] = None,
y_test: Optional[SUPPORTED_TARGET_TYPES] = None,
feat_type: Optional[List[str]] = None,
dataset_name: Optional[str] = None,
only_return_configuration_space: bool = False,
@@ -509,8 +530,8 @@ def fit(
#
# `task: Optional[int]` and `is_classification`
#
# `AutoML` tries to identify the task itself with
# `sklearn.type_of_target`, leaving little for the subclasses to do.
# `AutoML` tries to identify the task itself with `sklearn.type_of_target`,
# leaving little for the subclasses to do.
# Except this fails when type_of_target(y) == "multiclass".
#
# "multiclass" be mean either REGRESSION or MULTICLASS_CLASSIFICATION,
@@ -588,6 +609,8 @@ def fit(
self

"""
if (X_test is not None) ^ (y_test is not None):
raise ValueError("Must provide both X_test and y_test together")

# AutoSklearn does not handle sparse y for now
y = convert_if_sparse(y)
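`fit` no longer advertises sparse targets in its signature (the `spmatrix` unions above are dropped); instead `convert_if_sparse` densifies `y` up front. Its body is not part of this diff; a minimal sketch of what such a helper could do, consistent with the commit "Fix `todense` giving np.matrix, using `toarray`":

```python
# Hypothetical sketch only; the real convert_if_sparse is defined elsewhere
# in autosklearn and is not shown in this diff.
import numpy as np
from scipy import sparse

def convert_if_sparse_sketch(y):
    """Densify a sparse target; toarray() avoids the np.matrix that todense() returns."""
    if sparse.issparse(y):
        y = y.toarray()
        if y.ndim == 2 and 1 in y.shape:  # column/row vector -> 1-d target
            y = np.ravel(y)
    return y
```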
@@ -639,17 +662,35 @@
self.InputValidator.fit(X_train=X, y_train=y, X_test=X_test, y_test=y_test)
X, y = self.InputValidator.transform(X, y)

if X_test is not None:
if X_test is not None and y_test is not None:
X_test, y_test = self.InputValidator.transform(X_test, y_test)

X, y = self.subsample_if_too_large(
X=X,
y=y,
logger=self._logger,
seed=self._seed,
memory_limit=self._memory_limit,
task=self._task,
)
# We don't support size reduction on pandas type object yet
if (
self._dataset_compression is not None
and not isinstance(X, pd.DataFrame)
and not (isinstance(y, pd.Series) or isinstance(y, pd.DataFrame))
):
methods = self._dataset_compression["methods"]
memory_allocation = self._dataset_compression["memory_allocation"]

# Remove precision reduction if we can't perform it
if (
X.dtype not in supported_precision_reductions
and "precision" in cast(List[str], methods) # Removable with TypedDict
):
methods = [method for method in methods if method != "precision"]

with warnings_to(self._logger):
X, y = reduce_dataset_size_if_too_large(
X=X,
y=y,
memory_limit=self._memory_limit,
is_classification=is_classification,
random_state=self._seed,
operations=methods,
memory_allocation=memory_allocation
)

# Check the re-sampling strategy
try:
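The subsampling call above is replaced by `reduce_dataset_size_if_too_large`, which applies only the configured `methods` (with `"precision"` dropped for dtypes outside `supported_precision_reductions`) and works against an explicit memory budget rather than the old hard-coded `memory_limit / 10` heuristic. A rough sketch of that budget check, assuming `memory_allocation` is either a fraction of `memory_limit` or an absolute number of megabytes (the helper names here are illustrative):

```python
import numpy as np

def allowed_megabytes(memory_limit: int, memory_allocation) -> float:
    # Fraction of the limit (float) or an absolute budget in MB (int).
    if isinstance(memory_allocation, float):
        return memory_limit * memory_allocation
    return float(memory_allocation)

def needs_reduction(X: np.ndarray, memory_limit: int, memory_allocation) -> bool:
    used_mb = X.nbytes / (2 ** 20)
    return used_mb > allowed_megabytes(memory_limit, memory_allocation)

X = np.random.rand(100_000, 50)  # float64, roughly 38 MB
print(needs_reduction(X, memory_limit=3072, memory_allocation=0.01))  # True: budget is ~31 MB
```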
Expand Down Expand Up @@ -1042,117 +1083,6 @@ def _check_resampling_strategy(

return

@staticmethod
def subsample_if_too_large(
X: SUPPORTED_FEAT_TYPES,
y: SUPPORTED_TARGET_TYPES,
logger,
seed: int,
memory_limit: int,
task: int,
):
if memory_limit and isinstance(X, np.ndarray):

if X.dtype == np.float32:
multiplier = 4
elif X.dtype in (np.float64, float):
multiplier = 8
elif (
# In spite of the names, np.float96 and np.float128
# provide only as much precision as np.longdouble,
# that is, 80 bits on most x86 machines and 64 bits
# in standard Windows builds.
(hasattr(np, 'float128') and X.dtype == np.float128)
or (hasattr(np, 'float96') and X.dtype == np.float96)
):
multiplier = 16
else:
# Just assuming some value - very unlikely
multiplier = 8
logger.warning('Unknown dtype for X: %s, assuming it takes 8 bit/number',
str(X.dtype))

megabytes = X.shape[0] * X.shape[1] * multiplier / 1024 / 1024
if memory_limit <= megabytes * 10 and X.dtype != np.float32:
cast_to = {
8: np.float32,
16: np.float64,
}.get(multiplier, np.float32)
logger.warning(
'Dataset too large for memory limit %dMB, reducing the precision from %s to %s',
memory_limit,
X.dtype,
cast_to,
)
X = X.astype(cast_to)

megabytes = X.shape[0] * X.shape[1] * multiplier / 1024 / 1024
if memory_limit <= megabytes * 10:
new_num_samples = int(
memory_limit / (10 * X.shape[1] * multiplier / 1024 / 1024)
)
logger.warning(
'Dataset too large for memory limit %dMB, reducing number of samples from '
'%d to %d.',
memory_limit,
X.shape[0],
new_num_samples,
)
if task in CLASSIFICATION_TASKS:
# Identify if it has unique labels and allow for
# stratification, with unique labels in training set
values, idxs, counts = np.unique(y, axis=0,
return_index=True,
return_counts=True)
unique_labels = {
idx: value
for value, idx, count in zip(values, idxs, counts)
if count == 1
}

# If there are unique labeled elements, remove them and
# place them back in later
if len(unique_labels) > 0:
idxs_of_unique = np.asarray(list(unique_labels.keys()))
unique_X = X[idxs_of_unique]
unique_y = y[idxs_of_unique]

# NOTE optimization
# If this ever turns out to be slow, this actually
# copies the entire array. There might be a better
# solution but it will probably require a lot more
# manual work in how splitting is done.
X = np.delete(X, idxs_of_unique, axis=0)
y = np.delete(y, idxs_of_unique, axis=0)

X, _, y, _ = sklearn.model_selection.train_test_split(
X, y,
train_size=new_num_samples - len(unique_y),
random_state=seed,
stratify=y,
)

X = np.append(X, unique_X, axis=0)
y = np.append(y, unique_y, axis=0)

# Otherwise we should be able to stratify as normal
else:
X, _, y, _ = sklearn.model_selection.train_test_split(
X, y,
train_size=new_num_samples,
random_state=seed,
stratify=y,
)
elif task in REGRESSION_TASKS:
X, _, y, _ = sklearn.model_selection.train_test_split(
X, y,
train_size=new_num_samples,
random_state=seed,
)
else:
raise ValueError(task)
return X, y
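The deleted method above bundled two concerns: downcasting wide float dtypes when the estimated footprint crossed a tenth of the memory limit, and subsampling with stratification while keeping classes that occur exactly once in the training set. Per the commit log, that logic now lives in `autosklearn.util.data` and a `CustomStratifiedShuffleSplit`; the sketch below reproduces only the singleton-preserving idea (the helper name and signature are illustrative, not the library's API):

```python
import numpy as np
from sklearn.model_selection import train_test_split

def subsample_preserving_singletons(X, y, n_samples, seed=1):
    """Stratified subsample that keeps single-occurrence classes in the result."""
    _, idxs, counts = np.unique(y, return_index=True, return_counts=True)
    singleton_idxs = idxs[counts == 1]

    X_single, y_single = X[singleton_idxs], y[singleton_idxs]
    X_rest = np.delete(X, singleton_idxs, axis=0)
    y_rest = np.delete(y, singleton_idxs, axis=0)

    X_sub, _, y_sub, _ = train_test_split(
        X_rest, y_rest,
        train_size=n_samples - len(y_single),
        random_state=seed,
        stratify=y_rest,
    )
    return np.append(X_sub, X_single, axis=0), np.append(y_sub, y_single, axis=0)
```

Holding singletons out of the split is what lets stratification succeed even when `train_test_split` would otherwise reject classes with a single member.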

def refit(self, X, y):
# AutoSklearn does not handle sparse y for now
y = convert_if_sparse(y)
Expand Down Expand Up @@ -1247,7 +1177,7 @@ def fit_pipeline(
is_classification: bool
Whether the task is for classification or regression. This affects
how the targets are treated
feat_type : list, optional (default=None)
feat_type : List, optional (default=None)
List of str of `len(X.shape[1])` describing the attribute type.
Possible types are `Categorical` and `Numerical`. `Categorical`
attributes will be automatically One-Hot encoded. The values
@@ -1536,7 +1466,7 @@ def _load_models(self):
raise ValueError('No models fitted!')

elif self._disable_evaluator_output is False or \
(isinstance(self._disable_evaluator_output, list) and
(isinstance(self._disable_evaluator_output, List) and
'model' not in self._disable_evaluator_output):
model_names = self._backend.list_all_models(self._seed)
