Allow argument to specify how auto-sklearn handles compressing dataset size #1341

Merged: 21 commits, Dec 17, 2021

105 changes: 68 additions & 37 deletions autosklearn/automl.py
@@ -8,7 +8,7 @@
import os
import sys
import time
from typing import Any, Dict, Optional, List, Tuple, Union
from typing import Any, Dict, Mapping, Optional, List, Tuple, Union, cast
import uuid
import unittest.mock
import tempfile
@@ -50,7 +50,13 @@
from autosklearn.evaluation.train_evaluator import TrainEvaluator, _fit_with_budget
from autosklearn.metrics import calculate_metric
from autosklearn.util.backend import Backend, create
from autosklearn.util.data import reduce_dataset_size_if_too_large, supported_precision_reductions
from autosklearn.util.data import (
reduce_dataset_size_if_too_large,
supported_precision_reductions,
validate_dataset_compression_arg,
default_dataset_compression_arg,
DatasetCompressionSpec,
)
from autosklearn.util.stopwatch import StopWatch
from autosklearn.util.logging_ import (
setup_logger,
@@ -159,34 +165,36 @@ def _model_predict(

class AutoML(BaseEstimator):

def __init__(self,
time_left_for_this_task,
per_run_time_limit,
temporary_directory: Optional[str] = None,
delete_tmp_folder_after_terminate: bool = True,
initial_configurations_via_metalearning=25,
ensemble_size=1,
ensemble_nbest=1,
max_models_on_disc=1,
seed=1,
memory_limit=3072,
metadata_directory=None,
debug_mode=False,
include=None,
exclude=None,
resampling_strategy='holdout-iterative-fit',
resampling_strategy_arguments=None,
n_jobs=None,
dask_client: Optional[dask.distributed.Client] = None,
precision=32,
disable_evaluator_output=False,
get_smac_object_callback=None,
smac_scenario_args=None,
logging_config=None,
metric=None,
scoring_functions=None,
get_trials_callback=None
):
def __init__(
self,
time_left_for_this_task,
per_run_time_limit,
temporary_directory: Optional[str] = None,
delete_tmp_folder_after_terminate: bool = True,
initial_configurations_via_metalearning=25,
ensemble_size=1,
ensemble_nbest=1,
max_models_on_disc=1,
seed=1,
memory_limit=3072,
metadata_directory=None,
debug_mode=False,
include=None,
exclude=None,
resampling_strategy='holdout-iterative-fit',
resampling_strategy_arguments=None,
n_jobs=None,
dask_client: Optional[dask.distributed.Client] = None,
precision=32,
disable_evaluator_output=False,
get_smac_object_callback=None,
smac_scenario_args=None,
logging_config=None,
metric=None,
scoring_functions=None,
get_trials_callback=None,
dataset_compression: Union[bool, Mapping[str, Any]] = True
):
super(AutoML, self).__init__()
self.configuration_space = None
self._backend: Optional[Backend] = None
@@ -231,6 +239,18 @@ def __init__(self,
self._smac_scenario_args = smac_scenario_args
self.logging_config = logging_config

# Validate dataset_compression and set its values
self._dataset_compression: Optional[DatasetCompressionSpec]
if isinstance(dataset_compression, bool):
if dataset_compression is True:
self._dataset_compression = default_dataset_compression_arg
else:
self._dataset_compression = None
else:
self._dataset_compression = validate_dataset_compression_arg(
dataset_compression, memory_limit=self._memory_limit
)

self._datamanager = None
self._dataset_name = None
self._feat_type = None
@@ -641,20 +661,31 @@ def fit(
if X_test is not None and y_test is not None:
X_test, y_test = self.InputValidator.transform(X_test, y_test)

# We don't support size reduction on pandas dataframes yet
if not isinstance(X, pd.DataFrame):
operations = ['subsample']
if X.dtype in supported_precision_reductions:
operations.append('precision')
# We don't support size reduction on pandas objects (DataFrame/Series) yet
if (
self._dataset_compression is not None
and not isinstance(X, pd.DataFrame)
and not (isinstance(y, pd.Series) or isinstance(y, pd.DataFrame))
):
methods = self._dataset_compression["methods"]
memory_allocation = self._dataset_compression["memory_allocation"]

# Remove precision reduction if we can't perform it
if (
X.dtype not in supported_precision_reductions
and "precision" in cast(List[str], methods) # Removable with TypedDict
):
methods = [method for method in methods if method != "precision"]

with warnings_to(self._logger):
X, y = reduce_dataset_size_if_too_large(
X=X,
y=y,
memory_limit=self._memory_limit,
is_classification=is_classification,
operations=operations,
random_state=self._seed,
memory_limit=self._memory_limit
operations=methods,
memory_allocation=memory_allocation
)

# Check the re-sampling strategy
127 changes: 96 additions & 31 deletions autosklearn/estimators.py
@@ -1,5 +1,5 @@
# -*- encoding: utf-8 -*-
from typing import Optional, Dict, List, Tuple, Union, Iterable
from typing import Any, Optional, Dict, List, Mapping, Tuple, Union, Iterable
from typing_extensions import Literal

from ConfigSpace.configuration_space import Configuration, ConfigurationSpace
@@ -50,7 +50,8 @@ def __init__(
metric=None,
scoring_functions: Optional[List[Scorer]] = None,
load_models: bool = True,
get_trials_callback=None
get_trials_callback=None,
dataset_compression: Union[bool, Mapping[str, Any]] = True
):
"""
Parameters
@@ -103,7 +104,7 @@ def __init__(

include : dict, optional (None)
If None, all possible algorithms are used. Otherwise specifies
set of algorithms for each added component is used. Include and
set of algorithms for each added component is used. Include and
exclude are incompatible if used together on the same component

exclude : dict, optional (None)
@@ -112,22 +113,37 @@
Incompatible with include. Include and exclude are incompatible
if used together on the same component

resampling_strategy : string or object, optional ('holdout')
how to to handle overfitting, might need 'resampling_strategy_arguments'

* 'holdout': 67:33 (train:test) split
* 'holdout-iterative-fit': 67:33 (train:test) split, calls iterative
fit where possible
* 'cv': crossvalidation, requires 'folds'
* 'cv-iterative-fit': crossvalidation, calls iterative fit where possible
* 'partial-cv': crossvalidation with intensification, requires
'folds'
* BaseCrossValidator object: any BaseCrossValidator class found
in scikit-learn model_selection module
* _RepeatedSplits object: any _RepeatedSplits class found
in scikit-learn model_selection module
* BaseShuffleSplit object: any BaseShuffleSplit class found
in scikit-learn model_selection module
resampling_strategy : Union[str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit] = "holdout"
How to handle overfitting; might need to use ``resampling_strategy_arguments``
if using a ``"cv"`` based method or a Splitter object.

If using a Splitter object that relies on the dataset retaining its current
size and order, you will need to look at the ``dataset_compression`` argument
and ensure that ``"subsample"`` is not included in the applied compression
``"methods"``, or disable compression entirely with ``False`` (see the sketch
after the options below).

**Options**

* ``"holdout"``:
67:33 (train:test) split
* ``"holdout-iterative-fit"``:
67:33 (train:test) split, iterative fit where possible
* ``"cv"``:
crossvalidation,
requires ``"folds"`` in ``resampling_strategy_arguments``
* ``"cv-iterative-fit"``:
crossvalidation,
calls iterative fit where possible,
requires ``"folds"`` in ``resampling_strategy_arguments``
* ``"partial-cv"``:
crossvalidation with intensification,
requires ``"folds"`` in ``resampling_strategy_arguments``
* ``BaseCrossValidator`` subclass:
any BaseCrossValidator subclass (found in scikit-learn model_selection module)
* ``_RepeatedSplits`` subclass:
any _RepeatedSplits subclass (found in scikit-learn model_selection module)
* ``BaseShuffleSplit`` subclass:
any BaseShuffleSplit subclass (found in scikit-learn model_selection module)
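
For instance, a minimal sketch (hypothetical values) of pairing a
scikit-learn splitter with subsampling disabled, using the
``AutoSklearnClassifier`` API from this diff:

.. code-block:: python

    from sklearn.model_selection import KFold

    from autosklearn.classification import AutoSklearnClassifier

    # KFold assumes the dataset keeps its size and order, so drop
    # "subsample" and keep only "precision" reduction.
    automl = AutoSklearnClassifier(
        resampling_strategy=KFold(n_splits=5),
        dataset_compression={"memory_allocation": 0.1, "methods": ["precision"]},
    )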

resampling_strategy_arguments : dict, optional if 'holdout' (train_size default=0.67)
Additional arguments for resampling_strategy:
@@ -218,16 +234,71 @@ def __init__(

load_models : bool, optional (True)
Whether to load the models after fitting Auto-sklearn.

get_trials_callback: callable
Callback function to create an object of subclass defined in module
`smac.callbacks <https://automl.github.io/SMAC3/master/apidoc/smac.callbacks.html>`_.
This is an advanced feature. Use only if you are familiar with
`SMAC <https://automl.github.io/SMAC3/master/index.html>`_.

dataset_compression: Union[bool, Mapping[str, Any]] = True
We compress datasets so that they fit into some predefined amount of memory.
Currently this does not apply to dataframes or sparse arrays, only to raw numpy arrays.

**NOTE**

If using a custom ``resampling_strategy`` that relies on specific
size or ordering of data, this must be disabled to preserve these properties.

You can disable this entirely by passing ``False``.

Default configuration when left as ``True``:

.. code-block:: python

{
"memory_allocation": 0.1,
"methods": ["precision", "subsample"]
}

You can also pass your own configuration with the same keys, choosing
from the available ``"methods"`` (see the sketch at the end of this section).

The available options are described here:

**memory_allocation**

By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This
float value can be set with ``"memory_allocation": 0.1``. We also allow for
specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``.

The memory used by the dataset is checked after each reduction method is
performed. If the dataset fits into the allocated memory, any further methods
listed in ``"methods"`` will not be performed.

For example, if ``"methods": ["precision", "subsample"]`` and the
``"precision"`` reduction step was enough to make the dataset fit into memory,
then the ``"subsample"`` reduction step will not be performed.

**methods**

We currently provide the following methods for reducing the dataset size.
These can be provided in a list and are performed in the order given.

* ``"precision"`` - We reduce floating point precision as follows:
* ``np.float128 -> np.float64``
* ``np.float96 -> np.float64``
* ``np.float64 -> np.float32``

* ``"subsample"`` - We subsample data such that it **fits directly into the
memory allocation** ``memory_allocation * memory_limit``. Therefore, this
should likely be the last method listed in ``"methods"``.
Subsampling takes into account classification labels and stratifies
accordingly. We guarantee that at least one occurrence of each label is
included in the sampled set.
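
As a rough usage sketch (hypothetical numbers), an absolute 50 MB allocation
combined with both methods would look like:

.. code-block:: python

    from autosklearn.classification import AutoSklearnClassifier

    automl = AutoSklearnClassifier(
        memory_limit=3072,
        # Try to fit the training data into at most 50 MB: reduce float
        # precision first, then subsample only if that was not enough.
        dataset_compression={
            "memory_allocation": 50,
            "methods": ["precision", "subsample"],
        },
    )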

Attributes
----------

cv_results\_ : dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas ``DataFrame``.
@@ -269,6 +340,7 @@ def __init__(
self.scoring_functions = scoring_functions
self.load_models = load_models
self.get_trials_callback = get_trials_callback
self.dataset_compression = dataset_compression

self.automl_ = None # type: Optional[AutoML]

@@ -314,7 +386,8 @@ def build_automl(self):
metadata_directory=self.metadata_directory,
metric=self.metric,
scoring_functions=self.scoring_functions,
get_trials_callback=self.get_trials_callback
get_trials_callback=self.get_trials_callback,
dataset_compression=self.dataset_compression
)

return automl
@@ -862,10 +935,7 @@ def get_configuration_space(


class AutoSklearnClassifier(AutoSklearnEstimator, ClassifierMixin):
"""
This class implements the classification task.

"""
"""This class implements the classification task. """

def fit(self, X, y,
X_test=None,
@@ -879,7 +949,6 @@ def fit(self, X, y,

Parameters
----------

X : array-like or sparse matrix of shape = [n_samples, n_features]
The training input samples.

@@ -911,7 +980,6 @@ def fit(self, X, y,
Returns
-------
self

"""
# AutoSklearn does not handle sparse y for now
y = convert_if_sparse(y)
@@ -963,12 +1031,10 @@ def predict(self, X, batch_size=None, n_jobs=1):
-------
y : array of shape = [n_samples] or [n_samples, n_labels]
The predicted classes.

"""
return super().predict(X, batch_size=batch_size, n_jobs=n_jobs)

def predict_proba(self, X, batch_size=None, n_jobs=1):

"""Predict probabilities of classes for all samples X.

Parameters
Expand All @@ -984,7 +1050,6 @@ def predict_proba(self, X, batch_size=None, n_jobs=1):
-------
y : array of shape = [n_samples, n_classes] or [n_samples, n_labels]
The predicted class probabilities.

"""
pred_proba = super().predict_proba(
X, batch_size=batch_size, n_jobs=n_jobs)
3 changes: 2 additions & 1 deletion autosklearn/experimental/askl2.py
@@ -3,7 +3,7 @@
import os
import pathlib
import pickle
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union, Mapping

import dask.distributed
import scipy.sparse
@@ -198,6 +198,7 @@ def __init__(
metric: Optional[Scorer] = None,
scoring_functions: Optional[List[Scorer]] = None,
load_models: bool = True,
dataset_compression: Union[bool, Mapping[str, Any]] = True
):

"""