From c111acb84adf4ac838301e08811f0d7a05f8679a Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 26 Mar 2021 17:47:14 +0100 Subject: [PATCH 1/9] WIP: add FAQ --- doc/conf.py | 1 + doc/index.rst | 1 + doc/installation.rst | 27 ++++--- doc/manual.rst | 4 +- .../example_interpretable_models.py | 75 +++++++++++++++++++ .../example_extending_classification.py | 2 +- 6 files changed, 96 insertions(+), 14 deletions(-) create mode 100644 examples/40_advanced/example_interpretable_models.py diff --git a/doc/conf.py b/doc/conf.py index bb5ac7480c..e4a85fe336 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -184,6 +184,7 @@ ('Examples', 'examples/index'), ('API', 'api'), ('Extending', 'extending'), + ('FAQ', 'faq'), ], # Render the next and previous page links in navbar. (Default: true) diff --git a/doc/index.rst b/doc/index.rst index b6c331c50f..4de22ef01b 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -61,6 +61,7 @@ Manual * :ref:`manual` * :ref:`api` * :ref:`extending` +* :ref:`faq` License diff --git a/doc/installation.rst b/doc/installation.rst index 3a7e387ecc..2167a7d952 100644 --- a/doc/installation.rst +++ b/doc/installation.rst @@ -11,10 +11,15 @@ System requirements auto-sklearn has the following system requirements: -* Linux operating system (for example Ubuntu) `(get Linux here) `_, -* Python (>=3.6) `(get Python here) `_. -* C++ compiler (with C++11 supports) `(get GCC here) `_ and -* SWIG (version 3.0.* is required; >=4.0.0 is not supported) `(get SWIG here) `_. +* Linux operating system (for example Ubuntu) (`get Linux here `_) +* Python (>=3.6) (`get Python here `_), +* C++ compiler (with C++11 supports) (`get GCC here `_). + +In case you try to install Auto-sklearn on a system where no wheel files for the pyrfr package +are provided (see `here `_ for available wheels) you also +need: + +* SWIG (version 3.0.* is required; >=4.0.0 is not supported) (`get SWIG here `_). 
For an explanation of missing Microsoft Windows and MAC OSX support please check the Section `Windows/OSX compatibility`_. @@ -81,9 +86,10 @@ is part of Python's `Unix Specific Services `_ and + `860 `_ for suggestions) * virtual machine * docker image @@ -95,21 +101,18 @@ issues holding us back from actively supporting OSX: * The ``resource`` module cannot enforce a memory limit on a Python process (see `SMAC3/issues/115 `_). -* OSX machines on `travis-ci `_ take more than 30 - minutes to spawn. This makes it impossible for us to run unit tests for - *auto-sklearn* and its dependencies `SMAC3 `_ - and `ConfigSpace `_. +* Not all dependencies we are using are set up to work on OSX. In case you're having issues installing the `pyrfr package `_, check out `this installation suggestion on github `_. -Possible other solutions (not tested): +Possible other: * virtual machine * docker image Docker Image -========================= +============ A Docker image is also provided on dockerhub. To download from dockerhub, use: diff --git a/doc/manual.rst b/doc/manual.rst index a8d9605cf7..b051fcf9bc 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -78,6 +78,8 @@ For a full list please have a look at the source code (in `autosklearn/pipeline/ * `Regressors `_ * `Preprocessors `_ +We do also provide an example `on how to restrict the classifiers to search over `_. + Turning off preprocessing ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -152,7 +154,7 @@ Parallel computation In it's default mode, *auto-sklearn* already uses two cores. The first one is used for model building, the second for building an ensemble every time a new -machine learning model has finished training. An example on how to do this sequentially (first searching for individual models, and then building an ensemble from them) can be seen in `sequential auto-sklearn example `_. +machine learning model has finished training. 
An example on how to do this sequentially (first searching for individual models, and then building an ensemble from them) can be seen in `sequential auto-sklearn example `_. Nevertheless, *auto-sklearn* also supports parallel Bayesian optimization via the use of `Dask.distributed `_. By providing the arguments ``n_jobs`` to the estimator construction, one can control the number of cores available to *auto-sklearn* (As exemplified in `sequential auto-sklearn example `_). Distributed processes are also supported by providing a custom client object to *auto-sklearn* like in the example: `sequential auto-sklearn example `_. When multiple cores are available, *auto-sklearn* diff --git a/examples/40_advanced/example_interpretable_models.py b/examples/40_advanced/example_interpretable_models.py new file mode 100644 index 0000000000..da8a22fa56 --- /dev/null +++ b/examples/40_advanced/example_interpretable_models.py @@ -0,0 +1,75 @@ +# -*- encoding: utf-8 -*- +""" +==================== +Interpretable models +==================== + +The following example shows how to inspect the models which *auto-sklearn* +optimizes over and how to restrict them to an interpretable subset. +""" +import sklearn.datasets +import sklearn.metrics + +import autosklearn.classification + + +############################################################################ +# Show available classification models +# ==================================== +# +# We will first list all classifiers Auto-sklearn chooses from. A similar +# call is available for preprocessors (see below) and regression (not shown) +# as well. 
+ +import autosklearn.pipeline.components.classification +for name in autosklearn.pipeline.components.classification.ClassifierChoice.get_components(): + print(name) + +############################################################################ +# Show available preprocessors +# ============================ + +import autosklearn.pipeline.components.feature_preprocessing +for name in autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice.get_components(): + print(name) + +############################################################################ +# Data Loading +# ============ + +X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) +X_train, X_test, y_train, y_test = \ + sklearn.model_selection.train_test_split(X, y, random_state=1) + +############################################################################ +# Build and fit a classifier +# ========================== +# +# We will now only use a subset of the given classifiers and preprocessors. +# Furthermore, we will restrict the ensemble size to ``1`` to only use the +# single best model in the end. However, we would like to note that the +# choice of which models is deemed interpretable is very much up to the user +# and can change from use case to use case. 
+ +automl = autosklearn.classification.AutoSklearnClassifier( + time_left_for_this_task=120, + per_run_time_limit=30, + tmp_folder='/tmp/autosklearn_interpretable_models_example_tmp', + include_estimators=['decision_tree', 'lda', 'sgd'], + include_preprocessors=['no_preprocessing', 'polynomial', 'select_percentile_classification'], + ensemble_size=1, +) +automl.fit(X_train, y_train, dataset_name='breast_cancer') + +############################################################################ +# Print the final ensemble constructed by auto-sklearn +# ==================================================== + +print(automl.show_models()) + +########################################################################### +# Get the Score of the final ensemble +# =================================== + +predictions = automl.predict(X_test) +print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions)) diff --git a/examples/80_extending/example_extending_classification.py b/examples/80_extending/example_extending_classification.py index 888e55dbbd..daa1141216 100644 --- a/examples/80_extending/example_extending_classification.py +++ b/examples/80_extending/example_extending_classification.py @@ -130,7 +130,7 @@ def get_hyperparameter_search_space(dataset_properties=None): # ============================== clf = autosklearn.classification.AutoSklearnClassifier( - time_left_for_this_task=30, + time_left_for_this_task=3600, per_run_time_limit=10, include_estimators=['MLPClassifier'], # Bellow two flags are provided to speed up calculations From b7cd9ba9a045280027d4519920bddef40b0e7d16 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 26 Mar 2021 17:49:39 +0100 Subject: [PATCH 2/9] undo accidental change --- examples/80_extending/example_extending_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/80_extending/example_extending_classification.py b/examples/80_extending/example_extending_classification.py index 
daa1141216..888e55dbbd 100644 --- a/examples/80_extending/example_extending_classification.py +++ b/examples/80_extending/example_extending_classification.py @@ -130,7 +130,7 @@ def get_hyperparameter_search_space(dataset_properties=None): # ============================== clf = autosklearn.classification.AutoSklearnClassifier( - time_left_for_this_task=3600, + time_left_for_this_task=30, per_run_time_limit=10, include_estimators=['MLPClassifier'], # Bellow two flags are provided to speed up calculations From c57b8bd1745a4b456672658d21e7fcc3258fe30a Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 26 Mar 2021 17:50:17 +0100 Subject: [PATCH 3/9] check in actual FAQ --- doc/faq.rst | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 doc/faq.rst diff --git a/doc/faq.rst b/doc/faq.rst new file mode 100644 index 0000000000..2f243d77ef --- /dev/null +++ b/doc/faq.rst @@ -0,0 +1,150 @@ +:orphan: + +.. _manual: + +=== +FAQ +=== + +Errors +====== + +Auto-sklearn is extremely memory hungry in a sequential setting +--------------------------------------------------------------- + +Auto-sklearn can appear very memory hungry (i.e. requiring a lot of memory for small datasets) due +to the use of ``fork`` for creating new processes when running in sequential manner (if this +happens in a parallel setting or if you pass your own dask client this is due to a different +issue). + +Let's go into some more detail and discuss how to fix it: +Auto-sklearn executes each machine learning algorithm in its own process to be able to apply a +memory limit and a time limit. To start such a process, Python gives three options: ``fork``, +``forkserver`` and ``spawn``. The default ``fork`` copies the whole process memory into the +subprocess. If the main process already uses 1.5GB of main memory and we apply a 3GB memory +limit to Auto-sklearn, it will only be able to use 1.5GB of that. 
We would have loved to use +``forkserver`` or ``spawn`` instead, which both don't suffer from this issue (and have some +further improvements, see `here `_), but +they require the Auto-sklearn code to be guarded by ``if __name__ == "__main__"`` or executed in a +notebook and we decided that we do not want to require this by default. + +There are now two possible solutions: + +1. Use parallel Auto-sklearn: if you use Auto-sklean in parallel, it defaults to ``forkserver`` + as the parallelization mechanism itself requires Auto-sklearn the code to be guarded. +2. Pass a `dask client `_. If the user passes + a dask client, Auto-sklearn can no longer assume that it runs in sequential mode and will use + a ``forkserver`` to start new processes. + +We therefore suggest using one of the above settings by default. + +Auto-sklearn is extremely memory hungry in a sequential setting +--------------------------------------------------------------- + +When running Auto-sklearn in a parallel setting it starts new processes for evaluating machine +learning models using the ``forkserver`` mechanism. If not all code in the main script is guarded +by ``if __name__ == "__main__"`` it is executed for each subprocess. If now part of the code that +is not guarded is the code that is loading your dataset, it is executed for every evaluation of a +machine learning algorithm, blocking your RAM. + +We therefore suggest moving all code inside the main block or functions. + +Auto-sklearn crashes with a segmentation fault +---------------------------------------------- + +Please make sure that you have read and followed the :ref:`installation` section! In case +everything is set up correctly, this is most likely due to the dependency +`pyrfr `_ not being compiled correctly. If this is the +case please execute + +.. code:: python + + import pyrfr.regression as reg + data = reg.default_data_container(64) + +If this fails, the pyrfr dependency is indeed not compiled correctly. 
We advice you to do the +following: + +1. Check if you can use a pre-compiled version of the pyrfr to avoid compiling it yourself. +2. If you are not yet using Conda, consider using it. +3. Install correct build dependencies before installing the pyrfr, you can check the following + github issues for suggestions: `1025 `_, + `856 `_ + +Log files and output +==================== + +Where does Auto-sklearn output files by default? +------------------------------------------------ + +*auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can +be used to inspect the behavior auto Auto-sklearn. Each run of Auto-sklearn requires +its own directory *auto-sklearn* requests a temporary directory from +Python, which by default is located under ``/tmp`` and starts with ``autosklearn_tmp_`` followed +by a random string. + +The :class:`autosklearn.classification.AutoSklearnClassifier` and all other *auto-sklearn* +estimators accept the argument ``tmp_directory`` which change where such output is written to. + +There's an additional argument ``output_directory`` which can be passed to *auto-sklearn* and it +controls where test predictions of the ensemble are stored if the test set is passed to ``fit()``. + +Auto-sklearn eats up all my disk space +-------------------------------------- + +*auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can +be used to inspect the behavior auto Auto-sklearn. By default, *auto-sklearn* stores around 50 +models and their predictions on the validation data (which is a subset of the training data in +case of holdout and the full training data in case of cross-validation) on the hard drive. Models +are removed everytime the ensemble builder finishes an iteration, which means that the number of +models stored on disk is only an approximation. 
One can therefore change the number of models +that will be stored on disk by passing an integer for the argument ``max_models_on_disc``. + +As the number of models is only an indicator of the disk space used it is also possible to pass +the memory in MB the models are allowed to use as a float. As above this is rather a guideline on +how much memory is used as the models are removed from disk everytime the ensemble builder +finishes an iteration. Especially when running in parallel it can happen that multiple models are +constructed during one run of the ensemble builder. + +Extending +========= + +* Will we add non-scikit-learn models? +* How to add new models? +* can the preprocessing be changed? + +Usage +===== + +Only use interpretable models +----------------------------- + +Issue #1033 + +Auto-sklearn can be restricted to only use interpretable models and preprocessing algorithms. +Please see the section :ref:`Restricting the searchspace` to learn how to restrict the models +which are searched over or see +`this example `_. + +We don't provide a judgement which of the models are interpretable as this is very much up to the +specific use case, but would like to note that decision trees and linear models usually most +interpretable. + +Passing the number of models to try +----------------------------------- + +In certain cases, for example for debugging, it can be helpful to limit the number of +models to try. We do not provide this as an argument in the API as we believe that it +should NOT be used in practice, but that the user should rather provide time limits. +An example on how to add the number of models to try as an additional stopping condition +can be found `in this github issue `_. +Please note that Auto-sklearn will stop when either the time limit or the number of +models termination condition is reached. + +Ensemble contains only a dummy model +------------------------------------ + +3. 
How to set useful budgets: https://github.com/automl/auto-sklearn/issues/57 + +Parallel processing and oversubscription +---------------------------------------- From 2d9499130cdc75943eed8dded01a282487944ee8 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 26 Mar 2021 18:17:27 +0100 Subject: [PATCH 4/9] some more answers --- doc/faq.rst | 52 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/doc/faq.rst b/doc/faq.rst index 2f243d77ef..bd18bee249 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -106,12 +106,31 @@ how much memory is used as the models are removed from disk everytime the ensemb finishes an iteration. Especially when running in parallel it can happen that multiple models are constructed during one run of the ensemble builder. -Extending -========= +Available machine learning models +================================= -* Will we add non-scikit-learn models? -* How to add new models? -* can the preprocessing be changed? +Will non-scikit-learn models be added to Auto-sklearn? +------------------------------------------------------ + +The short answer is unfortunately no. + +The long answer answer is a bit more nuanced: maintaining Auto-sklearn requires a lot of time and +effort, which would grow even larger when depending on more libraries. Also, adding more +libraries would require us to generate meta-data more often. Lastly, having more choices does not +guarantee a better performance for most users as having more choices demands a longer search for +good models and can lead to more overfitting. + +Nevertheless, everyone can still add his or her favorite model to Auto-sklearn's search space by +following the `examples on how to extend Auto-sklearn +`_. + +If there is interest in creating a auto-sklearn-contrib repository with 3rd-party models please +open an issue for that. + +Can the preprocessing be disabled +--------------------------------- + +No, but we're working on that. 
Usage ===== @@ -119,8 +138,6 @@ Usage Only use interpretable models ----------------------------- -Issue #1033 - Auto-sklearn can be restricted to only use interpretable models and preprocessing algorithms. Please see the section :ref:`Restricting the searchspace` to learn how to restrict the models which are searched over or see @@ -144,7 +161,28 @@ models termination condition is reached. Ensemble contains only a dummy model ------------------------------------ +TODO + 3. How to set useful budgets: https://github.com/automl/auto-sklearn/issues/57 Parallel processing and oversubscription ---------------------------------------- + +TODO + +Meta-Learning +============= + +Which datasets are used for meta-learning? +------------------------------------------ + +We updated the list of datasets used for meta-learning several times and this list now differes +significantly from the original 140 datasets we used in 2015 when the paper and the package were +released. An up-to-date list of `OpenML task IDs `_ can be found +on `github `_ + +Which meta-features are used for meta-learning? +----------------------------------------------- + +We do not have a user guide on meta-features but they are all pretty simple and can be found +`in the source code `_. From ab561704ca55ad7e03aa6cb8b14614ae72e0859c Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 1 Apr 2021 10:46:49 +0200 Subject: [PATCH 5/9] add section to FAQ --- doc/faq.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/faq.rst b/doc/faq.rst index bd18bee249..8efd9dc9e7 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -186,3 +186,6 @@ Which meta-features are used for meta-learning? We do not have a user guide on meta-features but they are all pretty simple and can be found `in the source code `_. + +How is the meta-data generated? 
+------------------------------- From 75aed82e004b5097cab6221c91aaec0f4574b023 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 21 May 2021 14:00:49 +0200 Subject: [PATCH 6/9] Update FAQ based on feedback --- doc/faq.rst | 114 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 38 deletions(-) diff --git a/doc/faq.rst b/doc/faq.rst index 8efd9dc9e7..3c078eaecf 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -22,30 +22,38 @@ Auto-sklearn executes each machine learning algorithm in its own process to be a memory limit and a time limit. To start such a process, Python gives three options: ``fork``, ``forkserver`` and ``spawn``. The default ``fork`` copies the whole process memory into the subprocess. If the main process already uses 1.5GB of main memory and we apply a 3GB memory -limit to Auto-sklearn, it will only be able to use 1.5GB of that. We would have loved to use -``forkserver`` or ``spawn`` instead, which both don't suffer from this issue (and have some -further improvements, see `here `_), but -they require the Auto-sklearn code to be guarded by ``if __name__ == "__main__"`` or executed in a -notebook and we decided that we do not want to require this by default. +limit to Auto-sklearn, executing a machine learning pipeline is limited to use at most 1.5GB. +We would have loved to use ``forkserver`` or ``spawn`` as the default option instead, which both +copy only relevant data into the subprocess and thereby alleaviate the issue of eating up a lot +of your main memory +(and also do not suffer from potential deadlocks as ``fork`` does, see +`here `_), +but they require the Auto-sklearn code to be guarded by ``if __name__ == "__main__"`` or executed +in a notebook and we decided that we do not want to require this by default. There are now two possible solutions: -1. 
Use parallel Auto-sklearn: if you use Auto-sklean in parallel, it defaults to ``forkserver`` - as the parallelization mechanism itself requires Auto-sklearn the code to be guarded. +1. Use Auto-sklearn in parallel: if you use Auto-sklearn in parallel, it defaults to ``forkserver`` + as the parallelization mechanism itself requires the Auto-sklearn code to be guarded. Please + find more information on how to do this in the following two examples: + + 1. :ref:`sphx_glr_examples_60_search_example_parallel_n_jobs.py` + 2. :ref:`sphx_glr_examples_60_search_example_parallel_manual_spawning_cli.py` + 2. Pass a `dask client `_. If the user passes a dask client, Auto-sklearn can no longer assume that it runs in sequential mode and will use a ``forkserver`` to start new processes. We therefore suggest using one of the above settings by default. -Auto-sklearn is extremely memory hungry in a sequential setting ---------------------------------------------------------------- +Auto-sklearn is extremely memory hungry in a parallel setting +------------------------------------------------------------- When running Auto-sklearn in a parallel setting it starts new processes for evaluating machine learning models using the ``forkserver`` mechanism. If not all code in the main script is guarded -by ``if __name__ == "__main__"`` it is executed for each subprocess. If now part of the code that -is not guarded is the code that is loading your dataset, it is executed for every evaluation of a -machine learning algorithm, blocking your RAM. +by ``if __name__ == "__main__"`` it is executed for each subprocess. If such code is for example +loading your dataset, it is executed for every evaluation of a machine learning algorithm, +blocking your RAM. We therefore suggest moving all code inside the main block or functions. @@ -55,19 +63,23 @@ Auto-sklearn crashes with a segmentation fault Please make sure that you have read and followed the :ref:`installation` section! 
In case everything is set up correctly, this is most likely due to the dependency `pyrfr `_ not being compiled correctly. If this is the -case please execute +case please execute: .. code:: python import pyrfr.regression as reg data = reg.default_data_container(64) -If this fails, the pyrfr dependency is indeed not compiled correctly. We advice you to do the +If this fails, the pyrfr dependency is most likely not compiled correctly. We advise you to do the following: -1. Check if you can use a pre-compiled version of the pyrfr to avoid compiling it yourself. -2. If you are not yet using Conda, consider using it. -3. Install correct build dependencies before installing the pyrfr, you can check the following +1. Check if you can use a pre-compiled version of the pyrfr to avoid compiling it yourself. We + provide pre-compiled versions of the pyrfr on `pypi `_. +3. Check if the dependencies specified under :ref:`installation` are correctly installed, + especially that you have ``swig`` and a ``C++`` compiler. +2. If you are not yet using Conda, consider using it; it simplifies installation of the correct + dependencies. +4. Install correct build dependencies before installing the pyrfr, you can check the following github issues for suggestions: `1025 `_, `856 `_ @@ -77,34 +89,40 @@ Log files and output Where does Auto-sklearn output files by default? ------------------------------------------------ -*auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can -be used to inspect the behavior auto Auto-sklearn. Each run of Auto-sklearn requires -its own directory *auto-sklearn* requests a temporary directory from +*Auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can +be used to inspect the behavior of Auto-sklearn. Each run of Auto-sklearn requires +its own directory. 
If not provided by the user, *Auto-sklearn* requests a temporary directory from Python, which by default is located under ``/tmp`` and starts with ``autosklearn_tmp_`` followed by a random string. The :class:`autosklearn.classification.AutoSklearnClassifier` and all other *auto-sklearn* estimators accept the argument ``tmp_directory`` which change where such output is written to. -There's an additional argument ``output_directory`` which can be passed to *auto-sklearn* and it +There's an additional argument ``output_directory`` which can be passed to *Auto-sklearn* and it controls where test predictions of the ensemble are stored if the test set is passed to ``fit()``. Auto-sklearn eats up all my disk space -------------------------------------- -*auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can -be used to inspect the behavior auto Auto-sklearn. By default, *auto-sklearn* stores around 50 +*Auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can +be used to inspect the behavior of Auto-sklearn. By default, *Auto-sklearn* stores 50 models and their predictions on the validation data (which is a subset of the training data in -case of holdout and the full training data in case of cross-validation) on the hard drive. Models -are removed everytime the ensemble builder finishes an iteration, which means that the number of -models stored on disk is only an approximation. One can therefore change the number of models -that will be stored on disk by passing an integer for the argument ``max_models_on_disc``. +case of holdout and the full training data in case of cross-validation) on the hard drive. +Redundant models (i.e. when we have more than 50 models) are removed everytime the ensemble builder +finishes an iteration, which means that the number of models stored on disk can temporarily be +higher if a model is output while the ensemble builder is running. 
One can therefore change the +number of models that will be stored on disk by passing an integer for the argument +``max_models_on_disc``. As the number of models is only an indicator of the disk space used it is also possible to pass -the memory in MB the models are allowed to use as a float. As above this is rather a guideline on -how much memory is used as the models are removed from disk everytime the ensemble builder -finishes an iteration. Especially when running in parallel it can happen that multiple models are -constructed during one run of the ensemble builder. +the memory in MB the models are allowed to use as a ``float``. As above this is rather a +guideline on how much memory is used as redundant models are only removed from disk when the +ensemble builder finishes an iteration. + +.. note:: + + Especially when running in parallel it can happen that multiple models are constructed during + one run of the ensemble builder and thus exceed the disk limit. Available machine learning models ================================= @@ -112,7 +130,7 @@ Available machine learning models Will non-scikit-learn models be added to Auto-sklearn? ------------------------------------------------------ -The short answer is unfortunately no. +The short answer: no. The long answer answer is a bit more nuanced: maintaining Auto-sklearn requires a lot of time and effort, which would grow even larger when depending on more libraries. Also, adding more @@ -124,13 +142,15 @@ Nevertheless, everyone can still add his or her favorite model to Auto-sklearn's following the `examples on how to extend Auto-sklearn `_. -If there is interest in creating a auto-sklearn-contrib repository with 3rd-party models please +If there is interest in creating a Auto-sklearn-contrib repository with 3rd-party models please open an issue for that. Can the preprocessing be disabled --------------------------------- -No, but we're working on that. 
+Feature preprocessing can be disabled as discussed under :ref:`Restricting the searchspace`. Other +preprocessing steps such as one hot encoding, missing feature imputation and normalization cannot +yet be disabled, but we're working on that. Usage ===== @@ -147,11 +167,11 @@ We don't provide a judgement which of the models are interpretable as this is ve specific use case, but would like to note that decision trees and linear models usually most interpretable. -Passing the number of models to try ------------------------------------ +Limiting the number of model evaluations +---------------------------------------- In certain cases, for example for debugging, it can be helpful to limit the number of -models to try. We do not provide this as an argument in the API as we believe that it +model evaluations. We do not provide this as an argument in the API as we believe that it should NOT be used in practice, but that the user should rather provide time limits. An example on how to add the number of models to try as an additional stopping condition can be found `in this github issue `_. @@ -176,11 +196,20 @@ Meta-Learning Which datasets are used for meta-learning? ------------------------------------------ -We updated the list of datasets used for meta-learning several times and this list now differes +We updated the list of datasets used for meta-learning several times and this list now differs significantly from the original 140 datasets we used in 2015 when the paper and the package were released. An up-to-date list of `OpenML task IDs `_ can be found on `github `_ +How can datasets from the meta-data be excluded? +------------------------------------------------ + +For *Auto-sklearn 1.0* one can pass the dataset name via the ``fit()`` function. If a dataset +with the same name is within the meta-data, that dataset will not be used. + +For *Auto-sklearn 2.0* it is not possible to do so because of the method used to construct the +meta-data. 
+ Which meta-features are used for meta-learning? ----------------------------------------------- @@ -189,3 +218,12 @@ We do not have a user guide on meta-features but they are all pretty simple and How is the meta-data generated? ------------------------------- + +We currently generate meta-data the following way. First, for each of the datasets mentioned +above, we run Auto-sklearn without meta-learning for a total of two days on multiple metrics (for +classification these are accuracy, balanced accuracy, log loss and the area under the curve). +Second, for each run we then have a look at each model that improved the score, i.e. the +trajectory of the best known model at a time, and refit it on the whole training data. Third, for +each of these models we then compute all scores we're interested in, these also include other +ones such as F1 and precision. Finally, for each combination of dataset and metric we store the best +model we know of. From 227365a8f3351380a039445be1632a32802c53cc Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 21 May 2021 14:11:10 +0200 Subject: [PATCH 7/9] fix link --- doc/manual.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/manual.rst b/doc/manual.rst index b051fcf9bc..37d2810366 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -78,7 +78,8 @@ For a full list please have a look at the source code (in `autosklearn/pipeline/ * `Regressors `_ * `Preprocessors `_ -We do also provide an example `on how to restrict the classifiers to search over `_. +We do also provide an example +`on how to restrict the classifiers to search over `_. 
Turning off preprocessing ~~~~~~~~~~~~~~~~~~~~~~~~~ From b891c22c1e287229fac331449163023326debe29 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Sun, 23 May 2021 17:20:44 +0200 Subject: [PATCH 8/9] incorporate feedback --- doc/faq.rst | 91 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/doc/faq.rst b/doc/faq.rst index 3c078eaecf..d3120a12c2 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -6,7 +6,7 @@ FAQ === -Errors +Issues ====== Auto-sklearn is extremely memory hungry in a sequential setting @@ -15,7 +15,7 @@ Auto-sklearn is extremely memory hungry in a sequential setting Auto-sklearn can appear very memory hungry (i.e. requiring a lot of memory for small datasets) due to the use of ``fork`` for creating new processes when running in sequential manner (if this happens in a parallel setting or if you pass your own dask client this is due to a different -issue). +issue, see the other issues below). Let's go into some more detail and discuss how to fix it: Auto-sklearn executes each machine learning algorithm in its own process to be able to apply a @@ -28,8 +28,8 @@ copy only relevant data into the subprocess and thereby alleaviate the issue of of your main memory (and also do not suffer from potential deadlocks as ``fork`` does, see `here `_), -but they require the Auto-sklearn code to be guarded by ``if __name__ == "__main__"`` or executed -in a notebook and we decided that we do not want to require this by default. +but they have the downside that code must be guarded by ``if __name__ == "__main__"`` or executed +in a notebook, and we decided that we do not want to require this by default. There are now two possible solutions: @@ -40,22 +40,30 @@ There are now two possible solutions: 1. :ref:`sphx_glr_examples_60_search_example_parallel_n_jobs.py` 2. :ref:`sphx_glr_examples_60_search_example_parallel_manual_spawning_cli.py` + .. 
note:: + + This requires all code to be guarded by ``if __name__ == "__main__"``. + 2. Pass a `dask client `_. If the user passes a dask client, Auto-sklearn can no longer assume that it runs in sequential mode and will use a ``forkserver`` to start new processes. + .. note:: + + This requires all code to be guarded by ``if __name__ == "__main__"``. + We therefore suggest using one of the above settings by default. Auto-sklearn is extremely memory hungry in a parallel setting ------------------------------------------------------------- When running Auto-sklearn in a parallel setting it starts new processes for evaluating machine -learning models using the ``forkserver`` mechanism. If not all code in the main script is guarded -by ``if __name__ == "__main__"`` it is executed for each subprocess. If such code is for example -loading your dataset, it is executed for every evaluation of a machine learning algorithm, -blocking your RAM. +learning models using the ``forkserver`` mechanism. Code that is in the main script and that is +not guarded by ``if __name__ == "__main__"`` will be executed for each subprocess. If, for example, +you are loading your dataset outside of the guarded code, your dataset will be loaded for each +evaluation of a machine learning algorithm and thus blocking your RAM. -We therefore suggest moving all code inside the main block or functions. +We therefore suggest moving all code inside function or the main block. Auto-sklearn crashes with a segmentation fault ---------------------------------------------- @@ -76,7 +84,9 @@ following: 1. Check if you can use a pre-compiled version of the pyrfr to avoid compiling it yourself. We provide pre-compiled versions of the pyrfr on `pypi `_. 3. Check if the dependencies specified under :ref:`installation` are correctly installed, - especially that you have ``swig`` and a ``C++`` compiler. + especially that you have ``swig`` and a ``C++`` compiler. 
If you are using an older version of + the pyrfr (``<=0.8.0``) the dependency on SWIG is stricter and you actually need SWIG3 to + compile the pyrfr.. 2. If you are not yet using Conda, consider using it; it simplifies installation of the correct dependencies. 4. Install correct build dependencies before installing the pyrfr, you can check the following @@ -93,7 +103,9 @@ Where does Auto-sklearn output files by default? be used to inspect the behavior of Auto-sklearn. Each run of Auto-sklearn requires its own directory. If not provided by the user, *Auto-sklearn* requests a temporary directory from Python, which by default is located under ``/tmp`` and starts with ``autosklearn_tmp_`` followed -by a random string. +by a random string. By default, this directory is deleted when the *Auto-sklearn* object is +destroyed. If you want to keep these files you can pass the argument +``delete_tmp_folder_after_terminate=False`` to the *Auto-sklearn* object. The :class:`autosklearn.classification.AutoSklearnClassifier` and all other *auto-sklearn* estimators accept the argument ``tmp_directory`` which change where such output is written to. @@ -108,21 +120,28 @@ Auto-sklearn eats up all my disk space be used to inspect the behavior of Auto-sklearn. By default, *Auto-sklearn* stores 50 models and their predictions on the validation data (which is a subset of the training data in case of holdout and the full training data in case of cross-validation) on the hard drive. -Redundant models (i.e. when we have more than 50 models) are removed everytime the ensemble builder -finishes an iteration, which means that the number of models stored on disk can temporarily be -higher if a model is output while the ensemble builder is running. One can therefore change the -number of models that will be stored on disk by passing an integer for the argument -``max_models_on_disc``. +Redundant models and their predictions (i.e.
when we have more than 50 models) are removed +every time the ensemble builder finishes an iteration, which means that the number of models stored +on disk can temporarily be higher if a model is output while the ensemble builder is running. + +One can therefore change the number of models that will be stored on disk by passing an integer +for the argument ``max_models_on_disc`` to *Auto-sklearn*, for example reduce the number of models +stored on disk if you have space issues. As the number of models is only an indicator of the disk space used it is also possible to pass -the memory in MB the models are allowed to use as a ``float``. As above this is rather a +the memory in MB the models are allowed to use as a ``float``. As above, this is rather a guideline on how much memory is used as redundant models are only removed from disk when the ensemble builder finishes an iteration. .. note:: Especially when running in parallel it can happen that multiple models are constructed during - one run of the ensemble builder and thus exceed the disk limit. + one run of the ensemble builder and thus *Auto-sklearn* can exceed the given limit. + +.. note:: + + These limits do only apply to models and their predictions, but not to other files stored in + the temporary directory such as the log files. Available machine learning models ================================= @@ -138,7 +157,7 @@ libraries would require us to generate meta-data more often. Lastly, having more guarantee a better performance for most users as having more choices demands a longer search for good models and can lead to more overfitting. -Nevertheless, everyone can still add his or her favorite model to Auto-sklearn's search space by +Nevertheless, everyone can still add their favorite model to Auto-sklearn's search space by following the `examples on how to extend Auto-sklearn `_. @@ -181,14 +200,32 @@ models termination condition is reached.
Ensemble contains only a dummy model ------------------------------------ -TODO - -3. How to set useful budgets: https://github.com/automl/auto-sklearn/issues/57 +This is a symptom of the problem that all runs started by Auto-sklearn failed. Usually, the issue +is that the runtime or memory limit was too tight. Please check the output of +``sprint_statistics`` to see the distribution of why runs failed. If there are mostly crashed +runs, please check the log file for further details. If there are mostly runs that exceed the +memory or time limit, please increase the respective limit and rerun the optimization. Parallel processing and oversubscription ---------------------------------------- -TODO +Auto-sklearn wraps scikit-learn and therefore inherits its parallelism implementation. In short, +scikit-learn uses two modes of parallelizing computations: + +1. By using joblib to distribute independent function calls on multiple cores. +2. By using lower level libraries such as OpenMP and numpy to distribute more fine-grained + computation. + +This means that Auto-sklearn can use more resources than expected by the user. For technical +reasons we can only control the 1st way of parallel execution, but not the 2nd. Thus, the user +needs to make sure that the lower level parallelization libraries only use as many cores as +allocated (on a laptop or workstation running a single copy of Auto-sklearn it can be fine to not +adjust this, but when using a compute cluster it is necessary to align the parallelism setting +with the number of requested CPUs). This can be done by setting the following environment +variables: ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, ``BLIS_NUM_THREADS`` and +``OMP_NUM_THREADS``. + +More details can be found in the `scikit-learn docs `_. Meta-Learning ============= @@ -219,6 +256,9 @@ We do not have a user guide on meta-features but they are all pretty simple and How is the meta-data generated?
------------------------------- +Auto-sklearn 1.0 +~~~~~~~~~~~~~~~~ + We currently generate meta-data the following way. First, for each of the datasets mentioned above, we run Auto-sklearn without meta-learning for a total of two days on multiple metrics (for classification these are accuracy, balanced accuracy, log loss and the area under the curve). @@ -227,3 +267,8 @@ trajectory of the best known model at a time, and refit it on the whole training each of these models we then compute all scores we're interested in, these also include other ones such as F1 and precision. Finally, for each combination of dataset and metric we store the best model we know of. + +Auto-sklearn 2.0 +~~~~~~~~~~~~~~~~ + +Please check `our paper `_ for details. From b675954cf059a250cbf2b640e7b7441fe15d5657 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 27 May 2021 14:02:42 +0200 Subject: [PATCH 9/9] include feedback --- doc/faq.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/faq.rst b/doc/faq.rst index d3120a12c2..51df11e7e9 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -63,7 +63,7 @@ not guarded by ``if __name__ == "__main__"`` will be executed for each subproces you are loading your dataset outside of the guarded code, your dataset will be loaded for each evaluation of a machine learning algorithm and thus blocking your RAM. -We therefore suggest moving all code inside function or the main block. +We therefore suggest moving all code inside functions or the main block. Auto-sklearn crashes with a segmentation fault ---------------------------------------------- @@ -86,7 +86,7 @@ following: 3. Check if the dependencies specified under :ref:`installation` are correctly installed, especially that you have ``swig`` and a ``C++`` compiler. If you are using an older version of the pyrfr (``<=0.8.0``) the dependency on SWIG is stricter and you actually need SWIG3 to - compile the pyrfr.. + compile the pyrfr. 2.
If you are not yet using Conda, consider using it; it simplifies installation of the correct dependencies. 4. Install correct build dependencies before installing the pyrfr, you can check the following @@ -129,9 +129,9 @@ for the argument ``max_models_on_disc`` to *Auto-sklearn*, for example reduce th stored on disk if you have space issues. As the number of models is only an indicator of the disk space used it is also possible to pass -the memory in MB the models are allowed to use as a ``float``. As above, this is rather a -guideline on how much memory is used as redundant models are only removed from disk when the -ensemble builder finishes an iteration. +the memory in MB the models are allowed to use as a ``float`` (also via the ``max_models_on_disc`` +argument). As above, this is rather a guideline on how much memory is used as redundant models +are only removed from disk when the ensemble builder finishes an iteration. .. note::