diff --git a/doc/conf.py b/doc/conf.py
index bb5ac7480c..e4a85fe336 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -184,6 +184,7 @@
('Examples', 'examples/index'),
('API', 'api'),
('Extending', 'extending'),
+ ('FAQ', 'faq'),
],
# Render the next and previous page links in navbar. (Default: true)
diff --git a/doc/faq.rst b/doc/faq.rst
new file mode 100644
index 0000000000..51df11e7e9
--- /dev/null
+++ b/doc/faq.rst
@@ -0,0 +1,274 @@
+:orphan:
+
+.. _faq:
+
+===
+FAQ
+===
+
+Issues
+======
+
+Auto-sklearn is extremely memory hungry in a sequential setting
+---------------------------------------------------------------
+
+Auto-sklearn can appear very memory hungry (i.e. requiring a lot of memory for small datasets) due
+to the use of ``fork`` for creating new processes when running in a sequential manner (if this
+happens in a parallel setting or if you pass your own dask client, it is due to a different
+issue; see the other issues below).
+
+Let's go into some more detail and discuss how to fix it:
+Auto-sklearn executes each machine learning algorithm in its own process to be able to apply a
+memory limit and a time limit. To start such a process, Python gives three options: ``fork``,
+``forkserver`` and ``spawn``. The default ``fork`` copies the whole process memory into the
+subprocess. If the main process already uses 1.5GB of main memory and we apply a 3GB memory
+limit to Auto-sklearn, the pipeline executed in the subprocess can only use the remaining 1.5GB.
+We would have loved to use ``forkserver`` or ``spawn`` as the default option instead, which both
+copy only relevant data into the subprocess and thereby alleviate the issue of eating up a lot
+of your main memory
+(and also do not suffer from potential deadlocks as ``fork`` does, see
+`here `_),
+but they have the downside that code must be guarded by ``if __name__ == "__main__"`` or executed
+in a notebook, and we decided that we do not want to require this by default.
+
+There are now two possible solutions:
+
+1. Use Auto-sklearn in parallel: if you use Auto-sklearn in parallel, it defaults to ``forkserver``
+ as the parallelization mechanism itself requires the Auto-sklearn code to be guarded. Please
+ find more information on how to do this in the following two examples:
+
+ 1. :ref:`sphx_glr_examples_60_search_example_parallel_n_jobs.py`
+ 2. :ref:`sphx_glr_examples_60_search_example_parallel_manual_spawning_cli.py`
+
+ .. note::
+
+ This requires all code to be guarded by ``if __name__ == "__main__"``.
+
+2. Pass a `dask client `_. If the user passes
+ a dask client, Auto-sklearn can no longer assume that it runs in sequential mode and will use
+ a ``forkserver`` to start new processes.
+
+ .. note::
+
+ This requires all code to be guarded by ``if __name__ == "__main__"``.
+
+We therefore suggest using one of the above settings by default.
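+
+For illustration, the following is a minimal sketch of such a guarded script (the dataset and the
+estimator arguments are placeholders for your own setup):
+
+.. code:: python
+
+ import autosklearn.classification
+ import sklearn.datasets
+
+ if __name__ == "__main__":
+     # everything below the guard only runs in the main process
+     X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
+     automl = autosklearn.classification.AutoSklearnClassifier(
+         time_left_for_this_task=120,
+         n_jobs=2,  # running in parallel makes Auto-sklearn use forkserver
+     )
+     automl.fit(X, y)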
+
+Auto-sklearn is extremely memory hungry in a parallel setting
+-------------------------------------------------------------
+
+When running Auto-sklearn in a parallel setting it starts new processes for evaluating machine
+learning models using the ``forkserver`` mechanism. Code that is in the main script and that is
+not guarded by ``if __name__ == "__main__"`` will be executed for each subprocess. If, for example,
+you are loading your dataset outside of the guarded code, your dataset will be loaded for each
+evaluation of a machine learning algorithm, needlessly filling up your RAM.
+
+We therefore suggest moving all code inside functions or the main block.
+
+Auto-sklearn crashes with a segmentation fault
+----------------------------------------------
+
+Please make sure that you have read and followed the :ref:`installation` section! In case
+everything is set up correctly, this is most likely due to the dependency
+`pyrfr `_ not being compiled correctly. If this is the
+case please execute:
+
+.. code:: python
+
+ import pyrfr.regression as reg
+ data = reg.default_data_container(64)
+
+If this fails, the pyrfr dependency is most likely not compiled correctly. We advise you to do the
+following:
+
+1. Check if you can use a pre-compiled version of the pyrfr to avoid compiling it yourself. We
+ provide pre-compiled versions of the pyrfr on `pypi `_.
+2. Check if the dependencies specified under :ref:`installation` are correctly installed,
+ especially that you have ``swig`` and a ``C++`` compiler. If you are using an older version of
+ the pyrfr (``<=0.8.0``) the dependency on SWIG is stricter and you actually need SWIG3 to
+ compile the pyrfr.
+3. If you are not yet using Conda, consider using it; it simplifies installation of the correct
+ dependencies.
+4. Install the correct build dependencies before installing the pyrfr; you can check the following
+ GitHub issues for suggestions: `1025 `_,
+ `856 `_
+
+Log files and output
+====================
+
+Where does Auto-sklearn output files by default?
+------------------------------------------------
+
+*Auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can
+be used to inspect the behavior of Auto-sklearn. Each run of Auto-sklearn requires
+its own directory. If not provided by the user, *Auto-sklearn* requests a temporary directory from
+Python, which by default is located under ``/tmp`` and starts with ``autosklearn_tmp_`` followed
+by a random string. By default, this directory is deleted when the *Auto-sklearn* object is
+destroyed. If you want to keep these files you can pass the argument
+``delete_tmp_folder_after_terminate=False`` to the *Auto-sklearn* object.
+
+The :class:`autosklearn.classification.AutoSklearnClassifier` and all other *auto-sklearn*
+estimators accept the argument ``tmp_folder`` which changes where such output is written to.
+
+There's an additional argument ``output_folder`` which can be passed to *Auto-sklearn*; it
+controls where test predictions of the ensemble are stored if the test set is passed to ``fit()``.
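+
+For example, a run whose output you would like to keep and inspect afterwards could be set up as
+follows (a sketch; the chosen path is only an example):
+
+.. code:: python
+
+ import autosklearn.classification
+
+ automl = autosklearn.classification.AutoSklearnClassifier(
+     time_left_for_this_task=60,
+     # write logs, models and predictions to a known location ...
+     tmp_folder='/tmp/autosklearn_faq_example_tmp',
+     # ... and keep that directory after the run finishes
+     delete_tmp_folder_after_terminate=False,
+ )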
+
+Auto-sklearn eats up all my disk space
+--------------------------------------
+
+*Auto-sklearn* heavily uses the hard drive to store temporary data, models and log files which can
+be used to inspect the behavior of Auto-sklearn. By default, *Auto-sklearn* stores 50
+models and their predictions on the validation data (which is a subset of the training data in
+case of holdout and the full training data in case of cross-validation) on the hard drive.
+Redundant models and their predictions (i.e. when we have more than 50 models) are removed
+every time the ensemble builder finishes an iteration, which means that the number of models stored
+on disk can temporarily be higher if a model is output while the ensemble builder is running.
+
+You can therefore change the number of models that will be stored on disk by passing an integer
+for the argument ``max_models_on_disc`` to *Auto-sklearn*, for example reducing the number of
+models stored on disk if you run into space issues.
+
+As the number of models is only a rough indicator of the disk space used, it is also possible to
+pass the disk space in MB the models are allowed to use as a ``float`` (also via the
+``max_models_on_disc`` argument). As above, this is a guideline rather than a hard limit, as
+redundant models are only removed from disk when the ensemble builder finishes an iteration.
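+
+As a sketch, both variants of the argument look as follows:
+
+.. code:: python
+
+ import autosklearn.classification
+
+ # keep at most 10 models (and their predictions) on disk
+ automl = autosklearn.classification.AutoSklearnClassifier(max_models_on_disc=10)
+
+ # alternatively, allow the stored models to use roughly 500 MB of disk space
+ automl = autosklearn.classification.AutoSklearnClassifier(max_models_on_disc=500.0)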
+
+.. note::
+
+ Especially when running in parallel it can happen that multiple models are constructed during
+ one run of the ensemble builder and thus *Auto-sklearn* can exceed the given limit.
+
+.. note::
+
+ These limits only apply to models and their predictions, and not to other files stored in
+ the temporary directory, such as the log files.
+
+Available machine learning models
+=================================
+
+Will non-scikit-learn models be added to Auto-sklearn?
+------------------------------------------------------
+
+The short answer: no.
+
+The long answer is a bit more nuanced: maintaining Auto-sklearn requires a lot of time and
+effort, which would grow even larger when depending on more libraries. Also, adding more
+libraries would require us to generate meta-data more often. Lastly, having more choices does not
+guarantee better performance for most users, as more choices demand a longer search for
+good models and can lead to more overfitting.
+
+Nevertheless, everyone can still add their favorite model to Auto-sklearn's search space by
+following the `examples on how to extend Auto-sklearn
+`_.
+
+If there is interest in creating an Auto-sklearn-contrib repository with 3rd-party models please
+open an issue for that.
+
+Can the preprocessing be disabled?
+----------------------------------
+
+Feature preprocessing can be disabled as discussed under :ref:`Restricting the searchspace`. Other
+preprocessing steps such as one hot encoding, missing feature imputation and normalization cannot
+yet be disabled, but we're working on that.
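+
+As a sketch, restricting the search space to disable feature preprocessing looks as follows:
+
+.. code:: python
+
+ import autosklearn.classification
+
+ # search only over pipelines that use the data as-is instead of
+ # searching over feature preprocessors
+ automl = autosklearn.classification.AutoSklearnClassifier(
+     include_preprocessors=['no_preprocessing'],
+ )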
+
+Usage
+=====
+
+Only use interpretable models
+-----------------------------
+
+Auto-sklearn can be restricted to only use interpretable models and preprocessing algorithms.
+Please see the section :ref:`Restricting the searchspace` to learn how to restrict the models
+which are searched over or see
+`this example `_.
+
+We don't provide a judgement on which models are interpretable as this is very much up to the
+specific use case, but would like to note that decision trees and linear models are usually the
+most interpretable.
+
+Limiting the number of model evaluations
+----------------------------------------
+
+In certain cases, for example for debugging, it can be helpful to limit the number of
+model evaluations. We do not provide this as an argument in the API as we believe that it
+should NOT be used in practice, but that the user should rather provide time limits.
+An example on how to add the number of models to try as an additional stopping condition
+can be found `in this github issue `_.
+Please note that Auto-sklearn will stop as soon as either the time limit or the number of
+models termination condition is reached.
+
+Ensemble contains only a dummy model
+------------------------------------
+
+This is a symptom of the problem that all runs started by Auto-sklearn failed. Usually, the issue
+is that the runtime or memory limits were too tight. Please check the output of
+``sprint_statistics`` to see the distribution of why runs failed. If there are mostly crashed
+runs, please check the log file for further details. If there are mostly runs that exceed the
+memory or time limit, please increase the respective limit and rerun the optimization.
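+
+Assuming ``automl`` is a fitted *Auto-sklearn* estimator, the statistics can be printed as follows:
+
+.. code:: python
+
+ # prints, among others, how many runs succeeded, crashed or exceeded a limit
+ print(automl.sprint_statistics())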
+
+Parallel processing and oversubscription
+----------------------------------------
+
+Auto-sklearn wraps scikit-learn and therefore inherits its parallelism implementation. In short,
+scikit-learn uses two modes of parallelizing computations:
+
+1. By using joblib to distribute independent function calls on multiple cores.
+2. By using lower level libraries such as OpenMP and numpy to distribute more fine-grained
+ computation.
+
+This means that Auto-sklearn can use more resources than expected by the user. For technical
+reasons we can only control the 1st way of parallel execution, but not the 2nd. Thus, the user
+needs to make sure that the lower level parallelization libraries only use as many cores as
+allocated (on a laptop or workstation running a single copy of Auto-sklearn it can be fine to not
+adjust this, but when using a compute cluster it is necessary to align the parallelism setting
+with the number of requested CPUs). This can be done by setting the following environment
+variables: ``MKL_NUM_THREADS``, ``OPENBLAS_NUM_THREADS``, ``BLIS_NUM_THREADS`` and
+``OMP_NUM_THREADS``.
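+
+As a sketch, these variables can also be set from within Python; note that they must be set before
+numpy and scipy are imported, and that the value ``"1"`` is only an example:
+
+.. code:: python
+
+ import os
+
+ # allow the lower level parallelization libraries to use only one core each
+ os.environ['OMP_NUM_THREADS'] = '1'
+ os.environ['OPENBLAS_NUM_THREADS'] = '1'
+ os.environ['MKL_NUM_THREADS'] = '1'
+ os.environ['BLIS_NUM_THREADS'] = '1'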
+
+More details can be found in the `scikit-learn docs `_.
+
+Meta-Learning
+=============
+
+Which datasets are used for meta-learning?
+------------------------------------------
+
+We updated the list of datasets used for meta-learning several times and this list now differs
+significantly from the original 140 datasets we used in 2015 when the paper and the package were
+released. An up-to-date list of `OpenML task IDs `_ can be found
+on `github `_.
+
+How can datasets from the meta-data be excluded?
+------------------------------------------------
+
+For *Auto-sklearn 1.0* one can pass the dataset name via the ``fit()`` function. If a dataset
+with the same name is present in the meta-data, that dataset will not be used.
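+
+For example, assuming the dataset at hand is stored in the meta-data under the name
+``breast_cancer``, it can be excluded as follows (``automl`` being an *Auto-sklearn 1.0*
+estimator):
+
+.. code:: python
+
+ automl.fit(X_train, y_train, dataset_name='breast_cancer')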
+
+For *Auto-sklearn 2.0* it is not possible to do so because of the method used to construct the
+meta-data.
+
+Which meta-features are used for meta-learning?
+-----------------------------------------------
+
+We do not have a user guide on meta-features but they are all pretty simple and can be found
+`in the source code `_.
+
+How is the meta-data generated?
+-------------------------------
+
+Auto-sklearn 1.0
+~~~~~~~~~~~~~~~~
+
+We currently generate meta-data in the following way. First, for each of the datasets mentioned
+above, we run Auto-sklearn without meta-learning for a total of two days on multiple metrics (for
+classification these are accuracy, balanced accuracy, log loss and the area under the curve).
+Second, for each run we then have a look at each model that improved the score, i.e. the
+trajectory of the best known model at a time, and refit it on the whole training data. Third, for
+each of these models we then compute all scores we're interested in; these also include other
+ones such as F1 and precision. Finally, for each combination of dataset and metric we store the
+best model we know of.
+
+Auto-sklearn 2.0
+~~~~~~~~~~~~~~~~
+
+Please check `our paper `_ for details.
diff --git a/doc/index.rst b/doc/index.rst
index b6c331c50f..4de22ef01b 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -61,6 +61,7 @@ Manual
* :ref:`manual`
* :ref:`api`
* :ref:`extending`
+* :ref:`faq`
License
diff --git a/doc/installation.rst b/doc/installation.rst
index 3a7e387ecc..2167a7d952 100644
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -11,10 +11,15 @@ System requirements
auto-sklearn has the following system requirements:
-* Linux operating system (for example Ubuntu) `(get Linux here) `_,
-* Python (>=3.6) `(get Python here) `_.
-* C++ compiler (with C++11 supports) `(get GCC here) `_ and
-* SWIG (version 3.0.* is required; >=4.0.0 is not supported) `(get SWIG here) `_.
+* Linux operating system (for example Ubuntu) (`get Linux here `_),
+* Python (>=3.6) (`get Python here `_),
+* C++ compiler (with C++11 support) (`get GCC here `_).
+
+In case you try to install Auto-sklearn on a system where no wheel files for the pyrfr package
+are provided (see `here `_ for available wheels), you also
+need:
+
+* SWIG (version 3.0.* is required; >=4.0.0 is not supported) (`get SWIG here `_).
For an explanation of missing Microsoft Windows and MAC OSX support please
check the Section `Windows/OSX compatibility`_.
@@ -81,9 +86,10 @@ is part of Python's `Unix Specific Services `_ and
+ `860 `_ for suggestions)
* virtual machine
* docker image
@@ -95,21 +101,18 @@ issues holding us back from actively supporting OSX:
* The ``resource`` module cannot enforce a memory limit on a Python process
(see `SMAC3/issues/115 `_).
-* OSX machines on `travis-ci `_ take more than 30
- minutes to spawn. This makes it impossible for us to run unit tests for
- *auto-sklearn* and its dependencies `SMAC3 `_
- and `ConfigSpace `_.
+* Not all dependencies we are using are set up to work on OSX.
In case you're having issues installing the `pyrfr package `_, check out
`this installation suggestion on github `_.
-Possible other solutions (not tested):
+Possible other solutions:
* virtual machine
* docker image
Docker Image
-=========================
+============
A Docker image is also provided on dockerhub. To download from dockerhub,
use:
diff --git a/doc/manual.rst b/doc/manual.rst
index a8d9605cf7..37d2810366 100644
--- a/doc/manual.rst
+++ b/doc/manual.rst
@@ -78,6 +78,9 @@ For a full list please have a look at the source code (in `autosklearn/pipeline/
* `Regressors `_
* `Preprocessors `_
+We also provide an example
+`on how to restrict the classifiers to search over `_.
+
Turning off preprocessing
~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -152,7 +155,7 @@ Parallel computation
In it's default mode, *auto-sklearn* already uses two cores. The first one is
used for model building, the second for building an ensemble every time a new
-machine learning model has finished training. An example on how to do this sequentially (first searching for individual models, and then building an ensemble from them) can be seen in `sequential auto-sklearn example `_.
+machine learning model has finished training. An example on how to do this sequentially (first searching for individual models, and then building an ensemble from them) can be seen in `sequential auto-sklearn example `_.
Nevertheless, *auto-sklearn* also supports parallel Bayesian optimization via the use of `Dask.distributed `_. By providing the arguments ``n_jobs`` to the estimator construction, one can control the number of cores available to *auto-sklearn* (As exemplified in `sequential auto-sklearn example `_). Distributed processes are also supported by providing a custom client object to *auto-sklearn* like in the
example: `sequential auto-sklearn example `_. When multiple cores are available, *auto-sklearn*
diff --git a/examples/40_advanced/example_interpretable_models.py b/examples/40_advanced/example_interpretable_models.py
new file mode 100644
index 0000000000..da8a22fa56
--- /dev/null
+++ b/examples/40_advanced/example_interpretable_models.py
@@ -0,0 +1,75 @@
+# -*- encoding: utf-8 -*-
+"""
+====================
+Interpretable models
+====================
+
+The following example shows how to inspect the models which *auto-sklearn*
+optimizes over and how to restrict them to an interpretable subset.
+"""
+import sklearn.datasets
+import sklearn.metrics
+import sklearn.model_selection
+
+import autosklearn.classification
+
+
+############################################################################
+# Show available classification models
+# ====================================
+#
+# We will first list all classifiers Auto-sklearn chooses from. A similar
+# call is available for preprocessors (see below) and regression (not shown)
+# as well.
+
+import autosklearn.pipeline.components.classification
+for name in autosklearn.pipeline.components.classification.ClassifierChoice.get_components():
+ print(name)
+
+############################################################################
+# Show available preprocessors
+# ============================
+
+import autosklearn.pipeline.components.feature_preprocessing
+for name in autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice.get_components():
+ print(name)
+
+############################################################################
+# Data Loading
+# ============
+
+X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
+X_train, X_test, y_train, y_test = \
+ sklearn.model_selection.train_test_split(X, y, random_state=1)
+
+############################################################################
+# Build and fit a classifier
+# ==========================
+#
+# We will now only use a subset of the given classifiers and preprocessors.
+# Furthermore, we will restrict the ensemble size to ``1`` to only use the
+# single best model in the end. However, we would like to note that the
+# choice of which models are deemed interpretable is very much up to the user
+# and can change from use case to use case.
+
+automl = autosklearn.classification.AutoSklearnClassifier(
+ time_left_for_this_task=120,
+ per_run_time_limit=30,
+ tmp_folder='/tmp/autosklearn_interpretable_models_example_tmp',
+ include_estimators=['decision_tree', 'lda', 'sgd'],
+ include_preprocessors=['no_preprocessing', 'polynomial', 'select_percentile_classification'],
+ ensemble_size=1,
+)
+automl.fit(X_train, y_train, dataset_name='breast_cancer')
+
+############################################################################
+# Print the final ensemble constructed by auto-sklearn
+# ====================================================
+
+print(automl.show_models())
+
+###########################################################################
+# Get the Score of the final ensemble
+# ===================================
+
+predictions = automl.predict(X_test)
+print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))