12 changes: 6 additions & 6 deletions .travis.yml
@@ -21,13 +21,13 @@ matrix:

include:
- os: linux
env: DISTRIB="conda" EXAMPLES="true" PYTHON="3.7" SKIP_TESTS="true"
env: DISTRIB="conda" DOCPUSH="true" PYTHON="3.7" SKIP_TESTS="true"
- os: linux
env: DISTRIB="conda" RUN_FLAKE8="true" SKIP_TESTS="true"
- os: linux
env: DISTRIB="conda" PYTHON="3.5"
- os: linux
env: DISTRIB="conda" COVERAGE="true" DOCPUSH="true" PYTHON="3.6"
env: DISTRIB="conda" COVERAGE="true" PYTHON="3.6"
- os: linux
env: DISTRIB="conda" TEST_DIST="true" PYTHON="3.7"
- os: linux
@@ -76,10 +76,10 @@ after_success: source ci_scripts/success.sh && source ci_scripts/create_doc.sh $

deploy:
provider: pages
skip-cleanup: true
github-token: $GITHUB_TOKEN # set in the settings page of my repository
keep-hisotry: true
commiter-from-gh: true
skip_cleanup: true
github_token: $GITHUB_TOKEN # set in the settings page of my repository
keep-history: true
committer-from-gh: true
on:
all_branches: true
condition: $doc_result = "success"
1 change: 1 addition & 0 deletions autosklearn/estimators.py
@@ -98,6 +98,7 @@ def __init__(
`auto-sklearn` will stop fitting the machine learning algorithm if
it tries to allocate more than `ml_memory_limit` MB.
If None is provided, no memory limit is set.
In case of multi-processing, `ml_memory_limit` will be per job.

include_estimators : list, optional (None)
If None, all possible estimators are used. Otherwise specifies
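
To make the amended ``ml_memory_limit`` docstring above concrete, here is a minimal sketch of a constructor call; it assumes that this version of ``AutoSklearnClassifier`` accepts ``ml_memory_limit`` together with an ``n_jobs`` argument for multi-processing (the parameter values are illustrative only, not taken from this diff):

import autosklearn.classification

# With n_jobs=4, each of the four parallel jobs may allocate up to
# ml_memory_limit MB, so peak memory usage can approach 4 * 3072 MB.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    ml_memory_limit=3072,  # per-job memory limit in MB, as described in the docstring
    n_jobs=4,              # assumed multi-processing option
)
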
2 changes: 1 addition & 1 deletion ci_scripts/create_doc.sh
@@ -8,7 +8,7 @@ if ! [[ -z ${DOCPUSH+x} ]]; then
if [[ "$DOCPUSH" == "true" ]]; then

# install documentation building dependencies
pip install --upgrade matplotlib seaborn setuptools pytest coverage sphinx pillow sphinx-gallery==0.5 sphinx_bootstrap_theme cython numpydoc nbformat nbconvert mock
pip install --upgrade matplotlib seaborn setuptools pytest coverage sphinx pillow sphinx-gallery sphinx_bootstrap_theme cython numpydoc nbformat nbconvert mock

# $1 is the branch name
# $2 is the global variable where we set the script status
2 changes: 0 additions & 2 deletions doc/api.rst
@@ -83,8 +83,6 @@ For more information about how these metrics are used, please read

.. autoclass:: autosklearn.metrics.log_loss

.. autoclass:: autosklearn.metrics.pac_score

Regression metrics
~~~~~~~~~~~~~~~~~~

3 changes: 2 additions & 1 deletion doc/conf.py
@@ -69,7 +69,8 @@
#'reference_url': {
# 'autosklearn': None
#},
'backreferences_dir': False
'backreferences_dir': None,
'filename_pattern': 'example.*.py$',
}

# Add any paths that contain templates here, relative to this directory.
2 changes: 1 addition & 1 deletion doc/installation.rst
@@ -14,7 +14,7 @@ auto-sklearn has the following system requirements:
* Linux operating system (for example Ubuntu) `(get Linux here) <https://www.wikihow.com/Install-Linux>`_,
* Python (>=3.5) `(get Python here) <https://www.python.org/downloads/>`_.
* C++ compiler (with C++11 support) `(get GCC here) <https://www.tutorialspoint.com/How-to-Install-Cplusplus-Compiler-on-Linux>`_ and
* SWIG (version 3.0 or later) `(get SWIG here) <http://www.swig.org/survey.html>`_.
* SWIG (version 3.0.* is required; >=4.0.0 is not supported) `(get SWIG here) <http://www.swig.org/survey.html>`_.

For an explanation of missing Microsoft Windows and MAC OSX support please
check the Section `Windows/OSX compatibility`_.
17 changes: 16 additions & 1 deletion doc/manual.rst
@@ -25,9 +25,11 @@ aspects of its usage:
* `Using custom metrics <examples/example_metrics.html>`_
* `Random search <examples/example_random_search.html>`_
* `EIPS <examples/example_eips.html>`_
* `Successive Halving <examples/example_successive_halving.html>`_
* `Extending with a new classifier <examples/example_extending_classification.html>`_
* `Extending with a new regressor <examples/example_extending_regression.html>`_
* `Extending with a new preprocessor <examples/example_extending_preprocessor.html>`_
* `Iterating over the models <examples/example_get_pipeline_components.html>`_


Time and memory limits
@@ -90,6 +92,16 @@ Resampling strategies

Examples for using holdout and cross-validation can be found in `auto-sklearn/examples/ <examples/>`_

Ensemble Building Process
=========================

*auto-sklearn* uses ensemble selection by `Caruana et al. (2004) <https://dl.acm.org/doi/pdf/10.1145/1015330.1015432>`_
to build an ensemble based on the models’ predictions on the validation set. The following hyperparameters control how the ensemble is constructed (a short configuration sketch follows the list):

* ``ensemble_size`` determines the maximal size of the ensemble. If it is set to zero, no ensemble will be constructed.
* ``ensemble_nbest`` specifies how many models are considered for the ensemble. If it is an integer *n*, only the best *n* models are used in the final ensemble. If it is a float between 0.0 and 1.0, it is interpreted as the fraction of models to consider during ensemble building (in this case, library pruning is applied as described in `Caruana et al. (2006) <https://dl.acm.org/doi/10.1109/ICDM.2006.76>`_).
* ``max_models_on_disc`` defines the maximum number of models kept on disc, as a mechanism to control the amount of disc space consumed by *auto-sklearn*. Throughout the AutoML process, individual models are optimized and their predictions (and other metadata) are stored on disc. Because this setting also caps what the ensemble builder can use, the minimum of ``ensemble_size``, ``ensemble_nbest`` and ``max_models_on_disc`` determines the maximal number of models used in the ensemble. If set to None, this feature is disabled.

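A minimal configuration sketch, assuming ``ensemble_size``, ``ensemble_nbest`` and ``max_models_on_disc`` are passed directly as constructor arguments of ``AutoSklearnClassifier`` (values are illustrative only):

.. code:: python

    import autosklearn.classification

    # Build an ensemble of at most 50 members, consider only the best 20% of
    # models for ensemble building, and keep at most 50 models on disc.
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=120,
        ensemble_size=50,
        ensemble_nbest=0.2,      # float: fraction of models (library pruning)
        max_models_on_disc=50,
    )
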
Inspecting the results
======================

@@ -107,7 +119,10 @@ statistics can be printed for the inspection.
``sprint_statistics()`` is a method that prints the name of the dataset, the metric used, and the best validation score
obtained by running *auto-sklearn*. It additionally prints the number of both successful and unsuccessful
algorithm runs.
The results obtained from the final ensemble can be printed by calling ``show_models()``.

The results obtained from the final ensemble can be printed by calling ``show_models()``. The *auto-sklearn*
ensemble is composed of scikit-learn models that can be inspected as shown in the
`model inspection example <examples/example_get_pipeline_components.html>`_.
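
As a brief illustration of these calls (a sketch, assuming ``automl`` is an already fitted ``AutoSklearnClassifier``):

.. code:: python

    # Inspect a fitted auto-sklearn estimator.
    print(automl.sprint_statistics())  # dataset name, metric, validation score, run counts
    print(automl.show_models())        # the members of the final ensemble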

Parallel computation
====================
85 changes: 47 additions & 38 deletions examples/example_crossvalidation.py
@@ -20,41 +20,50 @@
import autosklearn.classification


def main():
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = \
sklearn.model_selection.train_test_split(X, y, random_state=1)

automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
tmp_folder='/tmp/autosklearn_cv_example_tmp',
output_folder='/tmp/autosklearn_cv_example_out',
resampling_strategy='cv',
resampling_strategy_arguments={'folds': 5},
)

# fit() changes the data in place, but refit needs the original data. We
# therefore copy the data. In practice, one should reload the data
automl.fit(X_train.copy(), y_train.copy(), dataset_name='breast_cancer')

print(automl.sprint_statistics())

# One can use models trained during cross-validation directly to predict
# for unseen data. For this, all k models trained during k-fold
# cross-validation are considered as a single soft-voting ensemble inside
# the ensemble constructed with ensemble selection.
print('Before re-fit')
predictions = automl.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))

# During fit(), models are fit on individual cross-validation folds. To use
# all available data, we call refit() which trains all models in the
# final ensemble on the whole dataset.
automl.refit(X_train.copy(), y_train.copy())
predictions = automl.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))


if __name__ == '__main__':
main()
############################################################################
# Data Loading
# ============

X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = \
sklearn.model_selection.train_test_split(X, y, random_state=1)

############################################################################
# Building and fitting the classifier
# ====================================

automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
tmp_folder='/tmp/autosklearn_cv_example_tmp',
output_folder='/tmp/autosklearn_cv_example_out',
resampling_strategy='cv',
resampling_strategy_arguments={'folds': 5},
)

# fit() changes the data in place, but refit needs the original data. We
# therefore copy the data. In practice, one should reload the data
automl.fit(X_train.copy(), y_train.copy(), dataset_name='breast_cancer')

############################################################################
# Print Results before refit
# ==========================
print(automl.sprint_statistics())

# One can use models trained during cross-validation directly to predict
# for unseen data. For this, all k models trained during k-fold
# cross-validation are considered as a single soft-voting ensemble inside
# the ensemble constructed with ensemble selection.
print('Before re-fit')
predictions = automl.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))

############################################################################
# Perform a refit
# ===============
# During fit(), models are fit on individual cross-validation folds. To use
# all available data, we call refit() which trains all models in the
# final ensemble on the whole dataset.
automl.refit(X_train.copy(), y_train.copy())
predictions = automl.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
65 changes: 43 additions & 22 deletions examples/example_eips.py
@@ -22,6 +22,10 @@
import autosklearn.classification


############################################################################
# EIPS callback
# =============
# Create a callback to change the acquisition function inside SMAC
def get_eips_object_callback(
scenario_dict,
seed,
@@ -55,29 +59,46 @@ def get_eips_object_callback(
)


def main():
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = \
sklearn.model_selection.train_test_split(X, y, random_state=1)
############################################################################
# Data Loading
# ============

automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
tmp_folder='/tmp/autosklearn_eips_example_tmp',
output_folder='/tmp/autosklearn_eips_example_out',
get_smac_object_callback=get_eips_object_callback,
initial_configurations_via_metalearning=0,
)
automl.fit(X_train, y_train, dataset_name='breast_cancer')
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = \
sklearn.model_selection.train_test_split(X, y, random_state=1)

############################################################################
# Building and fitting the classifier
# ===================================

automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
tmp_folder='/tmp/autosklearn_eips_example_tmp',
output_folder='/tmp/autosklearn_eips_example_out',
get_smac_object_callback=get_eips_object_callback,
initial_configurations_via_metalearning=0,
)
automl.fit(X_train, y_train, dataset_name='breast_cancer')

############################################################################
# Print the final ensemble constructed by auto-sklearn
# ====================================================

# Print the final ensemble constructed by auto-sklearn via ROAR.
print(automl.show_models())

############################################################################
# Print statistics about the auto-sklearn run
# ===========================================

# Print the final ensemble constructed by auto-sklearn via ROAR.
print(automl.show_models())
predictions = automl.predict(X_test)
# Print statistics about the auto-sklearn run such as number of
# iterations, number of models failed with a time out.
print(automl.sprint_statistics())
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
# Print statistics about the auto-sklearn run such as number of
# iterations, number of models failed with a time out.
print(automl.sprint_statistics())

############################################################################
# Get the Score of the final ensemble
# ===================================

if __name__ == '__main__':
main()
predictions = automl.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
68 changes: 40 additions & 28 deletions examples/example_extending_classification.py
@@ -1,7 +1,7 @@
"""
====================================================================
====================================================
Extending Auto-Sklearn with Classification Component
====================================================================
====================================================

The following example demonstrates how to create a new classification
component for use in auto-sklearn.
@@ -19,8 +19,14 @@
from autosklearn.pipeline.constants import DENSE, SIGNED_DATA, UNSIGNED_DATA, \
PREDICTIONS

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


############################################################################
# Create MLP classifier component for auto-sklearn
# ================================================

# Create MLP classifier component for auto-sklearn.
class MLPClassifier(AutoSklearnClassificationAlgorithm):
def __init__(self,
hidden_layer_depth,
@@ -106,28 +112,34 @@ def get_hyperparameter_search_space(dataset_properties=None):
return cs


if __name__ == '__main__':
# Add MLP classifier component to auto-sklearn.
autosklearn.pipeline.components.classification.add_classifier(MLPClassifier)
cs = MLPClassifier.get_hyperparameter_search_space()
print(cs)

# Generate data.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit MLP classifier to the data.
clf = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=30,
per_run_time_limit=10,
include_estimators=['MLPClassifier'],
)
clf.fit(X_train, y_train)

# Print test accuracy and statistics.
y_pred = clf.predict(X_test)
print("accuracy: ", sklearn.metrics.accuracy_score(y_pred, y_test))
print(clf.sprint_statistics())
print(clf.show_models())
# Add MLP classifier component to auto-sklearn.
autosklearn.pipeline.components.classification.add_classifier(MLPClassifier)
cs = MLPClassifier.get_hyperparameter_search_space()
print(cs)

############################################################################
# Data Loading
# ============

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

############################################################################
# Fit MLP classifier to the data
# ==============================

clf = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=30,
per_run_time_limit=10,
include_estimators=['MLPClassifier'],
)
clf.fit(X_train, y_train)

############################################################################
# Print test accuracy and statistics
# ==================================

y_pred = clf.predict(X_test)
print("accuracy: ", sklearn.metrics.accuracy_score(y_pred, y_test))
print(clf.sprint_statistics())
print(clf.show_models())