# -*- encoding: utf-8 -*-
"""
==================
Text preprocessing
==================

The following example shows how to fit a simple NLP problem with
*auto-sklearn*.

For an introduction to text preprocessing you can follow these links:
    1. https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
    2. https://machinelearningmastery.com/clean-text-machine-learning-python/
"""
from pprint import pprint

import pandas as pd
import sklearn.metrics
from sklearn.datasets import fetch_20newsgroups

import autosklearn.classification

############################################################################
# Data Loading
# ============
cats = ["comp.sys.ibm.pc.hardware", "rec.sport.baseball"]
X_train, y_train = fetch_20newsgroups(
    subset="train",  # select the train set
    shuffle=True,  # shuffle the data set for unbiased validation results
    random_state=42,  # set a random seed for reproducibility
    categories=cats,  # select only 2 out of 20 labels
    return_X_y=True,  # the 20NG dataset consists of two columns, X: the text data, y: the label
)  # load these two columns separately as numpy arrays

X_test, y_test = fetch_20newsgroups(
    subset="test",  # select the test set for unbiased evaluation
    categories=cats,  # select only 2 out of 20 labels
    return_X_y=True,  # the 20NG dataset consists of two columns, X: the text data, y: the label
)  # load these two columns separately as numpy arrays
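
# Optionally, take a quick look at what was loaded: the number of documents in
# each split and the beginning of the first training posting (illustrative only,
# not needed for fitting).
print(f"{len(X_train)} training / {len(X_test)} test documents loaded")
print(X_train[0][:300])  # first 300 characters of the first newsgroup posting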

############################################################################
# Creating a pandas dataframe
# ===========================
# Both categorical and text features are often strings. Pandas stores Python
# strings in the generic `object` dtype. Please ensure that the correct
# `dtype <https://pandas.pydata.org/docs/user_guide/basics.html#dtypes>`_ is
# applied to the correct column.

# create a pandas dataframe for training, labeling the "Text" column as string
X_train = pd.DataFrame({"Text": pd.Series(X_train, dtype="string")})

# create a pandas dataframe for testing, labeling the "Text" column as string
X_test = pd.DataFrame({"Text": pd.Series(X_test, dtype="string")})
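
# Optional sanity check: confirm that the "Text" column now uses the pandas
# ``string`` dtype rather than the generic ``object`` dtype.
print(X_train.dtypes)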

############################################################################
# Build and fit a classifier
# ==========================

# create an autosklearn Classifier or Regressor depending on your task at hand.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60,
    per_run_time_limit=30,
    tmp_folder="/tmp/autosklearn_text_example_tmp",
)

automl.fit(X_train, y_train, dataset_name="20_Newsgroups")  # fit the automl model
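
# Optionally, print a short summary of the search, e.g. the number of target
# algorithm runs and the best validation score found.
print(automl.sprint_statistics())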

############################################################################
# View the models found by auto-sklearn
# =====================================

print(automl.leaderboard())

############################################################################
# Print the final ensemble constructed by auto-sklearn
# ====================================================

pprint(automl.show_models(), indent=4)

############################################################################
# Get the Score of the final ensemble
# ===================================

predictions = automl.predict(X_test)
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))
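
# Beyond plain accuracy, an optional per-class report (precision, recall and
# F1 score for each of the two newsgroups) can be printed as well.
print(sklearn.metrics.classification_report(y_test, predictions))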