# -*- encoding: utf-8 -*-
"""
==================
Text preprocessing
==================

The following example shows how to fit a simple NLP problem with
*auto-sklearn*.

For an introduction to text preprocessing you can follow these links:
    1. https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
    2. https://machinelearningmastery.com/clean-text-machine-learning-python/
"""
from pprint import pprint

import pandas as pd
import sklearn.metrics
from sklearn.datasets import fetch_20newsgroups

import autosklearn.classification

############################################################################
# Data Loading
# ============
cats = ["comp.sys.ibm.pc.hardware", "rec.sport.baseball"]
X_train, y_train = fetch_20newsgroups(
    subset="train",  # select the train set
    shuffle=True,  # shuffle the data set for unbiased validation results
    random_state=42,  # set a random seed for reproducibility
    categories=cats,  # select only 2 out of 20 labels
    return_X_y=True,  # the 20NG dataset consists of two columns, X: the text data, y: the label
)  # load these two columns separately as numpy arrays

X_test, y_test = fetch_20newsgroups(
    subset="test",  # select the test set for unbiased evaluation
    categories=cats,  # select only 2 out of 20 labels
    return_X_y=True,  # the 20NG dataset consists of two columns, X: the text data, y: the label
)  # load these two columns separately as numpy arrays
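
# Optionally, take a quick look at what was loaded: the number of documents in
# each split and the beginning of the first training posting (illustrative only,
# not needed for fitting).
print(f"{len(X_train)} training / {len(X_test)} test documents loaded")
print(X_train[0][:300])  # first 300 characters of the first newsgroup posting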

############################################################################
# Creating a pandas dataframe
# ===========================
# Both categorical and text features are often strings. Pandas stores Python
# strings in the generic `object` dtype. Please ensure that the correct
# `dtype <https://pandas.pydata.org/docs/user_guide/basics.html#dtypes>`_ is
# applied to the correct column.

# create a pandas dataframe for training, labeling the "Text" column as string
X_train = pd.DataFrame({"Text": pd.Series(X_train, dtype="string")})

# create a pandas dataframe for testing, labeling the "Text" column as string
X_test = pd.DataFrame({"Text": pd.Series(X_test, dtype="string")})
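
# Optional sanity check: confirm that the "Text" column now uses the pandas
# ``string`` dtype rather than the generic ``object`` dtype.
print(X_train.dtypes)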

############################################################################
# Build and fit a classifier
# ==========================

# create an autosklearn Classifier or Regressor depending on your task at hand.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60,
    per_run_time_limit=30,
    tmp_folder="/tmp/autosklearn_text_example_tmp",
)

automl.fit(X_train, y_train, dataset_name="20_Newsgroups")  # fit the automl model
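
# Optionally, print a short summary of the search, e.g. the number of target
# algorithm runs and the best validation score found.
print(automl.sprint_statistics())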

############################################################################
# View the models found by auto-sklearn
# =====================================

print(automl.leaderboard())

############################################################################
# Print the final ensemble constructed by auto-sklearn
# ====================================================

pprint(automl.show_models(), indent=4)

############################################################################
# Get the Score of the final ensemble
# ===================================

predictions = automl.predict(X_test)
print("Accuracy score:", sklearn.metrics.accuracy_score(y_test, predictions))
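
# Beyond plain accuracy, an optional per-class report (precision, recall and
# F1 score for each of the two newsgroups) can be printed as well.
print(sklearn.metrics.classification_report(y_test, predictions))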