Skip to content

Commit a8effd8

Browse files
authored
Add sparse support for Auto-sklearn 2.0 (#1245)
1 parent ff11e5a commit a8effd8

File tree

3 files changed

+63
-7
lines changed

3 files changed

+63
-7
lines changed

autosklearn/automl.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1187,7 +1187,7 @@ def refit(self, X, y):
11871187
budget_type=self._budget_type,
11881188
logger=self._logger,
11891189
model=model,
1190-
train_indices=np.arange(len(X), dtype=int),
1190+
train_indices=np.arange(X.shape[0], dtype=int),
11911191
task_type=self._task,
11921192
)
11931193
break

autosklearn/experimental/askl2.py

Lines changed: 37 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
from typing import Any, Dict, List, Optional, Union
77

88
import dask.distributed
9+
import scipy.sparse
910

1011
from ConfigSpace import Configuration
1112
import numpy as np
@@ -92,9 +93,14 @@ def __call__(
9293

9394
scenario = Scenario(scenario_dict)
9495

95-
initial_configurations = [
96-
Configuration(configuration_space=scenario.cs, values=member)
97-
for member in self.portfolio.values()]
96+
initial_configurations = []
97+
for member in self.portfolio.values():
98+
try:
99+
initial_configurations.append(
100+
Configuration(configuration_space=scenario.cs, values=member)
101+
)
102+
except ValueError:
103+
pass
98104

99105
rh2EPM = RunHistory2EPM4LogCost
100106
return SMAC4AC(
@@ -134,9 +140,15 @@ def __call__(
134140
from smac.scenario.scenario import Scenario
135141

136142
scenario = Scenario(scenario_dict)
137-
initial_configurations = [
138-
Configuration(configuration_space=scenario.cs, values=member)
139-
for member in self.portfolio.values()]
143+
144+
initial_configurations = []
145+
for member in self.portfolio.values():
146+
try:
147+
initial_configurations.append(
148+
Configuration(configuration_space=scenario.cs, values=member)
149+
)
150+
except ValueError:
151+
pass
140152

141153
rh2EPM = RunHistory2EPM4LogCost
142154
ta_kwargs['budget_type'] = self.budget_type
@@ -341,6 +353,25 @@ def fit(self, X, y,
341353
feat_type=None,
342354
dataset_name=None):
343355

356+
# TODO
357+
# regularly check https://github.com/scikit-learn/scikit-learn/issues/15336 whether
358+
# histogram gradient boosting in scikit-learn finally supports sparse data
359+
is_sparse = scipy.sparse.issparse(X)
360+
if is_sparse:
361+
include_estimators = [
362+
'extra_trees', 'passive_aggressive', 'random_forest', 'sgd', 'mlp',
363+
]
364+
else:
365+
include_estimators = [
366+
'extra_trees',
367+
'passive_aggressive',
368+
'random_forest',
369+
'sgd',
370+
'gradient_boosting',
371+
'mlp',
372+
]
373+
self.include['classifier'] = include_estimators
374+
344375
if self.metric is None:
345376
if len(y.shape) == 1 or y.shape[1] == 1:
346377
self.metric = accuracy

test/test_automl/test_estimators.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -816,6 +816,31 @@ def test_autosklearn2_classification_methods_returns_self(dask_client):
816816
pickle.dumps(automl_fitted)
817817

818818

819+
def test_autosklearn2_classification_methods_returns_self_sparse(dask_client):
820+
X_train, y_train, X_test, y_test = putil.get_dataset('breast_cancer', make_sparse=True)
821+
automl = AutoSklearn2Classifier(time_left_for_this_task=60, ensemble_size=0,
822+
delete_tmp_folder_after_terminate=False,
823+
dask_client=dask_client)
824+
825+
automl_fitted = automl.fit(X_train, y_train)
826+
assert automl is automl_fitted
827+
828+
automl_ensemble_fitted = automl.fit_ensemble(y_train, ensemble_size=5)
829+
assert automl is automl_ensemble_fitted
830+
831+
automl_refitted = automl.refit(X_train.copy(), y_train.copy())
832+
assert automl is automl_refitted
833+
834+
predictions = automl_fitted.predict(X_test)
835+
assert sklearn.metrics.accuracy_score(
836+
y_test, predictions
837+
) >= 2 / 3, print_debug_information(automl)
838+
839+
assert "boosting" not in str(automl.get_configuration_space(X=X_train, y=y_train))
840+
841+
pickle.dumps(automl_fitted)
842+
843+
819844
@pytest.mark.parametrize("class_", [AutoSklearnClassifier, AutoSklearnRegressor,
820845
AutoSklearn2Classifier])
821846
def test_check_estimator_signature(class_):

0 commit comments

Comments
 (0)