Autosklearn doesn't support sparse y

As noted in openml/automlbenchmark#370, auto-sklearn fails when the regression targets `y` are passed as a sparse matrix. The problem is not limited to regression: the same failure also occurs for classification.

To reproduce:
```python
import traceback

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from autosklearn.classification import AutoSklearnClassifier
from autosklearn.regression import AutoSklearnRegressor


def fit_and_report(model, X, y, task):
    """Fit ``model`` on ``(X, y)`` and report any failure without aborting.

    On failure, print the task name, the types of the objects that were
    actually handed to ``fit`` (i.e. the sparse containers, not the dense
    arrays they were built from), and the traceback, so that the remaining
    cases in this script still run.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)``
    X : features passed to ``fit`` unchanged
    y : targets passed to ``fit`` unchanged
    task : str
        Human readable name of the scenario, printed as the report header.
    """
    try:
        model.fit(X, y)
    except Exception:
        # Deliberately broad: this is a reproduction script and every kind of
        # failure is of interest.
        print(f'\n{task}\n{type(X)=}\n{type(y)=}\n')
        traceback.print_exc()


# Create dense arrays in which roughly 90% of the entries are zero
X = np.random.random(size=(1000, 5))
y = np.random.random(size=(1000, 1))
X[X < .9] = 0
y[y < .9] = 0

# Convert to scipy sparse matrices
X_sparse = csr_matrix(X)
y_sparse = csr_matrix(y)

# Regression with a scipy sparse target
fit_and_report(
    AutoSklearnRegressor(time_left_for_this_task=30),
    X_sparse, y_sparse, 'Regression',
)

# Binary classification: a single column holding the labels {0, 1}
y = np.random.random(size=(1000, 1))
y[y >= .9] = 1
y[y < .9] = 0
y_sparse = csr_matrix(y)

fit_and_report(
    AutoSklearnClassifier(time_left_for_this_task=30),
    X_sparse, y_sparse, 'Binary Classification',
)

# Regression with a pandas sparse target
y = np.random.random(size=(1000))
y[y < 0.9] = 0
y_sparse = pd.Series(y, dtype="Sparse[float]")

fit_and_report(
    AutoSklearnRegressor(time_left_for_this_task=30),
    X_sparse, y_sparse, 'Regression (pandas sparse Series)',
)

# Multiclass classification: a single column holding the labels {0, 1, 2}
y = np.random.random(size=(1000, 1))
y[y >= .9] = 1
y[y < .9] = 0
y[0:10] = 2
y_sparse = csr_matrix(y)

fit_and_report(
    AutoSklearnClassifier(time_left_for_this_task=30),
    X_sparse, y_sparse, 'Multiclass Classification',
)

# Multilabel classification: a binary indicator matrix with two label columns
y = np.random.random(size=(1000, 2))
y[y >= .9] = 1
y[y < .9] = 0
y_sparse = csr_matrix(y)

fit_and_report(
    AutoSklearnClassifier(time_left_for_this_task=30),
    X_sparse, y_sparse, 'Multilabel Classification',
)
```

```
Regression
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 21, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 1068, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

Binary Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 35, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 924, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

 Multilabel Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 50, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 924, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

 Multiclass Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

[ERROR] [2021-08-06 13:37:44,690:Client-AutoML(1):b73360ac-f6aa-11eb-af00-44850026e7d8] Dummy prediction failed with run state StatusType.CRASHED and additional output: {'traceback': 'TypeError: float() argument must be a string or a number, not \'csr_matrix\'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/__init__.py", line 40, in fit_predict_try_except_decorator\n    return ta(queue=queue, **kwargs)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 1166, in eval_holdout\n    evaluator.fit_predict_and_loss(iterative=iterative)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 503, in fit_predict_and_loss\n    self._partial_fit_and_predict_standard(\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 881, in _partial_fit_and_predict_standard\n    self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(\nValueError: setting an array element with a sequence.\n', 'error': "ValueError('setting an array element with a sequence.')", 'configuration_origin': 'DUMMY'}.
Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 64, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 941, in fit
    super().fit(
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 340, in fit
    self.automl_.fit(load_models=self.load_models, **kwargs)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 1655, in fit
    return super().fit(
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 642, in fit
    self.num_run += self._do_dummy_prediction(datamanager, num_run=1)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 422, in _do_dummy_prediction
    raise ValueError(
ValueError: Dummy prediction failed with run state StatusType.CRASHED and additional output: {'traceback': 'TypeError: float() argument must be a string or a number, not \'csr_matrix\'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/__init__.py", line 40, in fit_predict_try_except_decorator\n    return ta(queue=queue, **kwargs)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 1166, in eval_holdout\n    evaluator.fit_predict_and_loss(iterative=iterative)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 503, in fit_predict_and_loss\n    self._partial_fit_and_predict_standard(\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 881, in _partial_fit_and_predict_standard\n    self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(\nValueError: setting an array element with a sequence.\n', 'error': "ValueError('setting an array element with a sequence.')", 'configuration_origin': 'DUMMY'}.
^CProcess ForkProcess-3:
Traceback (most recent call last):
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/logging_.py", line 318, in start_log_server
    receiver.serve_until_stopped()
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/logging_.py", line 348, in serve_until_stopped
    rd, wr, ex = select.select([self.socket.fileno()],
KeyboardInterrupt
^CException ignored in: <function AutoML.__del__ at 0x7fbfad362ca0>
Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 1622, in __del__
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/backend.py", line 112, in delete_directories
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/backend.py", line 87, in temporary_directory
AttributeError: 'NoneType' object has no attribute 'path'
(.venv) ➜  auto-sklearn git:(sparse_y_fix) ✗ 
(.venv) ➜  auto-sklearn git:(sparse_y_fix) ✗ python test_case2.py

Regression
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 21, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 1068, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

Binary Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 35, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 924, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

 Multilabel Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 62, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 924, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

 Multiclass Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

[ERROR] [2021-08-06 13:39:49,383:Client-AutoML(1):01812118-f6ab-11eb-b07c-44850026e7d8] Dummy prediction failed with run state StatusType.CRASHED and additional output: {'traceback': 'TypeError: float() argument must be a string or a number, not \'csr_matrix\'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/__init__.py", line 40, in fit_predict_try_except_decorator\n    return ta(queue=queue, **kwargs)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 1166, in eval_holdout\n    evaluator.fit_predict_and_loss(iterative=iterative)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 503, in fit_predict_and_loss\n    self._partial_fit_and_predict_standard(\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 881, in _partial_fit_and_predict_standard\n    self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(\nValueError: setting an array element with a sequence.\n', 'error': "ValueError('setting an array element with a sequence.')", 'configuration_origin': 'DUMMY'}.
Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 76, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 941, in fit
    super().fit(
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 340, in fit
    self.automl_.fit(load_models=self.load_models, **kwargs)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 1655, in fit
    return super().fit(
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 642, in fit
    self.num_run += self._do_dummy_prediction(datamanager, num_run=1)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 422, in _do_dummy_prediction
    raise ValueError(
ValueError: Dummy prediction failed with run state StatusType.CRASHED and additional output: {'traceback': 'TypeError: float() argument must be a string or a number, not \'csr_matrix\'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/__init__.py", line 40, in fit_predict_try_except_decorator\n    return ta(queue=queue, **kwargs)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 1166, in eval_holdout\n    evaluator.fit_predict_and_loss(iterative=iterative)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 503, in fit_predict_and_loss\n    self._partial_fit_and_predict_standard(\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 881, in _partial_fit_and_predict_standard\n    self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(\nValueError: setting an array element with a sequence.\n', 'error': "ValueError('setting an array element with a sequence.')", 'configuration_origin': 'DUMMY'}.
^CError in atexit._run_exitfuncs:
Traceback (most recent call last):
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/popen_fork.py", line 27, in poll
Process ForkProcess-59:
    pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt
Traceback (most recent call last):
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/logging_.py", line 318, in start_log_server
    receiver.serve_until_stopped()
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/logging_.py", line 348, in serve_until_stopped
    rd, wr, ex = select.select([self.socket.fileno()],
KeyboardInterrupt
Exception ignored in: <function AutoML.__del__ at 0x7f4368201ca0>
Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 1615, in __del__
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 329, in _clean_logger
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/process.py", line 165, in is_alive
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/popen_fork.py", line 27, in poll
AttributeError: 'NoneType' object has no attribute 'waitpid'
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Autosklearn doesn't support sparse y #1210

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Autosklearn doesn't support sparse y #1210

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions