Skip to content

Autosklearn doesn't support sparse y #1210

@eddiebergman

Description

@eddiebergman

As noted in openml/automlbenchmark#370, autosklearn fails to process sparse labels (`y`) for regression. The problem is not limited to regression: it also occurs in classification.

To reproduce:

import traceback
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from autosklearn.regression import AutoSklearnRegressor
from autosklearn.classification import AutoSklearnClassifier

# Dense inputs: every value below 0.9 is zeroed, so X and y are mostly zeros
X = np.random.random(size=(1000, 5))
y = np.random.random(size=(1000, 1))  # single-column 2-D target
X[X < .9] = 0
y[y < .9] = 0


# Case 1 - regression: scipy CSR features and a scipy CSR target
X_sparse = csr_matrix(X)  # reused unchanged by every case below
y_sparse = csr_matrix(y)

model = AutoSklearnRegressor(time_left_for_this_task=30)
try:
    model.fit(X_sparse, y_sparse)
except Exception:  # NOTE: the print reports type(X)/type(y) (the dense arrays), not X_sparse/y_sparse
    print(f'\nRegression\n{type(X)=}\n{type(y)=}\n')
    traceback.print_exc()

# Case 2 - binary classification: one column holding only 0.0 / 1.0
y = np.random.random(size=(1000, 1))
y[y >= .9] = 1  # must run before the line below, which zeroes everything still under 0.9
y[y < .9] = 0

y_sparse = csr_matrix(y)

model = AutoSklearnClassifier(time_left_for_this_task=30)
try:
    model.fit(X_sparse, y_sparse)
except Exception as e:  # `e` is unused; traceback.print_exc() reads the active exception itself
    print(f'\nBinary Classification\n{type(X)=}\n{type(y)=}\n')
    traceback.print_exc()

# Case 3 - regression again, but with a 1-D pandas Series of sparse dtype as the target
y = np.random.random(size=(1000))
y[y < 0.9] = 0
y_sparse = pd.Series(y, dtype="Sparse[float]")  # pandas sparse Series, not a scipy matrix

model = AutoSklearnRegressor(time_left_for_this_task=30)
try:
    model.fit(X_sparse, y_sparse)
except Exception as e:  # prints the same "Regression" header as case 1
    print(f'\nRegression\n{type(X)=}\n{type(y)=}\n')
    traceback.print_exc()

# Case 4 - printed as "Multilabel", but y is one column with values 0/1/2 (presumably "multiclass" in sklearn terms - TODO confirm)
y = np.random.random(size=(1000, 1))
y[y >= .9] = 1
y[y < .9] = 0
y[0:10] = 2  # overwrite the first ten rows to introduce a third label value

y_sparse = csr_matrix(y)

model = AutoSklearnClassifier(time_left_for_this_task=30)
try:
    model.fit(X_sparse, y_sparse)
except Exception as e:
    print(f'\n Multilabel Classification\n{type(X)=}\n{type(y)=}\n')
    traceback.print_exc()

# Case 5 - printed as "Multiclass", but y is two 0/1 columns (presumably "multilabel-indicator" in sklearn terms - TODO confirm)
y = np.random.random(size=(1000, 2))
y[y >= .9] = 1
y[y < .9] = 0

y_sparse = csr_matrix(y)

model = AutoSklearnClassifier(time_left_for_this_task=30)
try:
    model.fit(X_sparse, y_sparse)
except Exception as e:
    print(f'\n Multiclass Classification\n{type(X)=}\n{type(y)=}\n')
    traceback.print_exc()
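Output of two consecutive runs (the first run apparently predates the pandas test case, which is why its traceback line numbers are lower):

Regression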
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 21, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 1068, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

Binary Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 35, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 924, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

 Multilabel Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 50, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 924, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

 Multiclass Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

[ERROR] [2021-08-06 13:37:44,690:Client-AutoML(1):b73360ac-f6aa-11eb-af00-44850026e7d8] Dummy prediction failed with run state StatusType.CRASHED and additional output: {'traceback': 'TypeError: float() argument must be a string or a number, not \'csr_matrix\'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/__init__.py", line 40, in fit_predict_try_except_decorator\n    return ta(queue=queue, **kwargs)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 1166, in eval_holdout\n    evaluator.fit_predict_and_loss(iterative=iterative)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 503, in fit_predict_and_loss\n    self._partial_fit_and_predict_standard(\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 881, in _partial_fit_and_predict_standard\n    self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(\nValueError: setting an array element with a sequence.\n', 'error': "ValueError('setting an array element with a sequence.')", 'configuration_origin': 'DUMMY'}.
Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 64, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 941, in fit
    super().fit(
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 340, in fit
    self.automl_.fit(load_models=self.load_models, **kwargs)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 1655, in fit
    return super().fit(
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 642, in fit
    self.num_run += self._do_dummy_prediction(datamanager, num_run=1)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 422, in _do_dummy_prediction
    raise ValueError(
ValueError: Dummy prediction failed with run state StatusType.CRASHED and additional output: {'traceback': 'TypeError: float() argument must be a string or a number, not \'csr_matrix\'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/__init__.py", line 40, in fit_predict_try_except_decorator\n    return ta(queue=queue, **kwargs)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 1166, in eval_holdout\n    evaluator.fit_predict_and_loss(iterative=iterative)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 503, in fit_predict_and_loss\n    self._partial_fit_and_predict_standard(\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 881, in _partial_fit_and_predict_standard\n    self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(\nValueError: setting an array element with a sequence.\n', 'error': "ValueError('setting an array element with a sequence.')", 'configuration_origin': 'DUMMY'}.
^CProcess ForkProcess-3:
Traceback (most recent call last):
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/logging_.py", line 318, in start_log_server
    receiver.serve_until_stopped()
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/logging_.py", line 348, in serve_until_stopped
    rd, wr, ex = select.select([self.socket.fileno()],
KeyboardInterrupt
^CException ignored in: <function AutoML.__del__ at 0x7fbfad362ca0>
Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 1622, in __del__
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/backend.py", line 112, in delete_directories
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/backend.py", line 87, in temporary_directory
AttributeError: 'NoneType' object has no attribute 'path'
(.venv) ➜  auto-sklearn git:(sparse_y_fix) ✗ 
(.venv) ➜  auto-sklearn git:(sparse_y_fix) ✗ python test_case2.py

Regression
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 21, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 1068, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

Binary Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 35, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 924, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

 Multilabel Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 62, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 924, in fit
    target_type = type_of_target(y)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/.venv/lib/python3.9/site-packages/sklearn/utils/multiclass.py", line 288, in type_of_target
    if y.ndim > 2 or (y.dtype == object and len(y) and
TypeError: len() of unsized object

 Multiclass Classification
type(X)=<class 'numpy.ndarray'>
type(y)=<class 'numpy.ndarray'>

[ERROR] [2021-08-06 13:39:49,383:Client-AutoML(1):01812118-f6ab-11eb-b07c-44850026e7d8] Dummy prediction failed with run state StatusType.CRASHED and additional output: {'traceback': 'TypeError: float() argument must be a string or a number, not \'csr_matrix\'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/__init__.py", line 40, in fit_predict_try_except_decorator\n    return ta(queue=queue, **kwargs)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 1166, in eval_holdout\n    evaluator.fit_predict_and_loss(iterative=iterative)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 503, in fit_predict_and_loss\n    self._partial_fit_and_predict_standard(\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 881, in _partial_fit_and_predict_standard\n    self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(\nValueError: setting an array element with a sequence.\n', 'error': "ValueError('setting an array element with a sequence.')", 'configuration_origin': 'DUMMY'}.
Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/test_case2.py", line 76, in <module>
    model.fit(X_sparse, y_sparse)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 941, in fit
    super().fit(
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/estimators.py", line 340, in fit
    self.automl_.fit(load_models=self.load_models, **kwargs)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 1655, in fit
    return super().fit(
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 642, in fit
    self.num_run += self._do_dummy_prediction(datamanager, num_run=1)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 422, in _do_dummy_prediction
    raise ValueError(
ValueError: Dummy prediction failed with run state StatusType.CRASHED and additional output: {'traceback': 'TypeError: float() argument must be a string or a number, not \'csr_matrix\'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/__init__.py", line 40, in fit_predict_try_except_decorator\n    return ta(queue=queue, **kwargs)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 1166, in eval_holdout\n    evaluator.fit_predict_and_loss(iterative=iterative)\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 503, in fit_predict_and_loss\n    self._partial_fit_and_predict_standard(\n  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/evaluation/train_evaluator.py", line 881, in _partial_fit_and_predict_standard\n    self.Y_train_targets[train_indices] = self.Y_train.iloc[train_indices] if hasattr(\nValueError: setting an array element with a sequence.\n', 'error': "ValueError('setting an array element with a sequence.')", 'configuration_origin': 'DUMMY'}.
^CError in atexit._run_exitfuncs:
Traceback (most recent call last):
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/popen_fork.py", line 27, in poll
Process ForkProcess-59:
    pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt
Traceback (most recent call last):
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/logging_.py", line 318, in start_log_server
    receiver.serve_until_stopped()
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/util/logging_.py", line 348, in serve_until_stopped
    rd, wr, ex = select.select([self.socket.fileno()],
KeyboardInterrupt
Exception ignored in: <function AutoML.__del__ at 0x7f4368201ca0>
Traceback (most recent call last):
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 1615, in __del__
  File "/home/skantify/code/asklearn/automlbenchmark_issue_370/auto-sklearn/autosklearn/automl.py", line 329, in _clean_logger
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/process.py", line 165, in is_alive
  File "/home/skantify/.pyenv/versions/3.9.5/lib/python3.9/multiprocessing/popen_fork.py", line 27, in poll
AttributeError: 'NoneType' object has no attribute 'waitpid'

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions