-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Use sklearn's ColumnTransformer for data preprocessing #735
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7201fc0
00c740d
630e0c4
71f1f97
b410eca
fbff34e
1191bee
2019490
e49b505
572162c
21962a8
257351c
d982034
e9bd84d
50ed91e
2270497
12d2349
742d980
ddab5ac
0bf3f48
322a444
a89564d
87c0a5a
225b8bd
d689e33
c5cabda
538c8b7
8674c2c
f8e4cfc
4d04298
add2611
b62d996
888d3eb
7eff044
be69781
f74cc74
5ac6eaa
0177853
2cf668d
6c8f931
a2e2e9a
e1f3cbb
d28041b
0692e8f
e40ca18
2f3a1b8
f50e740
ba7efe5
a997d96
96c61b3
2152afa
124324b
0c4f237
76f0387
6e2cfdc
87b7608
0d8ca34
66454dc
de8f056
404708b
b526749
7847d5c
154596c
0e97ca4
c61f024
654a8f6
05f37f0
561f38d
e764cd9
a0f32a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,3 +53,4 @@ number_submission | |
.pypirc | ||
dmypy.json | ||
*.log | ||
.noseids |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import scipy.sparse | ||
|
||
from sklearn.preprocessing import OneHotEncoder as DenseOneHotEncoder | ||
|
||
from ConfigSpace.configuration_space import ConfigurationSpace | ||
|
||
from autosklearn.pipeline.implementations.SparseOneHotEncoder import SparseOneHotEncoder | ||
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm | ||
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT | ||
|
||
|
||
class OneHotEncoder(AutoSklearnPreprocessingAlgorithm):
    """Pipeline component that one-hot encodes its input.

    The concrete encoder is chosen at fit time: ``SparseOneHotEncoder`` for
    scipy sparse input, otherwise scikit-learn's ``OneHotEncoder`` (imported
    in this module as ``DenseOneHotEncoder``) configured for dense output.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        # Define the attribute up front so that ``transform`` can detect the
        # unfitted state instead of failing with an AttributeError.
        self.preprocessor = None

    def fit(self, X, y=None):
        """Select the encoder matching the sparsity of ``X`` and fit it.

        Returns ``self`` so that calls can be chained.
        """
        if scipy.sparse.issparse(X):
            self.preprocessor = SparseOneHotEncoder()
        else:
            # Categories unseen during fit are ignored at transform time
            # rather than raising (handle_unknown='ignore').
            self.preprocessor = DenseOneHotEncoder(
                sparse=False, categories='auto', handle_unknown='ignore')
        self.preprocessor.fit(X, y)
        return self

    def transform(self, X):
        """Encode ``X`` with the fitted encoder.

        Raises ``NotImplementedError`` when called before ``fit``.
        """
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoded form."""
        return self.fit(X, y).transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static property dictionary describing this component.

        ``dataset_properties`` is accepted but not used.
        """
        return {'shortname': '1Hot',
                'name': 'One Hot Encoder',
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                # TODO find out if this is right!
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,), }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Return an empty ``ConfigurationSpace``: nothing is tunable here."""
        return ConfigurationSpace()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import autosklearn.pipeline.implementations.CategoryShift | ||
|
||
from ConfigSpace.configuration_space import ConfigurationSpace | ||
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm | ||
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT | ||
|
||
|
||
class CategoryShift(AutoSklearnPreprocessingAlgorithm):
    """ Add 3 to every category.
    Down in the pipeline, category 2 will be attributed to missing values,
    category 1 will be assigned to low occurrence categories, and category 0
    is not used, so as to provide compatibility with sparse matrices.
    """

    def __init__(self, random_state=None):
        # Keep the constructor argument as a same-named attribute, as the
        # other components do, so parameter introspection can find it.
        self.random_state = random_state
        # Define the attribute up front so that ``transform`` can detect the
        # unfitted state instead of failing with an AttributeError.
        self.preprocessor = None

    def fit(self, X, y=None):
        """Create the underlying ``CategoryShift`` implementation and fit it.

        Returns ``self`` so that calls can be chained.
        """
        self.preprocessor = autosklearn.pipeline.implementations.CategoryShift\
            .CategoryShift()
        self.preprocessor.fit(X, y)
        return self

    def transform(self, X):
        """Apply the fitted preprocessor to ``X``.

        Raises ``NotImplementedError`` when called before ``fit``.
        """
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its transformed form."""
        return self.fit(X, y).transform(X)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Return the static property dictionary describing this component.

        ``dataset_properties`` is accepted but not used.
        """
        return {'shortname': 'CategShift',
                'name': 'Category Shift',
                'handles_missing_values': True,
                'handles_nominal_values': True,
                'handles_numerical_features': True,
                'prefers_data_scaled': False,
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                # TODO find out if this is right!
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,),
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        """Return an empty ``ConfigurationSpace``: nothing is tunable here."""
        return ConfigurationSpace()
Uh oh!
There was an error while loading. Please reload this page.