automl · mfeurer · Jan 29, 2020 · Sep 17, 2019 · Oct 2, 2019 · Oct 9, 2019
diff --git a/.gitignore b/.gitignore
@@ -53,3 +53,4 @@ number_submission
 .pypirc
 dmypy.json
 *.log
+.noseids
diff --git a/autosklearn/automl.py b/autosklearn/automl.py
@@ -148,7 +148,7 @@ def fit(
         metric: Scorer,
         X_test: Optional[np.ndarray] = None,
         y_test: Optional[np.ndarray] = None,
-        feat_type: Optional[List[bool]] = None,
+        feat_type: Optional[List[str]] = None,
         dataset_name: Optional[str] = None,
         only_return_configuration_space: Optional[bool] = False,
         load_models: bool = True,

diff --git a/autosklearn/data/xy_data_manager.py b/autosklearn/data/xy_data_manager.py
@@ -61,6 +61,6 @@ def __init__(self, X, y, X_test, y_test, task, feat_type, dataset_name):
         if self.feat_type is None:
             self.feat_type = ['Numerical'] * X.shape[1]
         if X.shape[1] != len(self.feat_type):
-            raise ValueError('X and feat type must have the same dimensions, '
+            raise ValueError('X and feat_type must have the same number of columns, '
                              'but are %d and %d.' %
                              (X.shape[1], len(self.feat_type)))
diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py
@@ -156,7 +156,7 @@ def __init__(self, backend, queue, metric,
                 raise ValueError(feat)
         if np.sum(categorical_mask) > 0:
             self._init_params = {
-                'categorical_encoding:one_hot_encoding:categorical_features':
+                'data_preprocessing:categorical_features':
                     categorical_mask
             }
         else:

diff --git a/autosklearn/pipeline/base.py b/autosklearn/pipeline/base.py
@@ -193,15 +193,15 @@ def set_hyperparameters(self, configuration, init_params=None):
             else:
                 sub_init_params_dict = None
 
-            if isinstance(node, (AutoSklearnChoice, AutoSklearnComponent)):
+            if isinstance(node, (AutoSklearnChoice, AutoSklearnComponent, BasePipeline)):
                 node.set_hyperparameters(configuration=sub_configuration,
                                          init_params=sub_init_params_dict)
             else:
                 raise NotImplementedError('Not supported yet!')
 
         return self
 
-    def get_hyperparameter_search_space(self):
+    def get_hyperparameter_search_space(self, dataset_properties=None):
         """Return the configuration space for the CASH problem.
 
         Returns

diff --git a/autosklearn/pipeline/classification.py b/autosklearn/pipeline/classification.py
@@ -8,16 +8,15 @@
 from ConfigSpace.configuration_space import ConfigurationSpace
 from ConfigSpace.forbidden import ForbiddenEqualsClause, ForbiddenAndConjunction
 
+from autosklearn.pipeline.components.data_preprocessing.feature_type_splitter import FeatureTypeSplitter
+from autosklearn.pipeline.components.data_preprocessing.data_preprocessing_categorical import CategoricalPreprocessingPipeline
+from autosklearn.pipeline.components.data_preprocessing.data_preprocessing_numerical import NumericalPreprocessingPipeline
+
 from autosklearn.pipeline.components import classification as \
     classification_components
-from autosklearn.pipeline.components.data_preprocessing import rescaling as \
-    rescaling_components
 from autosklearn.pipeline.components.data_preprocessing.balancing.balancing import \
     Balancing
-from autosklearn.pipeline.components.data_preprocessing.imputation.imputation \
-    import Imputation
-from autosklearn.pipeline.components.data_preprocessing.one_hot_encoding \
-    import OHEChoice
+
 from autosklearn.pipeline.components import feature_preprocessing as \
     feature_preprocessing_components
 from autosklearn.pipeline.components.data_preprocessing.variance_threshold.variance_threshold \
@@ -41,7 +40,7 @@ class SimpleClassificationPipeline(ClassifierMixin, BasePipeline):
 
     Parameters
     ----------
-    configuration : ConfigSpace.configuration_space.Configuration
+    config : ConfigSpace.configuration_space.Configuration
         The configuration to evaluate.
 
     random_state : int, RandomState instance or None, optional (default=None)
@@ -91,7 +90,7 @@ def fit_transformer(self, X, y, fit_params=None):
             balancing = Balancing(strategy='weighting')
             _init_params, _fit_params = balancing.get_weights(
                 y, self.configuration['classifier:__choice__'],
-                self.configuration['preprocessor:__choice__'],
+                self.configuration['feature_preprocessor:__choice__'],
                 {}, {})
             _init_params.update(self._init_params)
             self.set_hyperparameters(configuration=self.configuration,
@@ -181,7 +180,7 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None,
             exclude=exclude, include=include, pipeline=self.steps)
 
         classifiers = cs.get_hyperparameter('classifier:__choice__').choices
-        preprocessors = cs.get_hyperparameter('preprocessor:__choice__').choices
+        preprocessors = cs.get_hyperparameter('feature_preprocessor:__choice__').choices
         available_classifiers = self._final_estimator.get_available_components(
             dataset_properties)
 
@@ -204,7 +203,7 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None,
                                             'classifier:__choice__'), key),
                                     ForbiddenEqualsClause(
                                         cs.get_hyperparameter(
-                                            'preprocessor:__choice__'), 'densifier')
+                                            'feature_preprocessor:__choice__'), 'densifier')
                                 ))
                             # Success
                             break
@@ -236,7 +235,7 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None,
                         ForbiddenEqualsClause(cs.get_hyperparameter(
                             "classifier:__choice__"), c),
                         ForbiddenEqualsClause(cs.get_hyperparameter(
-                            "preprocessor:__choice__"), f)))
+                            "feature_preprocessor:__choice__"), f)))
                     break
                 except KeyError:
                     break
@@ -265,7 +264,7 @@ def _get_hyperparameter_search_space(self, include=None, exclude=None,
                 try:
                     cs.add_forbidden_clause(ForbiddenAndConjunction(
                         ForbiddenEqualsClause(cs.get_hyperparameter(
-                            "preprocessor:__choice__"), f),
+                            "feature_preprocessor:__choice__"), f),
                         ForbiddenEqualsClause(cs.get_hyperparameter(
                             "classifier:__choice__"), c)))
                     break
@@ -290,25 +289,23 @@ def _get_pipeline(self):
 
         default_dataset_properties = {'target_type': 'classification'}
 
-        # Add the always active preprocessing components
-
-        steps.extend(
-            [["categorical_encoding", OHEChoice(default_dataset_properties)],
-             ["imputation", Imputation()],
-             ["variance_threshold", VarianceThreshold()],
-             ["rescaling",
-              rescaling_components.RescalingChoice(default_dataset_properties)],
-             ["balancing", Balancing()]])
-
-        # Add the preprocessing component
-        steps.append(['preprocessor',
-                      feature_preprocessing_components.FeaturePreprocessorChoice(
-                          default_dataset_properties)])
-
-        # Add the classification component
-        steps.append(['classifier',
+        data_preprocessing = FeatureTypeSplitter(
+            CategoricalPreprocessingPipeline(
+                dataset_properties=default_dataset_properties),
+            NumericalPreprocessingPipeline(
+                dataset_properties=default_dataset_properties))
+
+        steps.extend([
+            ["data_preprocessing", data_preprocessing],
+            ["balancing", Balancing()],
+            ["feature_preprocessor", 
+                feature_preprocessing_components.FeaturePreprocessorChoice(
+                    default_dataset_properties)],
+            ['classifier',
                       classification_components.ClassifierChoice(
-                          default_dataset_properties)])
+                          default_dataset_properties)]
+        ])
+
         return steps
 
     def _get_estimator_hyperparameter_name(self):

diff --git a/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py b/autosklearn/pipeline/components/data_preprocessing/balancing/balancing.py
@@ -55,7 +55,7 @@ def get_weights(self, Y, classifier, preprocessor, init_params, fit_params):
             if classifier in clf_:
                 fit_params['classifier:sample_weight'] = sample_weights
             if preprocessor in pre_:
-                fit_params['preprocessor:sample_weight'] = sample_weights
+                fit_params['feature_preprocessor:sample_weight'] = sample_weights
 
         # Classifiers which can adjust sample weights themselves via the
         # argument `class_weight`
@@ -66,7 +66,7 @@ def get_weights(self, Y, classifier, preprocessor, init_params, fit_params):
         if classifier in clf_:
             init_params['classifier:class_weight'] = 'balanced'
         if preprocessor in pre_:
-            init_params['preprocessor:class_weight'] = 'balanced'
+            init_params['feature_preprocessor:class_weight'] = 'balanced'
 
         clf_ = ['ridge']
         if classifier in clf_:

diff --git a/autosklearn/pipeline/components/data_preprocessing/category_shift/__init__.py b/autosklearn/pipeline/components/data_preprocessing/category_shift/__init__.py
@@ -0,0 +1,142 @@
+from collections import OrderedDict
+import os
+from ...base import AutoSklearnPreprocessingAlgorithm, find_components, \
+    ThirdPartyComponents, AutoSklearnChoice
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import CategoricalHyperparameter
+
+# NOTE(review): the `ohe` / `OHEChoice` names look carried over from the
+# one_hot_encoding package although this module lives in category_shift;
+# they are kept as-is because other modules may import them.
+ohe_directory = os.path.split(__file__)[0]
+_ohes = find_components(__package__,
+                        ohe_directory,
+                        AutoSklearnPreprocessingAlgorithm)
+_addons = ThirdPartyComponents(AutoSklearnPreprocessingAlgorithm)
+
+
+def add_ohe(ohe):
+    """Register `ohe` as an additional (third-party) component."""
+    _addons.add_component(ohe)
+
+
+class OHEChoice(AutoSklearnChoice):
+    """Choice between the components of this package and registered add-ons."""
+
+    def get_components(self):
+        """Return the package's components followed by the registered add-ons."""
+        components = OrderedDict()
+        components.update(_ohes)
+        components.update(_addons.components)
+        return components
+
+    def get_hyperparameter_search_space(self, dataset_properties=None,
+                                        default=None,
+                                        include=None,
+                                        exclude=None):
+        """Build the configuration space for this choice.
+
+        The space holds a categorical ``__choice__`` hyperparameter plus the
+        search space of every available component, each added under its name
+        with ``parent_hyperparameter`` tying it to that value of ``__choice__``.
+
+        Parameters
+        ----------
+        dataset_properties : dict, optional
+            Forwarded to the component lookup and to each component's own
+            search space; ``None`` is replaced by an empty dict.
+        default : str, optional
+            Default value of ``__choice__``. If ``None``, the first available
+            of 'one_hot_encoding' and 'no_encoding' is used; if neither is
+            available, ``None`` is passed on to ``CategoricalHyperparameter``.
+        include, exclude : optional
+            Forwarded to ``get_available_components``.
+
+        Returns
+        -------
+        ConfigurationSpace
+
+        Raises
+        ------
+        ValueError
+            If no component is available.
+        """
+        cs = ConfigurationSpace()
+
+        if dataset_properties is None:
+            dataset_properties = {}
+
+        # Compile a list of legal preprocessors for this problem
+        available_preprocessors = self.get_available_components(
+            dataset_properties=dataset_properties,
+            include=include, exclude=exclude)
+
+        if len(available_preprocessors) == 0:
+            raise ValueError(
+                "No one hot encoders found, please add any one hot encoder "
+                "component.")
+
+        if default is None:
+            # First preferred default that is actually available, else None.
+            defaults = ['one_hot_encoding', 'no_encoding']
+            default = next(
+                (d for d in defaults if d in available_preprocessors), None)
+
+        preprocessor = CategoricalHyperparameter(
+            '__choice__',
+            list(available_preprocessors.keys()),
+            default_value=default)
+        cs.add_hyperparameter(preprocessor)
+        for name in available_preprocessors:
+            preprocessor_configuration_space = available_preprocessors[name]. \
+                get_hyperparameter_search_space(dataset_properties)
+            parent_hyperparameter = {'parent': preprocessor, 'value': name}
+            cs.add_configuration_space(name, preprocessor_configuration_space,
+                                       parent_hyperparameter=parent_hyperparameter)
+
+        self.configuration_space_ = cs
+        self.dataset_properties_ = dataset_properties
+        return cs
+
+    def set_hyperparameters(self, configuration, init_params=None):
+        """Instantiate the chosen component from `configuration`.
+
+        Hyperparameter and init-parameter names are stripped of the chosen
+        component's name and of ':' before being passed to the component's
+        constructor as keyword arguments, together with ``random_state``.
+
+        Returns
+        -------
+        self
+        """
+        # Work on a copy so that dropping '__choice__' cannot alter the
+        # dictionary handed out by the configuration object.
+        params = dict(configuration.get_dictionary())
+        choice = params.pop('__choice__')
+
+        new_params = {}
+        for param, value in params.items():
+            param = param.replace(choice, '').replace(':', '')
+            new_params[param] = value
+
+        if init_params is not None:
+            for param, value in init_params.items():
+                # Unlike the base class, init params that do not mention the
+                # chosen component are skipped; this keeps the categorical
+                # feature indicator array from being passed to the no
+                # encoding component.
+                if choice not in param:
+                    continue
+                param = param.replace(choice, '').replace(':', '')
+                new_params[param] = value
+
+        new_params['random_state'] = self.random_state
+
+        self.new_params = new_params
+        self.choice = self.get_components()[choice](**new_params)
+
+        return self
+
+    def transform(self, X):
+        """Transform X with the component created by set_hyperparameters()."""
+        return self.choice.transform(X)
diff --git a/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py b/autosklearn/pipeline/components/data_preprocessing/category_shift/category_shift.py
@@ -0,0 +1,68 @@
+import numpy as np
+from scipy import sparse
+from sklearn.utils import check_array
+
+import autosklearn.pipeline.implementations.CategoryShift
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
+from autosklearn.pipeline.constants import *
+
+
+class CategoryShift(AutoSklearnPreprocessingAlgorithm):
+    """Add 3 to every category.
+
+    Further down the pipeline, category 2 will be attributed to missing
+    values and category 1 will be assigned to low-occurrence categories.
+    Category 0 is not used, in order to provide compatibility with sparse
+    matrices.
+    """
+
+    def __init__(self, random_state=None):
+        # `random_state` is accepted but not used anywhere in this class.
+        # Initialise the attribute that transform() checks, so that calling
+        # transform() before fit() reaches the intended NotImplementedError.
+        self.preprocessor = None
+
+    def fit(self, X, y=None):
+        """Create the wrapped CategoryShift implementation and fit it on X."""
+        self.preprocessor = autosklearn.pipeline.implementations.CategoryShift\
+            .CategoryShift()
+        self.preprocessor.fit(X, y)
+        return self
+
+    def transform(self, X):
+        """Transform X with the fitted implementation.
+
+        Raises NotImplementedError if fit() has not been called.
+        """
+        if self.preprocessor is None:
+            raise NotImplementedError()
+        return self.preprocessor.transform(X)
+
+    @staticmethod
+    def get_properties(dataset_properties=None):
+        """Return the static description of this component."""
+        return {'shortname': 'CategShift',
+                'name': 'Category Shift',
+                'handles_missing_values': True,
+                'handles_nominal_values': True,
+                'handles_numerical_features': True,
+                'prefers_data_scaled': False,
+                'prefers_data_normalized': False,
+                'handles_regression': True,
+                'handles_classification': True,
+                'handles_multiclass': True,
+                'handles_multilabel': True,
+                'is_deterministic': True,
+                # TODO find out if this is right!
+                'handles_sparse': True,
+                'handles_dense': True,
+                'input': (DENSE, SPARSE, UNSIGNED_DATA),
+                'output': (INPUT,),
+                'preferred_dtype': None}
+
+    @staticmethod
+    def get_hyperparameter_search_space(dataset_properties=None):
+        """Return an empty configuration space; nothing is tunable here."""
+        return ConfigurationSpace()
-Original file line number
+Diff line change
@@ Expand Up / @@ -53,3 +53,4 @@ number_submission @@
     .pypirc
     dmypy.json
     *.log
+    .noseids