Fix rare edge case with extremely imbalanced data (#1244)

mfeurer · web-flow · commit ff11e5ab980d · 2021-09-13T20:14:20.000+02:00
* Fix rare edge case with extremely imbalanced data

For dataset 360112 Auto-sklearn would fail because the data would
first be sub-sampled and then contain some classes only once.
In the internal splitting, the StratifiedShuffleSplit would not
be able to split the dataset into train and valid, and would resort
to only a ShuffleSplit. This could put the single sample for a
class into the test set. At predict time we would then miss one class.

This commit creates two new splitters which move a sample from the
test split to the training split if a class does not exist in the
train split.

* fix unit test
diff --git a/autosklearn/evaluation/splitter.py b/autosklearn/evaluation/splitter.py
@@ -0,0 +1,161 @@
+import numpy as np
+
+from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
+from sklearn.model_selection._split import _validate_shuffle_split
+from sklearn.utils import indexable, check_random_state
+from sklearn.utils import _approximate_mode
+from sklearn.utils.validation import _num_samples, column_or_1d
+from sklearn.utils.validation import check_array
+from sklearn.utils.multiclass import type_of_target
+
+
+class CustomStratifiedShuffleSplit(StratifiedShuffleSplit):
+    """Stratified ShuffleSplit cross-validator that deals with classes with too few samples
+    """
+
+    def _iter_indices(self, X, y, groups=None):  # type: ignore
+        n_samples = _num_samples(X)
+        y = check_array(y, ensure_2d=False, dtype=None)
+        n_train, n_test = _validate_shuffle_split(
+            n_samples, self.test_size, self.train_size,
+            default_test_size=self._default_test_size)
+
+        if y.ndim == 2:
+            # for multi-label y, map each distinct row to a string repr
+            # using join because str(row) uses an ellipsis if len(row) > 1000
+            y = np.array([' '.join(row.astype('str')) for row in y])
+
+        classes, y_indices = np.unique(y, return_inverse=True)
+        n_classes = classes.shape[0]
+
+        class_counts = np.bincount(y_indices)
+        # print(class_counts)
+
+        if n_train < n_classes:
+            raise ValueError('The train_size = %d should be greater or '
+                             'equal to the number of classes = %d' %
+                             (n_train, n_classes))
+        if n_test < n_classes:
+            raise ValueError('The test_size = %d should be greater or '
+                             'equal to the number of classes = %d' %
+                             (n_test, n_classes))
+
+        # Find the sorted list of instances for each class:
+        # (np.unique above performs a sort, so code is O(n logn) already)
+        class_indices = np.split(np.argsort(y_indices, kind='mergesort'),
+                                 np.cumsum(class_counts)[:-1])
+
+        rng = check_random_state(self.random_state)
+
+        for _ in range(self.n_splits):
+            # if there are ties in the class-counts, we want
+            # to make sure to break them anew in each iteration
+            n_i = _approximate_mode(class_counts, n_train, rng)
+            class_counts_remaining = class_counts - n_i
+            t_i = _approximate_mode(class_counts_remaining, n_test, rng)
+            train = []
+            test = []
+
+            for i in range(n_classes):
+                # print("Before", i, class_counts[i], n_i[i], t_i[i])
+                permutation = rng.permutation(class_counts[i])
+                perm_indices_class_i = class_indices[i].take(permutation,
+                                                             mode='clip')
+                if n_i[i] == 0:
+                    n_i[i] = 1
+                    t_i[i] = t_i[i] - 1
+
+                # print("After", i, class_counts[i], n_i[i], t_i[i])
+                train.extend(perm_indices_class_i[:n_i[i]])
+                test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])
+
+            train = rng.permutation(train)
+            test = rng.permutation(test)
+
+            yield train, test
+
+
+class CustomStratifiedKFold(StratifiedKFold):
+    """Stratified K-Folds cross-validator that ensures that there is always at least
+    1 sample per class in the training set.
+    """
+
+    def _make_test_folds(self, X, y=None):  # type: ignore
+        rng = check_random_state(self.random_state)
+        y = np.asarray(y)
+        type_of_target_y = type_of_target(y)
+        allowed_target_types = ('binary', 'multiclass')
+        if type_of_target_y not in allowed_target_types:
+            raise ValueError(
+                'Supported target types are: {}. Got {!r} instead.'.format(
+                    allowed_target_types, type_of_target_y))
+
+        y = column_or_1d(y)
+
+        _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True)
+        # y_inv encodes y according to lexicographic order. We invert y_idx to
+        # map the classes so that they are encoded by order of appearance:
+        # 0 represents the first label appearing in y, 1 the second, etc.
+        _, class_perm = np.unique(y_idx, return_inverse=True)
+        y_encoded = class_perm[y_inv]
+
+        n_classes = len(y_idx)
+
+        # Determine the optimal number of samples from each class in each fold,
+        # using round robin over the sorted y. (This can be done direct from
+        # counts, but that code is unreadable.)
+        y_order = np.sort(y_encoded)
+        allocation = np.asarray(
+            [np.bincount(y_order[i::self.n_splits], minlength=n_classes)
+             for i in range(self.n_splits)])
+
+        # To maintain the data order dependencies as best as possible within
+        # the stratification constraint, we assign samples from each class in
+        # blocks (and then mess that up when shuffle=True).
+        test_folds = np.empty(len(y), dtype='i')
+        for k in range(n_classes):
+            # since the kth column of allocation stores the number of samples
+            # of class k in each test set, this generates blocks of fold
+            # indices corresponding to the allocation for class k.
+            folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k])
+            if self.shuffle:
+                rng.shuffle(folds_for_class)
+            test_folds[y_encoded == k] = folds_for_class
+        return test_folds
+
+    def split(self, X, y=None, groups=None):  # type: ignore
+
+        X, y, groups = indexable(X, y, groups)
+        n_samples = _num_samples(X)
+        if self.n_splits > n_samples:
+            raise ValueError(
+                ("Cannot have number of splits n_splits={0} greater"
+                 " than the number of samples: n_samples={1}.")
+                .format(self.n_splits, n_samples))
+
+        for train, test in super().split(X, y, groups):
+            # print(len(np.unique(y)), len(np.unique(y[train])), len(np.unique(y[test])))
+            all_classes = np.unique(y)
+            train_classes = np.unique(y[train])
+            train = list(train)
+            test = list(test)
+            missing_classes = set(all_classes) - set(train_classes)
+            if len(missing_classes) > 0:
+                # print(missing_classes)
+                for diff in missing_classes:
+                    # print(len(train), len(test))
+                    to_move = np.where(y[test] == diff)[0][0]
+                    # print(y[test][to_move])
+                    train = train + [test[to_move]]
+                    del test[to_move]
+                    # print(len(train), len(test))
+            train = np.array(train, dtype=int)
+            test = np.array(test, dtype=int)
+            # print(
+            #     len(np.unique(y)),
+            #     len(np.unique(y[train])),
+            #     len(np.unique(y[test])),
+            #     len(train), len(test),
+            # )
+
+            yield train, test
diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py
@@ -1,5 +1,6 @@
 import logging
 import multiprocessing
+import warnings
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 import copy
@@ -21,6 +22,7 @@
     TYPE_ADDITIONAL_INFO,
     _fit_and_suppress_warnings,
 )
+from autosklearn.evaluation.splitter import CustomStratifiedShuffleSplit, CustomStratifiedKFold
 from autosklearn.data.abstract_data_manager import AbstractDataManager
 from autosklearn.constants import (
     CLASSIFICATION_TASKS,
@@ -1037,15 +1039,20 @@ def get_splitter(self, D: AbstractDataManager) -> Union[BaseCrossValidator, _Rep
 
                 if shuffle:
                     try:
-                        cv = StratifiedShuffleSplit(n_splits=1,
-                                                    test_size=test_size,
-                                                    random_state=1)
+                        cv = StratifiedShuffleSplit(
+                            n_splits=1,
+                            test_size=test_size,
+                            random_state=1,
+                        )
                         test_cv = copy.deepcopy(cv)
                         next(test_cv.split(y, y))
                     except ValueError as e:
                         if 'The least populated class in y has only' in e.args[0]:
-                            cv = ShuffleSplit(n_splits=1, test_size=test_size,
-                                              random_state=1)
+                            cv = CustomStratifiedShuffleSplit(
+                                n_splits=1,
+                                test_size=test_size,
+                                random_state=1,
+                            )
                         else:
                             raise e
                 else:
@@ -1057,9 +1064,26 @@ def get_splitter(self, D: AbstractDataManager) -> Union[BaseCrossValidator, _Rep
             elif self.resampling_strategy in ['cv', 'cv-iterative-fit', 'partial-cv',
                                               'partial-cv-iterative-fit']:
                 if shuffle:
-                    cv = StratifiedKFold(
-                        n_splits=self.resampling_strategy_args['folds'],
-                        shuffle=shuffle, random_state=1)
+                    try:
+                        with warnings.catch_warnings():
+                            warnings.simplefilter('error')
+                            cv = StratifiedKFold(
+                                n_splits=self.resampling_strategy_args['folds'],
+                                shuffle=shuffle,
+                                random_state=1,
+                            )
+                            test_cv = copy.deepcopy(cv)
+                            next(test_cv.split(y, y))
+                    except UserWarning as e:
+                        print(e)
+                        if 'The least populated class in y has only' in e.args[0]:
+                            cv = CustomStratifiedKFold(
+                                n_splits=self.resampling_strategy_args['folds'],
+                                shuffle=shuffle,
+                                random_state=1,
+                            )
+                        else:
+                            raise e
                 else:
                     cv = KFold(n_splits=self.resampling_strategy_args['folds'],
                                shuffle=shuffle)
diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py
@@ -19,6 +19,7 @@
 import sklearn.model_selection
 from smac.tae import StatusType, TAEAbortException
 
+import autosklearn.evaluation.splitter
 from autosklearn.data.abstract_data_manager import AbstractDataManager
 from autosklearn.evaluation.util import read_queue
 from autosklearn.evaluation.train_evaluator import TrainEvaluator, \
@@ -1080,17 +1081,17 @@ def test_get_splitter(self, te_mock):
         self.assertIsInstance(cv,
                               sklearn.model_selection.PredefinedSplit)
 
-        # holdout, binary classification, fallback to shuffle split
+        # holdout, binary classification, fallback to custom shuffle split
         D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1, 2])
         evaluator = TrainEvaluator()
         evaluator.resampling_strategy = 'holdout'
         evaluator.resampling_strategy_args = {}
         cv = evaluator.get_splitter(D)
         self.assertIsInstance(cv,
-                              sklearn.model_selection._split.ShuffleSplit)
+                              autosklearn.evaluation.splitter.CustomStratifiedShuffleSplit)
 
         # cv, binary classification
-        D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+        D.data['Y_train'] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
         evaluator = TrainEvaluator()
         evaluator.resampling_strategy = 'cv'
         evaluator.resampling_strategy_args = {'folds': 5}
@@ -1099,7 +1100,7 @@ def test_get_splitter(self, te_mock):
                               sklearn.model_selection._split.StratifiedKFold)
 
         # cv, binary classification, shuffle is True
-        D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1])
+        D.data['Y_train'] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
         evaluator = TrainEvaluator()
         evaluator.resampling_strategy = 'cv'
         evaluator.resampling_strategy_args = {'folds': 5}
@@ -1118,14 +1119,14 @@ def test_get_splitter(self, te_mock):
                               sklearn.model_selection._split.KFold)
         self.assertFalse(cv.shuffle)
 
-        # cv, binary classification, no fallback anticipated
-        D.data['Y_train'] = np.array([0, 0, 0, 1, 1, 1, 2])
+        # cv, binary classification, fallback to custom splitter
+        D.data['Y_train'] = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2])
         evaluator = TrainEvaluator()
         evaluator.resampling_strategy = 'cv'
         evaluator.resampling_strategy_args = {'folds': 5}
         cv = evaluator.get_splitter(D)
         self.assertIsInstance(cv,
-                              sklearn.model_selection._split.StratifiedKFold)
+                              autosklearn.evaluation.splitter.CustomStratifiedKFold)
 
         # regression, shuffle split
         D.data['Y_train'] = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5])