diff --git a/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py b/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py index a372dfc83c..575003ec4b 100644 --- a/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py +++ b/autosklearn/metalearning/metalearning/kNearestDatasets/kND.py @@ -2,6 +2,7 @@ import pandas as pd from sklearn.neighbors import NearestNeighbors +from sklearn.preprocessing import MinMaxScaler import sklearn.utils from ....util.logging_ import get_logger @@ -18,6 +19,7 @@ def __init__(self, metric='l1', random_state=None, metric_params=None): self.runs = None self.best_configuration_per_dataset = None self.random_state = sklearn.utils.check_random_state(random_state) + self.scaler = MinMaxScaler() if self.metric_params is None: self.metric_params = {} @@ -44,6 +46,9 @@ def fit(self, metafeatures, runs): self.runs = runs self.num_datasets = runs.shape[1] + # Fit the scaler on the metafeatures + self.scaler.fit(self.metafeatures) + # for each dataset, sort the runs according to their result best_configuration_per_dataset = {} for dataset_name in runs: @@ -102,8 +107,9 @@ def kNearestDatasets(self, x, k=1, return_distance=False): elif k == -1: k = self.num_datasets - X_train, x = self._scale(self.metafeatures, x) + X_train = self.scaler.transform(self.metafeatures) x = x.values.reshape((1, -1)) + x = self.scaler.transform(x) self._nearest_neighbors.fit(X_train) distances, neighbor_indices = self._nearest_neighbors.kneighbors( x, n_neighbors=k, return_distance=True) @@ -126,6 +132,7 @@ def kBestSuggestions(self, x, k=1, exclude_double_configurations=True): raise ValueError('Number of neighbors k cannot be zero or negative.') nearest_datasets, distances = self.kNearestDatasets(x, -1, return_distance=True) + kbest = [] added_configurations = set() @@ -151,20 +158,3 @@ def kBestSuggestions(self, x, k=1, exclude_double_configurations=True): if k == -1: k = len(kbest) return kbest[:k] - - def _scale(self, metafeatures, other): - 
assert isinstance(other, pd.Series), type(other) - assert other.values.dtype == np.float64 - scaled_metafeatures = metafeatures.copy(deep=True) - other = other.copy(deep=True) - - mins = scaled_metafeatures.min() - maxs = scaled_metafeatures.max() - # I also need to scale the target dataset meta features... - mins = pd.DataFrame(data=[mins, other]).min() - maxs = pd.DataFrame(data=[maxs, other]).max() - divisor = (maxs-mins) - divisor[divisor == 0] = 1 - scaled_metafeatures = (scaled_metafeatures - mins) / divisor - other = (other - mins) / divisor - return scaled_metafeatures, other diff --git a/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py b/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py index f561b16d2e..5adf420280 100644 --- a/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py +++ b/test/test_metalearning/pyMetaLearn/metalearning/test_kND.py @@ -42,21 +42,21 @@ def test_kNearestDatasets(self): neighbor, distance = kND.kNearestDatasets(self.anneal, 1, return_distance=True) self.assertEqual([233], neighbor) - np.testing.assert_array_almost_equal([1.82298937], distance) + np.testing.assert_array_almost_equal([3.8320802803440586], distance) neighbors = kND.kNearestDatasets(self.anneal, 2) self.assertEqual([233, 234], neighbors) neighbors, distance = kND.kNearestDatasets(self.anneal, 2, return_distance=True) self.assertEqual([233, 234], neighbors) - np.testing.assert_array_almost_equal([1.822989, 2.267919], distance) + np.testing.assert_array_almost_equal([3.8320802803440586, 4.367919719655942], distance) neighbors = kND.kNearestDatasets(self.anneal, -1) self.assertEqual([233, 234], neighbors) neighbors, distance = kND.kNearestDatasets(self.anneal, -1, return_distance=True) self.assertEqual([233, 234], neighbors) - np.testing.assert_array_almost_equal([1.822989, 2.267919], distance) + np.testing.assert_array_almost_equal([3.8320802803440586, 4.367919719655942], distance) self.assertRaises(ValueError, kND.kNearestDatasets, 
self.anneal, 0) self.assertRaises(ValueError, kND.kNearestDatasets, self.anneal, -2) @@ -67,35 +67,23 @@ def test_kBestSuggestions(self): self.runs.loc[:, [233, 234]]) neighbor = kND.kBestSuggestions(self.anneal, 1) np.testing.assert_array_almost_equal( - [(233, 1.8229893712531495, 1)], + [(233, 3.8320802803440586, 1)], neighbor, ) neighbors = kND.kBestSuggestions(self.anneal, 2) np.testing.assert_array_almost_equal( - [(233, 1.8229893712531495, 1), (234, 2.2679197196559415, 2)], + [(233, 3.8320802803440586, 1), (234, 4.367919719655942, 2)], neighbors, ) neighbors = kND.kBestSuggestions(self.anneal, -1) np.testing.assert_array_almost_equal( - [(233, 1.8229893712531495, 1), (234, 2.2679197196559415, 2)], + [(233, 3.8320802803440586, 1), (234, 4.367919719655942, 2)], neighbors, ) self.assertRaises(ValueError, kND.kBestSuggestions, self.anneal, 0) self.assertRaises(ValueError, kND.kBestSuggestions, self.anneal, -2) - def test_scale(self): - kND = KNearestDatasets() - metafeatures = pd.DataFrame([self.anneal, self.krvskp]) - metafeatures, other = kND._scale(metafeatures, self.labor) - from pandas.util.testing import assert_series_equal - # Series.equal does not work properly with floats... - assert_series_equal(metafeatures.iloc[0], - pd.Series({"number_of_instances": 0.267919719656, - "number_of_classes": 1, - "number_of_features": 1}, - name=232)) - def test_random_metric(self): kND = KNearestDatasets(metric=get_random_metric(random_state=1)) kND.fit(pd.DataFrame([self.krvskp, self.labor]),