Skip to content
Merged
113 changes: 55 additions & 58 deletions autosklearn/experimental/askl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,67 +18,63 @@
from autosklearn.classification import AutoSklearnClassifier
from autosklearn.metrics import Scorer, accuracy, balanced_accuracy, log_loss, roc_auc

metrics = (balanced_accuracy, roc_auc, log_loss)
selector_files = {}
this_directory = pathlib.Path(__file__).resolve().parent
for metric in metrics:
training_data_file = this_directory / metric.name / "askl2_training_data.json"
with open(training_data_file) as fh:
training_data = json.load(fh)
fh.seek(0)
m = hashlib.md5()
m.update(fh.read().encode("utf8"))
training_data_hash = m.hexdigest()[:10]
selector_filename = "askl2_selector_%s_%s_%s_%s.pkl" % (
autosklearn.__version__,
sklearn.__version__,
metric.name,
training_data_hash,
)
selector_directory = os.environ.get("XDG_CACHE_HOME")
if selector_directory is None:
selector_directory = pathlib.Path.home()
selector_directory = (
pathlib.Path(selector_directory).joinpath("auto-sklearn").expanduser()
)
selector_files[metric.name] = selector_directory / selector_filename
metafeatures = pd.DataFrame(training_data["metafeatures"])
strategies = training_data["strategies"]
y_values = pd.DataFrame(
training_data["y_values"], columns=strategies, index=metafeatures.index
)
minima_for_methods = training_data["minima_for_methods"]
maxima_for_methods = training_data["maxima_for_methods"]
default_strategies = training_data["tie_break_order"]
if not selector_files[metric.name].exists():
selector = autosklearn.experimental.selector.OVORF(
configuration=training_data["configuration"],
random_state=np.random.RandomState(1),
n_estimators=500,
tie_break_order=default_strategies,
def train_selectors():
global metrics
global selector_files
global strategies
global this_directory
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the reason to make these global?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those variables were being used inside the fit() function of askl2. A better solution would be to declare them as class variables instead, I guess.

metrics = (balanced_accuracy, roc_auc, log_loss)
selector_files = {}
this_directory = pathlib.Path(__file__).resolve().parent
for metric in metrics:
training_data_file = this_directory / metric.name / 'askl2_training_data.json'
with open(training_data_file) as fh:
training_data = json.load(fh)
fh.seek(0)
m = hashlib.md5()
m.update(fh.read().encode('utf8'))
training_data_hash = m.hexdigest()[:10]
selector_filename = "askl2_selector_%s_%s_%s_%s.pkl" % (
autosklearn.__version__,
sklearn.__version__,
metric.name,
training_data_hash
)
selector = autosklearn.experimental.selector.FallbackWrapper(
selector, default_strategies
)
selector.fit(
X=metafeatures,
y=y_values,
minima=minima_for_methods,
maxima=maxima_for_methods,
)
selector_files[metric.name].parent.mkdir(exist_ok=True, parents=True)

try:
with open(selector_files[metric.name], "wb") as fh:
pickle.dump(selector, fh)
except Exception as e:
print(
"AutoSklearn2Classifier needs to create a selector file under "
"the user's home directory or XDG_CACHE_HOME. Nevertheless "
"the path {} is not writable.".format(selector_files[metric.name])
selector_directory = os.environ.get('XDG_CACHE_HOME')
if selector_directory is None:
selector_directory = pathlib.Path.home()
selector_directory = pathlib.Path(selector_directory).joinpath('auto-sklearn').expanduser()
selector_files[metric.name] = selector_directory / selector_filename
metafeatures = pd.DataFrame(training_data['metafeatures'])
strategies = training_data['strategies']
y_values = pd.DataFrame(training_data['y_values'], columns=strategies, index=metafeatures.index)
minima_for_methods = training_data['minima_for_methods']
maxima_for_methods = training_data['maxima_for_methods']
default_strategies = training_data['tie_break_order']
if not selector_files[metric.name].exists():
selector = autosklearn.experimental.selector.OVORF(
configuration=training_data['configuration'],
random_state=np.random.RandomState(1),
n_estimators=500,
tie_break_order=default_strategies,
)
raise e
selector = autosklearn.experimental.selector.FallbackWrapper(selector, default_strategies)
selector.fit(
X=metafeatures,
y=y_values,
minima=minima_for_methods,
maxima=maxima_for_methods,
)
selector_files[metric.name].parent.mkdir(exist_ok=True, parents=True)

try:
with open(selector_files[metric.name], 'wb') as fh:
pickle.dump(selector, fh)
except Exception as e:
print("AutoSklearn2Classifier needs to create a selector file under "
"the user's home directory or XDG_CACHE_HOME. Nevertheless "
"the path {} is not writable.".format(selector_files[metric.name]))
raise e

class SmacObjectCallback:
def __init__(self, portfolio):
Expand Down Expand Up @@ -339,6 +335,7 @@ def __init__(
"classifier": include_estimators,
"feature_preprocessor": include_preprocessors,
}
train_selectors()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we can improve the startup time further by only training the selector for the metric that is given. In the case of no metric given, I guess we can just default to using all of them.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do see the metric parameter that is accepted by askl2. Should this be a list of metrics? And if nothing is passed, we could use the default ones.

super().__init__(
time_left_for_this_task=time_left_for_this_task,
per_run_time_limit=per_run_time_limit,
Expand Down