Source code for mala.network.hyper_opt_optuna

"""Hyperparameter optimizer using optuna."""

import pickle

import optuna

from mala.common.parameters import printout
from mala.network.hyper_opt import HyperOpt
from mala.network.objective_base import ObjectiveBase
from mala.network.naswot_pruner import NASWOTPruner
from mala.network.multi_training_pruner import MultiTrainingPruner
from mala.common.parallelizer import parallel_warn


class HyperOptOptuna(HyperOpt):
    """
    Hyperparameter optimizer using Optuna.

    Parameters
    ----------
    params : mala.common.parameters.Parameters
        Parameters used to create this hyperparameter optimizer.

    data : mala.datahandling.data_handler.DataHandler
        DataHandler holding the data for the hyperparameter optimization.

    use_pkl_checkpoints : bool
        If true, .pkl checkpoints will be created.

    Attributes
    ----------
    params : mala.common.parameters.Parameters
        MALA Parameters object.

    objective : mala.network.objective_base.ObjectiveBase
        MALA objective to be optimized, i.e., a MALA NN model training.

    study : optuna.study.Study
        An Optuna study used to collect the results of the hyperparameter
        optimization.
    """

    def __init__(self, params, data, use_pkl_checkpoints=False):
        super(HyperOptOptuna, self).__init__(
            params, data, use_pkl_checkpoints=use_pkl_checkpoints
        )
        self.params = params
        hyperparameters = self.params.hyperparameters

        # Make the sampler behave in a reproducible way, if so specified by
        # the user.
        sampler = optuna.samplers.TPESampler(
            seed=params.manual_seed,
            multivariate=hyperparameters.use_multivariate,
        )

        # See if the user specified a pruner.
        pruner = None
        pruner_type = hyperparameters.pruner
        if pruner_type == "naswot":
            pruner = NASWOTPruner(self.params, data)
        elif pruner_type == "multi_training":
            if hyperparameters.number_training_per_trial > 1:
                pruner = MultiTrainingPruner(self.params)
            else:
                # This pruner compares several trainings of one trial, so
                # it is skipped if there is only one training per trial.
                printout(
                    "MultiTrainingPruner requested, but only one "
                    "training per trial specified; Skipping pruner "
                    "creation."
                )
        elif pruner_type is not None:
            raise Exception("Invalid pruner type selected.")

        # Create the study.
        if hyperparameters.rdb_storage is None:
            # No RDB storage given: no storage is passed to Optuna here; the
            # study is persisted by pickling it in the checkpoint callback.
            self.study = optuna.create_study(
                direction=hyperparameters.direction,
                sampler=sampler,
                study_name=hyperparameters.study_name,
                pruner=pruner,
            )
        else:
            if hyperparameters.study_name is None:
                raise Exception(
                    "If RDB storage is used, a name for the study "
                    "has to be provided."
                )

            # Only SQLite storages get the user-specified connection timeout.
            if "sqlite" in hyperparameters.rdb_storage:
                engine_kwargs = {
                    "connect_args": {
                        "timeout": hyperparameters.sqlite_timeout
                    }
                }
            else:
                engine_kwargs = None

            rdb_storage = optuna.storages.RDBStorage(
                url=hyperparameters.rdb_storage,
                heartbeat_interval=hyperparameters.rdb_storage_heartbeat,
                engine_kwargs=engine_kwargs,
            )

            # load_if_exists=True lets several processes (or a resumed run)
            # attach to the same named study.
            self.study = optuna.create_study(
                direction=hyperparameters.direction,
                sampler=sampler,
                study_name=hyperparameters.study_name,
                storage=rdb_storage,
                load_if_exists=True,
                pruner=pruner,
            )

        # Number of trials that have finished since the last checkpoint.
        self._checkpoint_counter = 0

    def perform_study(self):
        """
        Perform the study, i.e. the optimization.

        This is done by sampling a certain subset of network architectures.
        In this case, optuna is used.

        Returns
        -------
        best_trial_loss : float
            Loss of the best trial.
        """
        # The parameters could have changed.
        self.objective = ObjectiveBase(self.params, self._data_handler)

        # Fill callback list based on user checkpoint wishes.
        callback_list = [self.__check_stopping]
        if self.params.hyperparameters.checkpoints_each_trial != 0:
            callback_list.append(self.__create_checkpointing)

        # n_trials is deliberately None: the stopping callback decides when
        # the study ends.
        self.study.optimize(
            self.objective, n_trials=None, callbacks=callback_list
        )

        # Return the best loss value we could achieve.
        return self.study.best_value

    def set_optimal_parameters(self):
        """
        Set the optimal parameters found in the present study.

        The parameters will be written to the parameter object with which the
        hyperparameter optimizer was created.
        """
        # The objective is normally created in perform_study(). If this
        # optimizer was, e.g., resumed from a checkpoint for analysis only,
        # it has to be created here.
        if getattr(self, "objective", None) is None:
            self.objective = ObjectiveBase(self.params, self._data_handler)

        # Parse the parameters from the best trial.
        self.objective.parse_trial_optuna(self.study.best_trial)

    def get_trials_from_study(self):
        """
        Return the trials from the last study.

        Only returns completed trials.

        Returns
        -------
        last_trials: list
            A list of optuna.FrozenTrial objects.
        """
        return self.study.get_trials(
            states=(optuna.trial.TrialState.COMPLETE,)
        )

    @staticmethod
    def requeue_zombie_trials(study_name, rdb_storage):
        """
        Put zombie trials back into the queue to be investigated.

        When using Optuna with scheduling systems in HPC infrastructure,
        zombie trials can occur. These are trials that are still marked
        as "RUNNING", but are, in actuality, dead, since the HPC job ended.
        This function takes a saved hyperparameter study, and sets all
        "RUNNING" trials to "WAITING". Upon the next execution from
        checkpoint, they will be executed.

        BE CAREFUL! DO NOT APPLY THIS TO A RUNNING STUDY, IT WILL MESS THE
        STUDY UP! ONLY USE THIS ONCE ALL JOBS HAVE FINISHED, TO CLEAN UP,
        AND THEN RESUBMIT!

        Parameters
        ----------
        study_name : string
            Name of the study in the storage. Same as the checkpoint name.

        rdb_storage : string
            Address of the RDB storage to be cleaned.
        """
        study_to_clean = optuna.load_study(
            study_name=study_name, storage=rdb_storage
        )
        parallel_warn(
            "WARNING: Your about to clean/requeue a study."
            " This operation should not be done to an already"
            " running study."
        )
        storage = study_to_clean._storage
        cleaned_trials = []
        for trial in study_to_clean.get_trials():
            if trial.state != optuna.trial.TrialState.RUNNING:
                continue

            kwds = dict(
                trial_id=trial._trial_id,
                state=optuna.trial.TrialState.WAITING,
            )
            # The storage method used to reset a trial state was renamed
            # between Optuna major versions.
            if hasattr(storage, "set_trial_state"):
                # Optuna 2.x
                storage.set_trial_state(**kwds)
            else:
                # Optuna 3.x
                storage.set_trial_state_values(values=None, **kwds)
            cleaned_trials.append(trial.number)
        printout("Cleaned trials: ", cleaned_trials, min_verbosity=0)

    @classmethod
    def resume_checkpoint(
        cls,
        checkpoint_name,
        alternative_storage_path=None,
        no_data=False,
        use_pkl_checkpoints=False,
    ):
        """
        Prepare resumption of hyperparameter optimization from a checkpoint.

        Please note that to actually resume the optimization,
        HyperOptOptuna.perform_study() still has to be called.

        Parameters
        ----------
        checkpoint_name : string
            Name of the checkpoint from which the checkpoint is loaded.

        alternative_storage_path: string
            Alternative storage string to load the study from.
            For applications on an HPC cluster it might be necessary to
            slightly modify the storage path between runs, since the SQL
            server might be running on different nodes each time.

        no_data : bool
            If True, the data won't actually be loaded into RAM or scaled.
            This can be useful for cases where a checkpoint is loaded
            for analysis purposes.

        use_pkl_checkpoints : bool
            If true, .pkl checkpoints will be loaded. The setting is also
            handed to the reconstructed optimizer.

        Returns
        -------
        loaded_params : mala.common.parameters.Parameters
            The Parameters saved in the checkpoint.

        new_datahandler : mala.datahandling.data_handler.DataHandler
            The data handler reconstructed from the checkpoint.

        new_hyperopt : HyperOptOptuna
            The hyperparameter optimizer reconstructed from the checkpoint.
        """
        loaded_params, new_datahandler, optimizer_name = (
            cls._resume_checkpoint(
                checkpoint_name,
                no_data=no_data,
                use_pkl_checkpoints=use_pkl_checkpoints,
            )
        )
        if alternative_storage_path is not None:
            loaded_params.hyperparameters.rdb_storage = (
                alternative_storage_path
            )
        new_hyperopt = cls.load_from_file(
            loaded_params,
            optimizer_name,
            new_datahandler,
            use_pkl_checkpoints=use_pkl_checkpoints,
        )

        return loaded_params, new_datahandler, new_hyperopt

    @classmethod
    def load_from_file(
        cls, params, file_path, data, use_pkl_checkpoints=False
    ):
        """
        Load a hyperparameter optimizer from a file.

        Parameters
        ----------
        params : mala.common.parameters.Parameters
            Parameters object with which the hyperparameter optimizer
            should be created. Has to be compatible with data.

        file_path : string
            Path to the file from which the hyperparameter optimizer should
            be loaded. Only read if no RDB storage is set in params.

        data : mala.datahandling.data_handler.DataHandler
            DataHandler holding the training data.

        use_pkl_checkpoints : bool
            Passed on to the constructor of the hyperparameter optimizer.

        Returns
        -------
        loaded_hyperopt : HyperOptOptuna
            The hyperparameter optimizer that was loaded from the file.
        """
        loaded_study = None
        if params.hyperparameters.rdb_storage is None:
            # SECURITY NOTE: unpickling executes arbitrary code contained in
            # the file. Only load checkpoint files from trusted sources.
            with open(file_path, "rb") as handle:
                loaded_study = pickle.load(handle)

        # With RDB storage, the constructor itself attaches to the stored
        # study; otherwise the freshly created study is replaced by the
        # unpickled one.
        loaded_hyperopt = cls(
            params, data, use_pkl_checkpoints=use_pkl_checkpoints
        )
        if loaded_study is not None:
            loaded_hyperopt.study = loaded_study

        return loaded_hyperopt

    @staticmethod
    def __has_complete_trial(study):
        """
        Check whether at least one trial of the study is COMPLETE.

        Accessing study.best_trial raises a ValueError if no trial has
        completed yet, so this has to be checked before doing so.

        Parameters
        ----------
        study : optuna.study.Study
            Study to be checked.

        Returns
        -------
        has_complete_trial : bool
            True if the study holds at least one COMPLETE trial.
        """
        return bool(
            study.get_trials(
                deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)
            )
        )

    def __get_number_of_completed_trials(self, study):
        """
        Get the number of completed trials from a study.

        Parameters
        ----------
        study : optuna.study.Study
            Study from which the number of completed trials should be
            extracted.

        Returns
        -------
        number_of_completed_trials : int
            Number of completed trials.
        """
        # How to calculate this depends on whether or not a heartbeat was
        # used. If one was used, then both COMPLETE and RUNNING trials
        # can be taken into account, as it can be expected that RUNNING
        # trials will actually finish. If no heartbeat is used,
        # then RUNNING trials might be Zombie trials.
        # See
        # https://github.com/optuna/optuna/issues/1883#issuecomment-841844834
        # https://github.com/optuna/optuna/issues/1883#issuecomment-842106950
        if self.params.hyperparameters.rdb_storage_heartbeat is None:
            counted_states = (optuna.trial.TrialState.COMPLETE,)
        else:
            counted_states = (
                optuna.trial.TrialState.COMPLETE,
                optuna.trial.TrialState.RUNNING,
            )

        # The trials are only counted, so no deep copy is needed.
        return len(study.get_trials(deepcopy=False, states=counted_states))

    def __check_stopping(self, study, trial):
        """
        Check if this trial was already the maximum number of trials.

        If so, stop the study. The study is also stopped if no new best
        trial was found within the user-specified number of trials.

        Parameters
        ----------
        study : optuna.study.Study
            Study in which the trial is running.

        trial : optuna.trial.FrozenTrial
            Trial for which the stopping condition should be tested.
        """
        hyperparameters = self.params.hyperparameters

        completed_trials = self.__get_number_of_completed_trials(study)
        if completed_trials >= hyperparameters.n_trials:
            study.stop()

        # Only check for a stagnating study if the user asked for it and
        # if a best trial exists. The latter requires a COMPLETE trial;
        # RUNNING trials (which may be counted above) are not sufficient.
        patience = hyperparameters.number_bad_trials_before_stopping
        if (
            patience is not None
            and patience > 0
            and self.__has_complete_trial(study)
        ):
            if trial.number - study.best_trial.number >= patience:
                printout(
                    "No new best trial found in",
                    patience,
                    "attempts, stopping the study.",
                )
                study.stop()

    def __create_checkpointing(self, study, trial):
        """
        Create a checkpoint of optuna study, if necessary.

        This is done based on an internal checkpoint counter. A positive
        checkpoints_each_trial creates a checkpoint every that many trials,
        a negative one creates a checkpoint whenever the current trial is
        the best trial.

        Parameters
        ----------
        study : optuna.study.Study
            Study in which the trial is running.

        trial : optuna.trial.FrozenTrial
            Trial for which the checkpoint may be created.
        """
        hyperparameters = self.params.hyperparameters
        interval = hyperparameters.checkpoints_each_trial

        self._checkpoint_counter += 1
        need_to_checkpoint = False

        if interval > 0 and self._checkpoint_counter >= interval:
            need_to_checkpoint = True
            printout(
                str(interval) + " trials have passed, creating a "
                "checkpoint for hyperparameter "
                "optimization.",
                min_verbosity=0,
            )

        # study.best_trial is only defined once a trial is COMPLETE.
        if interval < 0 and self.__has_complete_trial(study):
            if trial.number == study.best_trial.number:
                need_to_checkpoint = True
                printout(
                    "Best trial is " + str(trial.number) + ", creating a "
                    "checkpoint for it.",
                    min_verbosity=0,
                )

        if need_to_checkpoint:
            # We need to create a checkpoint!
            self._checkpoint_counter = 0

            self._save_params_and_scaler()

            # The study only has to be saved if no RDB storage is used.
            if hyperparameters.rdb_storage is None:
                hyperopt_name = (
                    hyperparameters.checkpoint_name + "_hyperopt.pth"
                )
                with open(hyperopt_name, "wb") as handle:
                    pickle.dump(self.study, handle, protocol=4)