From ab63caf2df2cb15b772b954d895795da7e8cd540 Mon Sep 17 00:00:00 2001
From: Arthur Juliani
Date: Tue, 3 Apr 2018 17:33:43 -0700
Subject: [PATCH 1/2] Fix bootstrapping on episode timeout

---
 python/trainer_config.yaml                 |  6 +++++
 python/unitytrainers/bc/trainer.py         |  9 ++++---
 python/unitytrainers/ppo/trainer.py        | 29 +++++++++++++---------
 python/unitytrainers/trainer.py            |  5 ++--
 python/unitytrainers/trainer_controller.py |  8 +++---
 5 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/python/trainer_config.yaml b/python/trainer_config.yaml
index e7625e7e06..e473f49673 100644
--- a/python/trainer_config.yaml
+++ b/python/trainer_config.yaml
@@ -80,6 +80,12 @@ GoalieBrain:
 
 Ball3DBrain:
     normalize: true
+    batch_size: 1200
+    buffer_size: 12000
+    summary_freq: 1000
+    time_horizon: 1000
+    gamma: 0.995
+    beta: 0.001
 
 BouncerBrain:
     normalize: true

diff --git a/python/unitytrainers/bc/trainer.py b/python/unitytrainers/bc/trainer.py
index b280efbaaf..4b37234c42 100755
--- a/python/unitytrainers/bc/trainer.py
+++ b/python/unitytrainers/bc/trainer.py
@@ -229,13 +229,14 @@ def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take
                 self.episode_steps[agent_id] = 0
             self.episode_steps[agent_id] += 1
 
-    def process_experiences(self, info: AllBrainInfo):
+    def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo):
         """
         Checks agent histories for processing condition, and processes them as necessary.
         Processing involves calculating value and advantage targets for model updating step.
-        :param info: Current AllBrainInfo
+        :param current_info: Current AllBrainInfo
+        :param next_info: Next AllBrainInfo
         """
-        info_teacher = info[self.brain_to_imitate]
+        info_teacher = current_info[self.brain_to_imitate]
         for l in range(len(info_teacher.agents)):
             if ((info_teacher.local_done[l] or
                  len(self.training_buffer[info_teacher.agents[l]]['actions']) > self.trainer_parameters[
@@ -246,7 +247,7 @@ def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInf
                                                        training_length=self.sequence_length)
                 self.training_buffer[agent_id].reset_agent()
 
-        info_student = info[self.brain_name]
+        info_student = current_info[self.brain_name]
         for l in range(len(info_student.agents)):
             if info_student.local_done[l]:
                 agent_id = info_student.agents[l]

diff --git a/python/unitytrainers/ppo/trainer.py b/python/unitytrainers/ppo/trainer.py
index 401ed9c891..4ae807efec 100755
--- a/python/unitytrainers/ppo/trainer.py
+++ b/python/unitytrainers/ppo/trainer.py
@@ -260,15 +260,16 @@ def add_experiences(self, curr_all_info: AllBrainInfo, next_all_info: AllBrainIn
                 self.episode_steps[agent_id] = 0
             self.episode_steps[agent_id] += 1
 
-
-    def process_experiences(self, all_info: AllBrainInfo):
+    def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo):
         """
         Checks agent histories for processing condition, and processes them as necessary.
         Processing involves calculating value and advantage targets for model updating step.
 
-        :param all_info: Dictionary of all current brains and corresponding BrainInfo.
+        :param current_info: Dictionary of all current brains and corresponding BrainInfo.
+        :param new_info: Dictionary of all next brains and corresponding BrainInfo.
         """
-        info = all_info[self.brain_name]
+        info = new_info[self.brain_name]
+        last_info = current_info[self.brain_name]
         for l in range(len(info.agents)):
             agent_actions = self.training_buffer[info.agents[l]]['actions']
             if ((info.local_done[l] or len(agent_actions) > self.trainer_parameters['time_horizon'])
@@ -276,18 +277,22 @@
                 if info.local_done[l] and not info.max_reached[l]:
                     value_next = 0.0
                 else:
-                    feed_dict = {self.model.batch_size: len(info.vector_observations), self.model.sequence_length: 1}
+                    if info.max_reached[l]:
+                        bootstrapping_info = last_info
+                    else:
+                        bootstrapping_info = info
+                    feed_dict = {self.model.batch_size: len(bootstrapping_info.vector_observations), self.model.sequence_length: 1}
                     if self.use_observations:
-                        for i in range(len(info.visual_observations)):
-                            feed_dict[self.model.visual_in[i]] = info.visual_observations[i]
+                        for i in range(len(bootstrapping_info.visual_observations)):
+                            feed_dict[self.model.visual_in[i]] = bootstrapping_info.visual_observations[i]
                     if self.use_states:
-                        feed_dict[self.model.vector_in] = info.vector_observations
+                        feed_dict[self.model.vector_in] = bootstrapping_info.vector_observations
                     if self.use_recurrent:
-                        if info.memories.shape[1] == 0:
-                            info.memories = np.zeros((len(info.vector_observations), self.m_size))
-                        feed_dict[self.model.memory_in] = info.memories
+                        if bootstrapping_info.memories.shape[1] == 0:
+                            bootstrapping_info.memories = np.zeros((len(bootstrapping_info.vector_observations), self.m_size))
+                        feed_dict[self.model.memory_in] = bootstrapping_info.memories
                     if not self.is_continuous and self.use_recurrent:
-                        feed_dict[self.model.prev_action] = np.reshape(info.previous_vector_actions, [-1])
+                        feed_dict[self.model.prev_action] = np.reshape(bootstrapping_info.previous_vector_actions, [-1])
                     value_next = self.sess.run(self.model.value, feed_dict)[l]
                 agent_id = info.agents[l]
 

diff --git a/python/unitytrainers/trainer.py b/python/unitytrainers/trainer.py
index fb3aad638a..a6e97b836c 100755
--- a/python/unitytrainers/trainer.py
+++ b/python/unitytrainers/trainer.py
@@ -103,11 +103,12 @@ def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take
         """
         raise UnityTrainerException("The add_experiences method was not implemented.")
 
-    def process_experiences(self, info: AllBrainInfo):
+    def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo):
         """
         Checks agent histories for processing condition, and processes them as necessary.
         Processing involves calculating value and advantage targets for model updating step.
-        :param info: Dictionary of all current brains and corresponding BrainInfo.
+        :param current_info: Dictionary of all current-step brains and corresponding BrainInfo.
+        :param next_info: Dictionary of all next-step brains and corresponding BrainInfo.
         """
         raise UnityTrainerException("The process_experiences method was not implemented.")
 

diff --git a/python/unitytrainers/trainer_controller.py b/python/unitytrainers/trainer_controller.py
index 546ea3512c..984d321b80 100644
--- a/python/unitytrainers/trainer_controller.py
+++ b/python/unitytrainers/trainer_controller.py
@@ -250,13 +250,11 @@ def start_learning(self):
 
                 for brain_name, trainer in self.trainers.items():
                     trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name])
-                curr_info = new_info
-                for brain_name, trainer in self.trainers.items():
-                    trainer.process_experiences(curr_info)
+                    trainer.process_experiences(curr_info, new_info)
                     if trainer.is_ready_update() and self.train_model and trainer.get_step <= trainer.get_max_steps:
                         # Perform gradient descent with experience buffer
                         trainer.update_model()
-                    # Write training statistics to tensorboard.
+                    # Write training statistics to Tensorboard.
                     trainer.write_summary(self.env.curriculum.lesson_number)
                     if self.train_model and trainer.get_step <= trainer.get_max_steps:
                         trainer.increment_step()
@@ -266,7 +264,7 @@
                 if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
                     # Save Tensorflow model
                     self._save_model(sess, steps=global_step, saver=saver)
-
+                curr_info = new_info
             # Final save Tensorflow model
             if global_step != 0 and self.train_model:
                 self._save_model(sess, steps=global_step, saver=saver)

From cb67bd22cfd6ba36258ae6fdb87cae3b4247cb18 Mon Sep 17 00:00:00 2001
From: Arthur Juliani
Date: Tue, 3 Apr 2018 17:43:54 -0700
Subject: [PATCH 2/2] Use correct BrainInfo in behavioral cloning

---
 python/unitytrainers/bc/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/unitytrainers/bc/trainer.py b/python/unitytrainers/bc/trainer.py
index 4b37234c42..bf4f4ba4f6 100755
--- a/python/unitytrainers/bc/trainer.py
+++ b/python/unitytrainers/bc/trainer.py
@@ -236,7 +236,7 @@ def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInf
         :param current_info: Current AllBrainInfo
         :param next_info: Next AllBrainInfo
         """
-        info_teacher = current_info[self.brain_to_imitate]
+        info_teacher = next_info[self.brain_to_imitate]
         for l in range(len(info_teacher.agents)):
             if ((info_teacher.local_done[l] or
                  len(self.training_buffer[info_teacher.agents[l]]['actions']) > self.trainer_parameters[
@@ -247,7 +247,7 @@ def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInf
                                                        training_length=self.sequence_length)
                 self.training_buffer[agent_id].reset_agent()
 
-        info_student = current_info[self.brain_name]
+        info_student = next_info[self.brain_name]
        for l in range(len(info_student.agents)):
             if info_student.local_done[l]:
                 agent_id = info_student.agents[l]