From ab63caf2df2cb15b772b954d895795da7e8cd540 Mon Sep 17 00:00:00 2001
From: Arthur Juliani
Date: Tue, 3 Apr 2018 17:33:43 -0700
Subject: [PATCH 1/2] Fix bootstrapping on episode timeout

---
 python/trainer_config.yaml                 |  6 +++++
 python/unitytrainers/bc/trainer.py         |  9 ++++---
 python/unitytrainers/ppo/trainer.py        | 29 +++++++++++++---------
 python/unitytrainers/trainer.py            |  5 ++--
 python/unitytrainers/trainer_controller.py |  8 +++---
 5 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/python/trainer_config.yaml b/python/trainer_config.yaml
index e7625e7e06..e473f49673 100644
--- a/python/trainer_config.yaml
+++ b/python/trainer_config.yaml
@@ -80,6 +80,12 @@ GoalieBrain:
 
 Ball3DBrain:
     normalize: true
+    batch_size: 1200
+    buffer_size: 12000
+    summary_freq: 1000
+    time_horizon: 1000
+    gamma: 0.995
+    beta: 0.001
 
 BouncerBrain:
     normalize: true

diff --git a/python/unitytrainers/bc/trainer.py b/python/unitytrainers/bc/trainer.py
index b280efbaaf..4b37234c42 100755
--- a/python/unitytrainers/bc/trainer.py
+++ b/python/unitytrainers/bc/trainer.py
@@ -229,13 +229,14 @@ def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take
                 self.episode_steps[agent_id] = 0
             self.episode_steps[agent_id] += 1
 
-    def process_experiences(self, info: AllBrainInfo):
+    def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo):
         """
         Checks agent histories for processing condition, and processes them as necessary.
         Processing involves calculating value and advantage targets for model updating step.
-        :param info: Current AllBrainInfo
+        :param current_info: Current AllBrainInfo
+        :param next_info: Next AllBrainInfo
         """
-        info_teacher = info[self.brain_to_imitate]
+        info_teacher = current_info[self.brain_to_imitate]
         for l in range(len(info_teacher.agents)):
             if ((info_teacher.local_done[l] or
                  len(self.training_buffer[info_teacher.agents[l]]['actions']) > self.trainer_parameters[
@@ -246,7 +247,7 @@ def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInf
                                                        training_length=self.sequence_length)
                 self.training_buffer[agent_id].reset_agent()
 
-        info_student = info[self.brain_name]
+        info_student = current_info[self.brain_name]
         for l in range(len(info_student.agents)):
             if info_student.local_done[l]:
                 agent_id = info_student.agents[l]

diff --git a/python/unitytrainers/ppo/trainer.py b/python/unitytrainers/ppo/trainer.py
index 401ed9c891..4ae807efec 100755
--- a/python/unitytrainers/ppo/trainer.py
+++ b/python/unitytrainers/ppo/trainer.py
@@ -260,15 +260,16 @@ def add_experiences(self, curr_all_info: AllBrainInfo, next_all_info: AllBrainIn
                 self.episode_steps[agent_id] = 0
             self.episode_steps[agent_id] += 1
 
-
-    def process_experiences(self, all_info: AllBrainInfo):
+    def process_experiences(self, current_info: AllBrainInfo, new_info: AllBrainInfo):
         """
         Checks agent histories for processing condition, and processes them as necessary.
         Processing involves calculating value and advantage targets for model updating step.
 
-        :param all_info: Dictionary of all current brains and corresponding BrainInfo.
+        :param current_info: Dictionary of all current brains and corresponding BrainInfo.
+        :param new_info: Dictionary of all next brains and corresponding BrainInfo.
         """
-        info = all_info[self.brain_name]
+        info = new_info[self.brain_name]
+        last_info = current_info[self.brain_name]
         for l in range(len(info.agents)):
             agent_actions = self.training_buffer[info.agents[l]]['actions']
             if ((info.local_done[l] or len(agent_actions) > self.trainer_parameters['time_horizon'])
@@ -276,18 +277,22 @@
                 if info.local_done[l] and not info.max_reached[l]:
                     value_next = 0.0
                 else:
-                    feed_dict = {self.model.batch_size: len(info.vector_observations), self.model.sequence_length: 1}
+                    if info.max_reached[l]:
+                        bootstrapping_info = last_info
+                    else:
+                        bootstrapping_info = info
+                    feed_dict = {self.model.batch_size: len(bootstrapping_info.vector_observations), self.model.sequence_length: 1}
                     if self.use_observations:
-                        for i in range(len(info.visual_observations)):
-                            feed_dict[self.model.visual_in[i]] = info.visual_observations[i]
+                        for i in range(len(bootstrapping_info.visual_observations)):
+                            feed_dict[self.model.visual_in[i]] = bootstrapping_info.visual_observations[i]
                     if self.use_states:
-                        feed_dict[self.model.vector_in] = info.vector_observations
+                        feed_dict[self.model.vector_in] = bootstrapping_info.vector_observations
                     if self.use_recurrent:
-                        if info.memories.shape[1] == 0:
-                            info.memories = np.zeros((len(info.vector_observations), self.m_size))
-                        feed_dict[self.model.memory_in] = info.memories
+                        if bootstrapping_info.memories.shape[1] == 0:
+                            bootstrapping_info.memories = np.zeros((len(bootstrapping_info.vector_observations), self.m_size))
+                        feed_dict[self.model.memory_in] = bootstrapping_info.memories
                     if not self.is_continuous and self.use_recurrent:
-                        feed_dict[self.model.prev_action] = np.reshape(info.previous_vector_actions, [-1])
+                        feed_dict[self.model.prev_action] = np.reshape(bootstrapping_info.previous_vector_actions, [-1])
                     value_next = self.sess.run(self.model.value, feed_dict)[l]
                 agent_id = info.agents[l]
 

diff --git a/python/unitytrainers/trainer.py b/python/unitytrainers/trainer.py
index fb3aad638a..a6e97b836c 100755
--- a/python/unitytrainers/trainer.py
+++ b/python/unitytrainers/trainer.py
@@ -103,11 +103,12 @@ def add_experiences(self, curr_info: AllBrainInfo, next_info: AllBrainInfo, take
         """
         raise UnityTrainerException("The add_experiences method was not implemented.")
 
-    def process_experiences(self, info: AllBrainInfo):
+    def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInfo):
         """
         Checks agent histories for processing condition, and processes them as necessary.
         Processing involves calculating value and advantage targets for model updating step.
-        :param info: Dictionary of all current brains and corresponding BrainInfo.
+        :param current_info: Dictionary of all current-step brains and corresponding BrainInfo.
+        :param next_info: Dictionary of all next-step brains and corresponding BrainInfo.
         """
         raise UnityTrainerException("The process_experiences method was not implemented.")
 

diff --git a/python/unitytrainers/trainer_controller.py b/python/unitytrainers/trainer_controller.py
index 546ea3512c..984d321b80 100644
--- a/python/unitytrainers/trainer_controller.py
+++ b/python/unitytrainers/trainer_controller.py
@@ -250,13 +250,11 @@ def start_learning(self):
 
                 for brain_name, trainer in self.trainers.items():
                     trainer.add_experiences(curr_info, new_info, take_action_outputs[brain_name])
-                curr_info = new_info
-                for brain_name, trainer in self.trainers.items():
-                    trainer.process_experiences(curr_info)
+                    trainer.process_experiences(curr_info, new_info)
                     if trainer.is_ready_update() and self.train_model and trainer.get_step <= trainer.get_max_steps:
                         # Perform gradient descent with experience buffer
                         trainer.update_model()
-                    # Write training statistics to tensorboard.
+                    # Write training statistics to Tensorboard.
                     trainer.write_summary(self.env.curriculum.lesson_number)
                     if self.train_model and trainer.get_step <= trainer.get_max_steps:
                         trainer.increment_step()
@@ -266,7 +264,7 @@
                 if global_step % self.save_freq == 0 and global_step != 0 and self.train_model:
                     # Save Tensorflow model
                     self._save_model(sess, steps=global_step, saver=saver)
-
+                curr_info = new_info
             # Final save Tensorflow model
             if global_step != 0 and self.train_model:
                 self._save_model(sess, steps=global_step, saver=saver)

From cb67bd22cfd6ba36258ae6fdb87cae3b4247cb18 Mon Sep 17 00:00:00 2001
From: Arthur Juliani
Date: Tue, 3 Apr 2018 17:43:54 -0700
Subject: [PATCH 2/2] Use correct BrainInfo in behavioral cloning

---
 python/unitytrainers/bc/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/unitytrainers/bc/trainer.py b/python/unitytrainers/bc/trainer.py
index 4b37234c42..bf4f4ba4f6 100755
--- a/python/unitytrainers/bc/trainer.py
+++ b/python/unitytrainers/bc/trainer.py
@@ -236,7 +236,7 @@ def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInf
         :param current_info: Current AllBrainInfo
         :param next_info: Next AllBrainInfo
         """
-        info_teacher = current_info[self.brain_to_imitate]
+        info_teacher = next_info[self.brain_to_imitate]
         for l in range(len(info_teacher.agents)):
             if ((info_teacher.local_done[l] or
                  len(self.training_buffer[info_teacher.agents[l]]['actions']) > self.trainer_parameters[
@@ -247,7 +247,7 @@ def process_experiences(self, current_info: AllBrainInfo, next_info: AllBrainInf
                                                        training_length=self.sequence_length)
                 self.training_buffer[agent_id].reset_agent()
 
-        info_student = current_info[self.brain_name]
+        info_student = next_info[self.brain_name]
        for l in range(len(info_student.agents)):
             if info_student.local_done[l]:
                 agent_id = info_student.agents[l]