Commit 7b800d1

Author: Ervin T

[bug-fix] When agent isn't training, don't clear update buffer (#5205)

* Don't clear update buffer, but don't append to it either
* Update changelog
* Address comments
* Make experience replay buffer saving more verbose

(cherry picked from commit 63e7ad4)

1 parent 2aaf326, commit 7b800d1
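In short: the trainer used to clear its update buffer whenever it was no longer training, which also threw away the experiences SAC needs for `save_replay_buffer`; it now simply stops appending instead, so the buffer neither grows without bound nor loses what it already holds. A minimal, self-contained sketch of that pattern (a toy stand-in, not ml-agents' actual `RLTrainer`):

```python
# Toy illustration only: a stand-in trainer, not ml-agents' RLTrainer.
from typing import List


class ToyTrainer:
    def __init__(self, max_steps: int) -> None:
        self.max_steps = max_steps
        self.step = 0
        self.update_buffer: List[int] = []  # stands in for the AgentBuffer

    @property
    def should_still_train(self) -> bool:
        return self.step < self.max_steps

    def process_trajectory(self, experiences: List[int]) -> None:
        # New behavior: append only while still training; never clear, so the
        # collected experiences remain available to be saved at the end of the run.
        # (The old behavior cleared the buffer once training stopped.)
        if self.should_still_train:
            self.update_buffer.extend(experiences)
        self.step += len(experiences)


trainer = ToyTrainer(max_steps=20)
for _ in range(5):  # the run keeps producing trajectories past max_steps
    trainer.process_trajectory(list(range(10)))

assert len(trainer.update_buffer) == 20  # growth stops at max_steps, contents survive
```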

File tree: 6 files changed, +63, -19 lines changed


com.unity.ml-agents/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -6,6 +6,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to
 [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

+## [1.9.1-preview]
+### Bug Fixes
+#### ml-agents / ml-agents-envs / gym-unity (Python)
+- Fixed a bug where the SAC replay buffer would not be saved out at the end of a run, even if `save_replay_buffer` was enabled. (#5205)
+
 ## [1.9.0-preview] - 2021-03-17
 ### Major Changes
 #### com.unity.ml-agents (C#)
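For context, the `save_replay_buffer` option mentioned in the entry above is an SAC setting in the trainer configuration; the bug only manifests when it is enabled. A hedged sketch of where it sits, written as the Python equivalent of the usual YAML (the key layout and behavior name are assumptions based on the standard ml-agents config schema, not part of this diff):

```python
# Assumed layout of the trainer config (normally written as YAML); only
# save_replay_buffer is relevant to the fix above.
sac_config = {
    "behaviors": {
        "MyBehavior": {  # hypothetical behavior name
            "trainer_type": "sac",
            "hyperparameters": {
                # write last_replay_buffer.hdf5 at the end of the run so a
                # resumed run can load it back in
                "save_replay_buffer": True,
            },
        }
    }
}
```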

ml-agents/mlagents/trainers/poca/trainer.py

Lines changed: 1 addition & 4 deletions
@@ -166,10 +166,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
         )
         agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)

-        # Append to update buffer
-        agent_buffer_trajectory.resequence_and_append(
-            self.update_buffer, training_length=self.policy.sequence_length
-        )
+        self._append_to_update_buffer(agent_buffer_trajectory)

         # If this was a terminal trajectory, append stats and reset reward collection
         if trajectory.done_reached:

ml-agents/mlagents/trainers/ppo/trainer.py

Lines changed: 2 additions & 4 deletions
@@ -149,10 +149,8 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
         global_returns = list(np.mean(np.array(tmp_returns, dtype=np.float32), axis=0))
         agent_buffer_trajectory[BufferKey.ADVANTAGES].set(global_advantages)
         agent_buffer_trajectory[BufferKey.DISCOUNTED_RETURNS].set(global_returns)
-        # Append to update buffer
-        agent_buffer_trajectory.resequence_and_append(
-            self.update_buffer, training_length=self.policy.sequence_length
-        )
+
+        self._append_to_update_buffer(agent_buffer_trajectory)

         # If this was a terminal trajectory, append stats and reset reward collection
         if trajectory.done_reached:

ml-agents/mlagents/trainers/sac/trainer.py

Lines changed: 5 additions & 5 deletions
@@ -104,9 +104,12 @@ def save_replay_buffer(self) -> None:
         Save the training buffer's update buffer to a pickle file.
         """
         filename = os.path.join(self.artifact_path, "last_replay_buffer.hdf5")
-        logger.info(f"Saving Experience Replay Buffer to {filename}")
+        logger.info(f"Saving Experience Replay Buffer to {filename}...")
         with open(filename, "wb") as file_object:
             self.update_buffer.save_to_file(file_object)
+        logger.info(
+            f"Saved Experience Replay Buffer ({os.path.getsize(filename)} bytes)."
+        )

     def load_replay_buffer(self) -> None:
         """
@@ -175,10 +178,7 @@ def _process_trajectory(self, trajectory: Trajectory) -> None:
                 agent_buffer_trajectory[ObsUtil.get_name_at_next(i)][-1] = obs
             agent_buffer_trajectory[BufferKey.DONE][-1] = False

-        # Append to update buffer
-        agent_buffer_trajectory.resequence_and_append(
-            self.update_buffer, training_length=self.policy.sequence_length
-        )
+        self._append_to_update_buffer(agent_buffer_trajectory)

         if trajectory.done_reached:
             self._update_end_episode_stats(agent_id, self.optimizer)
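Besides routing the append through the new helper, this file's other change makes the save path log the size of the written file. The file written here is what `load_replay_buffer` (visible as context above) reads back when a run is resumed. A hedged sketch of the same round trip, using the `AgentBuffer.save_to_file` / `load_from_file` calls that appear in this hunk; the buffer field and the temporary path are arbitrary choices for the example:

```python
# Sketch: round-trip an update buffer the way the SAC trainer does above.
# Requires ml-agents installed; the field used here is just an example.
import os
import tempfile

from mlagents.trainers.buffer import AgentBuffer, BufferKey

buffer = AgentBuffer()
buffer[BufferKey.ENVIRONMENT_REWARDS].extend([1.0, 0.5, -0.1])

path = os.path.join(tempfile.mkdtemp(), "last_replay_buffer.hdf5")
with open(path, "wb") as f:
    buffer.save_to_file(f)
print(f"Saved {os.path.getsize(path)} bytes")  # mirrors the new, more verbose log line

restored = AgentBuffer()
with open(path, "rb") as f:
    restored.load_from_file(f)
assert restored.num_experiences == 3
```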

ml-agents/mlagents/trainers/tests/test_rl_trainer.py

Lines changed: 35 additions & 4 deletions
@@ -77,8 +77,7 @@ def test_clear_update_buffer():


 @mock.patch("mlagents.trainers.trainer.trainer.Trainer.save_model")
-@mock.patch("mlagents.trainers.trainer.rl_trainer.RLTrainer._clear_update_buffer")
-def test_advance(mocked_clear_update_buffer, mocked_save_model):
+def test_advance(mocked_save_model):
     trainer = create_rl_trainer()
     mock_policy = mock.Mock()
     trainer.add_policy("TestBrain", mock_policy)
@@ -115,9 +114,8 @@ def test_advance(mocked_clear_update_buffer, mocked_save_model):
     with pytest.raises(AgentManagerQueue.Empty):
         policy_queue.get_nowait()

-    # Check that the buffer has been cleared
+    # Check that no model has been saved
     assert not trainer.should_still_train
-    assert mocked_clear_update_buffer.call_count > 0
     assert mocked_save_model.call_count == 0


@@ -181,6 +179,39 @@ def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
     mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)


+def test_update_buffer_append():
+    trainer = create_rl_trainer()
+    mock_policy = mock.Mock()
+    trainer.add_policy("TestBrain", mock_policy)
+    trajectory_queue = AgentManagerQueue("testbrain")
+    policy_queue = AgentManagerQueue("testbrain")
+    trainer.subscribe_trajectory_queue(trajectory_queue)
+    trainer.publish_policy_queue(policy_queue)
+    time_horizon = 10
+    trajectory = mb.make_fake_trajectory(
+        length=time_horizon,
+        observation_specs=create_observation_specs_with_shapes([(1,)]),
+        max_step_complete=True,
+        action_spec=ActionSpec.create_discrete((2,)),
+    )
+    agentbuffer_trajectory = trajectory.to_agentbuffer()
+    assert trainer.update_buffer.num_experiences == 0
+
+    # Check that if we append, our update buffer gets longer.
+    # max_steps = 100
+    for i in range(10):
+        trainer._process_trajectory(trajectory)
+        trainer._append_to_update_buffer(agentbuffer_trajectory)
+        assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon
+
+    # Check that if we append after stopping training, nothing happens.
+    # We process enough trajectories to hit max steps
+    trainer.set_is_policy_updating(False)
+    trainer._process_trajectory(trajectory)
+    trainer._append_to_update_buffer(agentbuffer_trajectory)
+    assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon
+
+
 class RLTrainerWarningTest(unittest.TestCase):
     def test_warning_group_reward(self):
         with self.assertLogs("mlagents.trainers", level="WARN") as cm:
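To run only the new test locally, a standard pytest node selection such as `pytest ml-agents/mlagents/trainers/tests/test_rl_trainer.py::test_update_buffer_append` should work from the repository root (the exact invocation depends on how the packages are installed in your environment).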

ml-agents/mlagents/trainers/trainer/rl_trainer.py

Lines changed: 15 additions & 2 deletions
@@ -245,6 +245,21 @@ def _maybe_write_summary(self, step_after_process: int) -> None:
         if step_after_process >= self._next_summary_step and self.get_step != 0:
             self._write_summary(self._next_summary_step)

+    def _append_to_update_buffer(self, agentbuffer_trajectory: AgentBuffer) -> None:
+        """
+        Append an AgentBuffer to the update buffer. If the trainer isn't training,
+        don't update to avoid a memory leak.
+        """
+        if self.should_still_train:
+            seq_len = (
+                self.trainer_settings.network_settings.memory.sequence_length
+                if self.trainer_settings.network_settings.memory is not None
+                else 1
+            )
+            agentbuffer_trajectory.resequence_and_append(
+                self.update_buffer, training_length=seq_len
+            )
+
     def _maybe_save_model(self, step_after_process: int) -> None:
         """
         If processing the trajectory will make the step exceed the next model write,
@@ -298,5 +313,3 @@ def advance(self) -> None:
                         for q in self.policy_queues:
                             # Get policies that correspond to the policy queue in question
                             q.put(self.get_policy(q.behavior_id))
-        else:
-            self._clear_update_buffer()
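One detail worth noting: the removed call sites passed `training_length=self.policy.sequence_length`, while the shared helper now derives the sequence length from `trainer_settings.network_settings.memory`, falling back to 1 when no recurrent memory is configured. A small sketch of that selection (assuming the `NetworkSettings`/`MemorySettings` dataclasses from `mlagents.trainers.settings`; the concrete values are just examples):

```python
# Sketch of the seq_len selection used by the new helper. Assumes the
# NetworkSettings / MemorySettings classes from mlagents.trainers.settings.
from mlagents.trainers.settings import NetworkSettings


def training_length(network_settings: NetworkSettings) -> int:
    # Recurrent policies are resequenced into fixed-length chunks; everything
    # else is appended with a training length of 1.
    return (
        network_settings.memory.sequence_length
        if network_settings.memory is not None
        else 1
    )


recurrent = NetworkSettings(memory=NetworkSettings.MemorySettings(sequence_length=32))
feed_forward = NetworkSettings()  # memory defaults to None

assert training_length(recurrent) == 32
assert training_length(feed_forward) == 1
```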
