@@ -64,6 +64,7 @@ def __init__(
         act_size: List[int],
         reparameterize: bool = False,
         tanh_squash: bool = False,
+        condition_sigma: bool = True,
         log_sigma_min: float = -20,
         log_sigma_max: float = 2,
     ):
@@ -79,7 +80,11 @@ def __init__(
         :param log_sigma_max: Maximum log standard deviation to clip by.
         """
         encoded = self._create_mu_log_sigma(
-            logits, act_size, log_sigma_min, log_sigma_max
+            logits,
+            act_size,
+            log_sigma_min,
+            log_sigma_max,
+            condition_sigma=condition_sigma,
         )
         self._sampled_policy = self._create_sampled_policy(encoded)
         if not reparameterize:
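
For orientation, a minimal call-site sketch of the new flag. Everything except the constructor arguments visible in this diff is an assumption: the `encoded_obs` placeholder, its shape, and the scope names are hypothetical, and `logits` is assumed to be the first positional argument, as the call inside `__init__` suggests.

```python
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

# GaussianDistribution is the class this diff modifies; import it from that module.
# Hypothetical encoder output feeding the distribution (shape and name assumed).
encoded_obs = tf.placeholder(tf.float32, shape=[None, 128], name="encoded_obs")

# Default behaviour is unchanged: sigma is conditioned on the policy's hidden state.
with tf.variable_scope("conditioned"):
    dist = GaussianDistribution(encoded_obs, act_size=[2], tanh_squash=True)

# Opting out: a single learned log_std shared across all observations.
with tf.variable_scope("unconditioned"):
    fixed_sigma_dist = GaussianDistribution(
        encoded_obs, act_size=[2], tanh_squash=True, condition_sigma=False
    )
```

The separate variable scopes are only there so the two sketch instances do not collide on the `mu` and `log_std` variable names.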
@@ -101,6 +106,7 @@ def _create_mu_log_sigma(
         act_size: List[int],
         log_sigma_min: float,
         log_sigma_max: float,
+        condition_sigma: bool,
     ) -> "GaussianDistribution.MuSigmaTensors":

         mu = tf.layers.dense(
@@ -112,14 +118,22 @@ def _create_mu_log_sigma(
             reuse=tf.AUTO_REUSE,
         )

-        # Policy-dependent log_sigma_sq
-        log_sigma = tf.layers.dense(
-            logits,
-            act_size[0],
-            activation=None,
-            name="log_std",
-            kernel_initializer=ModelUtils.scaled_init(0.01),
-        )
+        if condition_sigma:
+            # Policy-dependent log_sigma_sq
+            log_sigma = tf.layers.dense(
+                logits,
+                act_size[0],
+                activation=None,
+                name="log_std",
+                kernel_initializer=ModelUtils.scaled_init(0.01),
+            )
+        else:
+            log_sigma = tf.get_variable(
+                "log_std",
+                [act_size[0]],
+                dtype=tf.float32,
+                initializer=tf.zeros_initializer(),
+            )
         log_sigma = tf.clip_by_value(log_sigma, log_sigma_min, log_sigma_max)
         sigma = tf.exp(log_sigma)
         return self.MuSigmaTensors(mu, log_sigma, sigma)
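
The new branch chooses between a state-conditioned standard deviation and a single trainable one. A rough standalone sketch of that distinction, outside the class (the `encoded` placeholder, its shape, and the scope names are hypothetical):

```python
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

act_dim = 2
encoded = tf.placeholder(tf.float32, shape=[None, 64], name="encoded")

with tf.variable_scope("conditioned"):
    # condition_sigma=True: one log_std per sample, predicted from the hidden state.
    log_sigma_per_state = tf.layers.dense(encoded, act_dim, activation=None, name="log_std")
    # shape [batch_size, act_dim]

with tf.variable_scope("unconditioned"):
    # condition_sigma=False: one trainable log_std per action dimension,
    # shared across all observations (in the class it broadcasts against the per-sample mu).
    log_sigma_shared = tf.get_variable(
        "log_std", [act_dim], dtype=tf.float32, initializer=tf.zeros_initializer()
    )
    # shape [act_dim]
```

With `tf.zeros_initializer()`, the unconditioned path starts at sigma = exp(0) = 1 before the clip to [log_sigma_min, log_sigma_max] is applied.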
@@ -155,8 +169,8 @@ def _do_squash_correction_for_tanh(self, probs, squashed_policy):
         """
         Adjust probabilities for squashed sample before output
         """
-        probs -= tf.log(1 - squashed_policy ** 2 + EPSILON)
-        return probs
+        adjusted_probs = probs - tf.log(1 - squashed_policy ** 2 + EPSILON)
+        return adjusted_probs

     @property
     def total_log_probs(self) -> tf.Tensor:
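
For reference, the quantity this helper subtracts per action dimension is the standard change-of-variables correction for a tanh-squashed Gaussian (the same correction used in SAC-style policies); the behavioural change in this hunk is only that the caller's `probs` tensor is no longer modified in place:

$$
\log \pi(a_i \mid s) \;=\; \log \mu(u_i \mid s) \;-\; \log\bigl(1 - \tanh^{2}(u_i) + \epsilon\bigr),
\qquad a_i = \tanh(u_i),
$$

where $\epsilon$ is the module's `EPSILON` constant and any summation over action dimensions is presumably handled elsewhere in the class, for example in the `total_log_probs` property whose signature appears in the context above.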