
Rainbow #344

Closed
wants to merge 312 commits
Changes from all commits
312 commits
4c0f535
a
seann999 Aug 21, 2018
8adb23b
wow
seann999 Aug 21, 2018
c5d5957
a
seann999 Aug 21, 2018
e2565b2
a
seann999 Aug 22, 2018
6e9ed4c
a
seann999 Aug 22, 2018
e8e5583
a
seann999 Aug 22, 2018
43b240e
a
seann999 Aug 22, 2018
201c419
a
seann999 Aug 23, 2018
11704de
a
seann999 Aug 23, 2018
b95a89b
a
seann999 Aug 23, 2018
5c678ad
a
seann999 Aug 23, 2018
aea19d2
a
seann999 Aug 23, 2018
1c8e164
a
seann999 Aug 23, 2018
420e27b
myseq
seann999 Aug 23, 2018
d4a871c
a
seann999 Aug 23, 2018
ace9796
a
seann999 Aug 23, 2018
438fcc1
a
seann999 Aug 23, 2018
3a9608d
log
seann999 Aug 23, 2018
3e481e7
spec
seann999 Aug 23, 2018
45dcd19
specup
seann999 Aug 24, 2018
bbf297c
a
seann999 Aug 24, 2018
ad4cb31
a
seann999 Aug 26, 2018
292d3b7
a
seann999 Aug 26, 2018
cb5117a
a
seann999 Aug 26, 2018
9581771
a
seann999 Aug 26, 2018
7b1908c
a
seann999 Aug 27, 2018
0f7b273
a
seann999 Aug 27, 2018
42f913b
a
seann999 Aug 27, 2018
512d31e
a
seann999 Aug 29, 2018
d3fa393
a
seann999 Aug 30, 2018
1fc953d
a
seann999 Sep 3, 2018
18555cf
a
seann999 Sep 3, 2018
264a3fd
a
seann999 Sep 3, 2018
643666c
a
seann999 Sep 3, 2018
45f7fd6
a
seann999 Sep 3, 2018
0653ce6
a
seann999 Sep 3, 2018
438a082
fix
seann999 Sep 4, 2018
48871f3
fix
seann999 Sep 4, 2018
4ea1ca1
a
seann999 Sep 4, 2018
089481e
a
seann999 Sep 10, 2018
5ce42e4
yoo
seann999 Sep 10, 2018
337da8b
yoo
seann999 Sep 10, 2018
dbec0a0
a
seann999 Sep 10, 2018
4e99b0a
a
seann999 Sep 10, 2018
f139108
a
seann999 Sep 10, 2018
7d29428
a
seann999 Sep 11, 2018
3ba3903
a
seann999 Sep 11, 2018
b46ce2e
spec
seann999 Sep 11, 2018
9987597
a
seann999 Sep 11, 2018
b59c4a9
a
seann999 Sep 11, 2018
48281c1
a
seann999 Sep 11, 2018
3f93318
a
seann999 Sep 11, 2018
94d61ed
a
seann999 Sep 11, 2018
bc71682
a
seann999 Sep 12, 2018
7260ea8
huber
seann999 Sep 12, 2018
6fc7617
adds some functionality for n step transitions
prabhatnagarajan Sep 12, 2018
0908915
adds n step sampling from replay buffer
prabhatnagarajan Sep 12, 2018
9ee7bfa
fixes minor typo
prabhatnagarajan Sep 12, 2018
c8fffc0
a
seann999 Sep 12, 2018
9096b83
a
seann999 Sep 13, 2018
d5f303f
a
seann999 Sep 13, 2018
4d18d4d
adds num step returns argument
prabhatnagarajan Sep 14, 2018
a27abde
adds parseargs argument for number of steps to use in return value, p…
prabhatnagarajan Sep 14, 2018
864f252
a
seann999 Sep 16, 2018
b9e9a00
a
seann999 Sep 16, 2018
214be59
a
seann999 Sep 16, 2018
cab9965
a
seann999 Sep 16, 2018
ce21e7d
a
seann999 Sep 17, 2018
bee3d73
a
seann999 Sep 18, 2018
839e914
a
seann999 Sep 18, 2018
4b927a3
a
seann999 Sep 18, 2018
2d12b0e
adds nstep transition clipping before adding n transitions to replay …
prabhatnagarajan Sep 21, 2018
3ddb2e4
adds n step deep Q-learning, modifies batch_experiences to enable this
prabhatnagarajan Sep 21, 2018
f0e6b8f
fix
seann999 Sep 24, 2018
61cb175
a
seann999 Sep 24, 2018
1b4a7ab
a
seann999 Sep 24, 2018
ef94840
a
seann999 Sep 24, 2018
fe08a5d
a
seann999 Sep 24, 2018
663ca2c
a
seann999 Sep 24, 2018
24e2ee0
a
seann999 Sep 24, 2018
4df0ac3
a
seann999 Sep 25, 2018
2b37dde
a
seann999 Sep 25, 2018
736bfca
a
seann999 Sep 25, 2018
f987d53
a
seann999 Sep 25, 2018
f872f22
a
seann999 Sep 25, 2018
8aa9896
a
seann999 Sep 25, 2018
11e649e
a
seann999 Sep 25, 2018
8448258
a
seann999 Sep 25, 2018
ae9b2c7
a
seann999 Sep 25, 2018
6418af3
a
seann999 Sep 25, 2018
f9a4d95
a
seann999 Sep 25, 2018
fa4ba13
Merge branch 'master' into nstep
prabhatnagarajan Sep 26, 2018
cb63e76
adds SARSA agent to DQN example
prabhatnagarajan Sep 26, 2018
c15531b
Merge branch 'sarsa' into nstep
prabhatnagarajan Sep 26, 2018
55110d3
changes sarsa agent to new n-step format
prabhatnagarajan Sep 26, 2018
be5934e
converts several other agents to new exp_batch format
prabhatnagarajan Sep 27, 2018
7046629
added plots
seann999 Sep 27, 2018
fb707ee
spec
seann999 Sep 27, 2018
e71b92b
fixed bug
seann999 Sep 27, 2018
c591956
fixed bug
seann999 Sep 27, 2018
007f4ec
fixed bug
seann999 Sep 27, 2018
1c6134e
fix
seann999 Sep 28, 2018
edcb1f3
a
seann999 Sep 28, 2018
8eba91e
fix
seann999 Sep 28, 2018
be769b6
a
seann999 Sep 28, 2018
e011a2a
a
seann999 Sep 28, 2018
b48d217
a
seann999 Sep 28, 2018
cff0fb9
car
seann999 Sep 30, 2018
2fe528a
car
seann999 Sep 30, 2018
7395d7b
sets up stop current episode and makes dpp agent with new exp_batch
prabhatnagarajan Oct 1, 2018
484fee6
a
seann999 Oct 1, 2018
934717e
adds n steps to prioritized buffer and fixes merge conflicts
prabhatnagarajan Oct 1, 2018
4fd3f33
minor fixes
prabhatnagarajan Oct 1, 2018
f9af9fa
removes num_steps from init of prioritized replay buffer
prabhatnagarajan Oct 1, 2018
74ab311
a
seann999 Oct 1, 2018
d14cf3b
a
seann999 Oct 2, 2018
f15de04
makes fixes to prioritized replay buffer to be compatible with n-step…
prabhatnagarajan Oct 3, 2018
97f2f5d
idk
seann999 Oct 3, 2018
34bda1a
a
seann999 Oct 3, 2018
129ab91
a
seann999 Oct 3, 2018
7b94734
a
seann999 Oct 3, 2018
56d9aaf
a
seann999 Oct 3, 2018
416ed2c
a
seann999 Oct 4, 2018
e4fbf24
a
seann999 Oct 5, 2018
b887578
a
seann999 Oct 5, 2018
74f49e5
a
seann999 Oct 7, 2018
1293feb
a
seann999 Oct 8, 2018
1d85c29
a
seann999 Oct 8, 2018
4716009
a
seann999 Oct 8, 2018
ff929a5
a
seann999 Oct 9, 2018
936da1d
a
seann999 Oct 9, 2018
d972420
a
seann999 Oct 11, 2018
eecb049
a
seann999 Oct 13, 2018
a433257
fix
seann999 Oct 13, 2018
7ebb1a8
scaling
seann999 Oct 14, 2018
0835e70
a
seann999 Oct 14, 2018
1d13670
a
seann999 Oct 14, 2018
a766892
a
seann999 Oct 14, 2018
b3795e4
count
seann999 Oct 18, 2018
8d8f281
a
seann999 Oct 21, 2018
5cd8002
makes some changes to the tests for compatibility with nstep
prabhatnagarajan Oct 22, 2018
d92e9ff
Merge branch 'master' into nstep
prabhatnagarajan Oct 22, 2018
0e6b1f2
modifies two replay_buffer unit tests to accommodate new replay buffer
prabhatnagarajan Oct 23, 2018
96a64aa
addresses merge conflicts
prabhatnagarajan Oct 23, 2018
7ae0fd7
makes weights with replay buffer
prabhatnagarajan Oct 23, 2018
73d9768
fixes for adding weight to batch experiences
prabhatnagarajan Oct 23, 2018
4c75f83
changes weights back to original formulation with minor mods
prabhatnagarajan Oct 23, 2018
8d5f0c7
reverts to old style of sample method for prioritized replay
prabhatnagarajan Oct 24, 2018
980c662
fixes prioritized test
prabhatnagarajan Oct 24, 2018
6d379ff
modifies all replay_buffer tests to match new specs, everything passes
prabhatnagarajan Oct 24, 2018
74891e4
a
seann999 Oct 25, 2018
5e1e0eb
addresses a few flake issues
prabhatnagarajan Oct 26, 2018
936f341
more minor flake fixes
prabhatnagarajan Oct 26, 2018
db391ec
makes fix to episodic DQN to pass tests
prabhatnagarajan Oct 26, 2018
1be4d26
attempts testing code
prabhatnagarajan Oct 29, 2018
b31d5b1
adds n step toy domain tests
prabhatnagarajan Oct 30, 2018
a45be90
Merge branch 'master' into nstep
prabhatnagarajan Oct 31, 2018
8069723
a
seann999 Oct 31, 2018
040488e
Merge branch 'master' into nstep
prabhatnagarajan Oct 31, 2018
359befd
a
seann999 Oct 31, 2018
942df84
a
seann999 Nov 1, 2018
48e5f7f
a
seann999 Nov 1, 2018
706b6cd
nstep-ifies Categorical DQN
prabhatnagarajan Nov 1, 2018
21f7253
removes set trace
prabhatnagarajan Nov 1, 2018
a354dfd
fixes batch_experience call in pcl agent
prabhatnagarajan Nov 1, 2018
274f35f
a
seann999 Nov 1, 2018
d7fe4db
puts transitions in a list to be compatible with new batch experience…
prabhatnagarajan Nov 1, 2018
8b33168
makes another pcl fix
prabhatnagarajan Nov 1, 2018
1bf8932
mods ddpg code to us new batch_experiences
prabhatnagarajan Nov 1, 2018
166e889
a
seann999 Nov 1, 2018
f8b4735
a
seann999 Nov 2, 2018
451562d
a
seann999 Nov 2, 2018
f09e7d5
a
seann999 Nov 2, 2018
b69c160
a
seann999 Nov 2, 2018
b07c970
fixes flake issues
prabhatnagarajan Nov 2, 2018
630d80d
fixes error in prioritized arising from flake fix, fixes flakes in tests
prabhatnagarajan Nov 2, 2018
14e50cc
fixes flakes in examples
prabhatnagarajan Nov 2, 2018
286ce98
a
seann999 Nov 2, 2018
c53ec0c
a
seann999 Nov 2, 2018
33b9aca
a
seann999 Nov 2, 2018
9423ea9
applies autopep
prabhatnagarajan Nov 2, 2018
b02e833
a
seann999 Nov 2, 2018
a642136
init rainbow
seann999 Nov 3, 2018
8d9d0c8
Merge branch 'master' of https://github.com/chainer/chainerrl
seann999 Nov 3, 2018
93d87b9
init impl for all except n-step-return
seann999 Nov 3, 2018
254bce4
fix conflict
seann999 Nov 3, 2018
41fbd2a
compat w n-step
seann999 Nov 3, 2018
d026939
test fix
seann999 Nov 3, 2018
60e53c4
gpu fix
seann999 Nov 3, 2018
14d0144
gpu fix
seann999 Nov 3, 2018
8e31fec
gpu fix
seann999 Nov 3, 2018
e9661b7
gpu fix
seann999 Nov 3, 2018
5821a0c
softmax
seann999 Nov 3, 2018
3a64094
applied autopep
seann999 Nov 4, 2018
be1aff9
removed redundant example
seann999 Nov 4, 2018
e500ee4
undo breaking changes
seann999 Nov 4, 2018
9e3b909
fix flake8 errors
seann999 Nov 4, 2018
9d47391
fix docs
seann999 Nov 4, 2018
a3e6923
flake8 fix
seann999 Nov 4, 2018
e9f0b4f
flake8 fix
seann999 Nov 4, 2018
e39414e
a
seann999 Nov 4, 2018
a39a15c
a
seann999 Nov 5, 2018
d11951e
a
seann999 Nov 6, 2018
756c5ff
a
seann999 Nov 7, 2018
e7a205b
a
seann999 Nov 8, 2018
23ca9ad
a
seann999 Nov 8, 2018
5d76888
a
seann999 Nov 8, 2018
d5f87b1
a
seann999 Nov 8, 2018
085b668
a
seann999 Nov 8, 2018
afc5554
a
seann999 Nov 8, 2018
c0bd137
a
seann999 Nov 8, 2018
19cb31f
spec
seann999 Nov 8, 2018
861104f
spec
seann999 Nov 8, 2018
0d96f18
a
seann999 Nov 8, 2018
b4f1c47
a
seann999 Nov 8, 2018
dc7e045
a
seann999 Nov 8, 2018
789a3fe
a
seann999 Nov 8, 2018
a34618c
a
seann999 Nov 8, 2018
b92940a
a
seann999 Nov 8, 2018
dac3169
a
seann999 Nov 14, 2018
f3bbf61
a
seann999 Nov 15, 2018
a8efa30
a
seann999 Nov 15, 2018
4224cfa
a
seann999 Nov 16, 2018
d733f02
a
seann999 Nov 26, 2018
ab75eb2
a
seann999 Dec 2, 2018
a9be6ef
a
seann999 Dec 2, 2018
115e58c
a
seann999 Dec 2, 2018
895105b
a
seann999 Dec 2, 2018
05c4d23
a
seann999 Dec 2, 2018
59d167e
a
seann999 Dec 2, 2018
b5d13ec
a
seann999 Dec 2, 2018
fcb4255
a
seann999 Dec 4, 2018
a609225
a
seann999 Dec 4, 2018
7a07ae4
a
seann999 Dec 19, 2018
dc6c674
a
seann999 Dec 19, 2018
21660ad
a
seann999 Jan 7, 2019
9f0a5eb
a
seann999 Jan 7, 2019
0677399
a
seann999 Jan 7, 2019
f54add6
a
seann999 Jan 7, 2019
a87dd71
a
seann999 Jan 8, 2019
3e53847
mv
seann999 Jan 8, 2019
0ab831b
merge
seann999 Jan 16, 2019
c35ae13
merge
seann999 Jan 16, 2019
ffbc7ba
update spec
seann999 Jan 16, 2019
9e9fffe
rm egg
seann999 Jan 17, 2019
f165005
add ignore
seann999 Jan 17, 2019
04e95f7
add ignore
seann999 Jan 17, 2019
3249542
add ignore
seann999 Jan 17, 2019
46a9f32
rm files
seann999 Jan 17, 2019
3a2c2e8
rm junk
seann999 Jan 17, 2019
12 changes: 5 additions & 7 deletions .gitignore
@@ -1,8 +1,6 @@
*.pyc
results
**/__pycache__
.ipynb_checkpoints
chainerrl.egg-info
build/
dist/
.idea/
results/
examples/gym/results/
build/lib/chainerrl
dist
*.ipynb
Empty file modified CONTRIBUTING.md
100644 → 100755
Empty file.
Empty file modified LICENSE
100644 → 100755
Empty file.
Empty file modified README.md
100644 → 100755
Empty file.
Empty file modified assets/ChainerRL.png
100644 → 100755
Empty file modified assets/breakout.gif
100644 → 100755
Empty file modified assets/humanoid.gif
100644 → 100755
Empty file modified chainerrl/__init__.py
100644 → 100755
Empty file.
Binary file added chainerrl/__pycache__/__init__.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/action_value.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/action_value.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/agent.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/agent.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/distribution.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/distribution.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/env.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/env.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/explorer.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/explorer.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/policy.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/policy.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/q_function.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/q_function.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/recurrent.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/recurrent.cpython-36.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added chainerrl/__pycache__/spaces.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/v_function.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/__pycache__/v_function.cpython-36.pyc
Binary file not shown.
74 changes: 74 additions & 0 deletions chainerrl/action_value.py
100644 → 100755
@@ -102,6 +102,80 @@ def __getitem__(self, i):
self.q_values[i], q_values_formatter=self.q_values_formatter)


class DiscreteActionValueWithSigma(ActionValue):
"""Q-function output for discrete action space.

Args:
q_values (ndarray or chainer.Variable):
Array of Q values whose shape is (batchsize, n_actions)
"""

def __init__(self, q_values, sigma_values, all_sigmas=None, q_values_formatter=lambda x: x):
assert isinstance(q_values, chainer.Variable)
self.xp = cuda.get_array_module(q_values.data)
self.q_values = q_values
self.sigmas = sigma_values
self.n_actions = q_values.data.shape[1]
self.q_values_formatter = q_values_formatter
self.all_sigmas = all_sigmas

@cached_property
def greedy_actions(self):
return chainer.Variable(
self.q_values.data.argmax(axis=1).astype(np.int32))

@cached_property
def sample_actions(self):
noise = self.xp.random.standard_normal(self.sigmas.shape)
sig = self.xp.sqrt(self.xp.absolute(self.sigmas.data))
vals = self.q_values.data + sig * noise
return chainer.Variable(vals.argmax(axis=1).astype(np.int32))

def sample_actions_given_sigma(self, sigma):
noise = self.xp.random.standard_normal(self.sigmas.shape)
vals = self.q_values.data + sigma * noise
return chainer.Variable(vals.argmax(axis=1).astype(np.int32))

def sample_actions_given_noise(self, sigma):
vals = self.q_values.data + sigma
return chainer.Variable(vals.argmax(axis=1).astype(np.int32))

@cached_property
def max(self):
with chainer.force_backprop_mode():
return F.select_item(self.q_values, self.greedy_actions)

@cached_property
def max_sigma(self):
with chainer.force_backprop_mode():
return F.select_item(self.sigmas, self.greedy_actions)

def evaluate_actions(self, actions):
return F.select_item(self.q_values, actions)

def evaluate_action_sigmas(self, actions):
return F.select_item(self.sigmas, actions)

def compute_advantage(self, actions):
return self.evaluate_actions(actions) - self.max

def compute_double_advantage(self, actions, argmax_actions):
return (self.evaluate_actions(actions) -
self.evaluate_actions(argmax_actions))

def compute_expectation(self, beta):
return F.sum(F.softmax(beta * self.q_values) * self.q_values, axis=1)

def __repr__(self):
return 'DiscreteActionValueWithSigma greedy_actions:{} q_values:{}'.format(
self.greedy_actions.data,
self.q_values_formatter(self.q_values.data))

@property
def params(self):
return (self.q_values,)


class DistributionalDiscreteActionValue(ActionValue):
"""distributional Q-function output for discrete action space.

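
The DiscreteActionValueWithSigma class added above drives exploration by perturbing every Q-value with Gaussian noise scaled by the square root of the absolute predicted sigma for that action, then acting greedily on the perturbed values. A minimal NumPy sketch of that sampling rule, with illustrative names and without the chainer.Variable plumbing used in the class itself:

import numpy as np

def sample_noisy_actions(q_values, sigmas, rng=np.random):
    # q_values, sigmas: float arrays of shape (batch_size, n_actions)
    noise = rng.standard_normal(sigmas.shape)
    scale = np.sqrt(np.abs(sigmas))      # same scaling as sample_actions above
    perturbed = q_values + scale * noise
    return perturbed.argmax(axis=1)      # one action index per batch element

# Example: the second state has larger sigmas, so its choice is more random.
q = np.array([[1.0, 0.5, 0.2], [0.0, 0.1, 0.3]])
sig = np.array([[0.01, 0.01, 0.01], [2.0, 2.0, 2.0]])
print(sample_noisy_actions(q, sig))
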
Empty file modified chainerrl/agent.py
100644 → 100755
Empty file.
2 changes: 2 additions & 0 deletions chainerrl/agents/__init__.py
100644 → 100755
@@ -1,6 +1,7 @@
from chainerrl.agents.a3c import A3C # NOQA
from chainerrl.agents.acer import ACER # NOQA
from chainerrl.agents.al import AL # NOQA
from chainerrl.agents.categorical_double_dqn import CategoricalDoubleDQN # NOQA
from chainerrl.agents.categorical_dqn import CategoricalDQN # NOQA
from chainerrl.agents.ddpg import DDPG # NOQA
from chainerrl.agents.double_dqn import DoubleDQN # NOQA
@@ -15,4 +16,5 @@
from chainerrl.agents.reinforce import REINFORCE # NOQA
from chainerrl.agents.residual_dqn import ResidualDQN # NOQA
from chainerrl.agents.sarsa import SARSA # NOQA
from chainerrl.agents.expected_sarsa import ExpectedSARSA # NOQA
from chainerrl.agents.trpo import TRPO # NOQA
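
With these two exports in place, the new agents are importable from the package root. A quick sanity check, assuming this branch is installed and that the expected_sarsa module added elsewhere in the PR is present:

from chainerrl.agents import CategoricalDoubleDQN, ExpectedSARSA

# CategoricalDoubleDQN subclasses CategoricalDQN (see categorical_double_dqn.py below)
print(CategoricalDoubleDQN.__bases__)
print(ExpectedSARSA.__name__)
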
Binary file not shown.
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/a3c.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/a3c.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/acer.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/acer.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/al.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/al.cpython-36.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/ddpg.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/ddpg.cpython-36.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/dpp.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/dpp.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/dqn.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/dqn.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/nsq.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/nsq.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/pal.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/pal.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/pcl.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/pcl.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/pgt.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/pgt.cpython-36.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/ppo.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/ppo.cpython-36.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/sarsa.cpython-35.pyc
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/sarsa.cpython-36.pyc
Binary file not shown.
Binary file not shown.
Binary file added chainerrl/agents/__pycache__/trpo.cpython-36.pyc
Binary file not shown.
Empty file modified chainerrl/agents/a3c.py
100644 → 100755
Empty file.
Empty file modified chainerrl/agents/acer.py
100644 → 100755
Empty file.
4 changes: 2 additions & 2 deletions chainerrl/agents/al.py
100644 → 100755
@@ -29,7 +29,7 @@ def __init__(self, *args, **kwargs):
self.alpha = kwargs.pop('alpha', 0.9)
super().__init__(*args, **kwargs)

def _compute_y_and_t(self, exp_batch, gamma):
def _compute_y_and_t(self, exp_batch):

batch_state = exp_batch['state']
batch_size = len(exp_batch['reward'])
@@ -56,7 +56,7 @@ def _compute_y_and_t(self, exp_batch, gamma):
batch_terminal = exp_batch['is_state_terminal']

# T Q: Bellman operator
t_q = batch_rewards + self.gamma * \
t_q = batch_rewards + exp_batch['discount'] * \
(1.0 - batch_terminal) * next_q_max

# T_AL Q: advantage learning operator
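
The substitution of exp_batch['discount'] for self.gamma here (and in the agents below) is what makes the Bellman backup n-step aware: the batching step now stores, for each sampled sequence of transitions, the discounted sum of its rewards and the effective discount gamma**n to apply to the bootstrap term. A rough sketch of that batching under assumed field names, not the library's exact batch_experiences implementation:

import numpy as np

def batch_n_step_experiences(experiences, gamma, phi=lambda x: x):
    # experiences: list of sequences of consecutive transition dicts
    # [t_0, ..., t_{n-1}], each holding state/action/reward/next_state/...
    return {
        'state': np.asarray([phi(seq[0]['state']) for seq in experiences]),
        'action': np.asarray([seq[0]['action'] for seq in experiences]),
        # n-step return accumulated over the sequence
        'reward': np.asarray(
            [sum(tr['reward'] * gamma ** i for i, tr in enumerate(seq))
             for seq in experiences], dtype=np.float32),
        'next_state': np.asarray(
            [phi(seq[-1]['next_state']) for seq in experiences]),
        'is_state_terminal': np.asarray(
            [seq[-1]['is_state_terminal'] for seq in experiences],
            dtype=np.float32),
        # discount applied to the bootstrap term: gamma ** n
        'discount': np.asarray(
            [gamma ** len(seq) for seq in experiences], dtype=np.float32),
    }
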
49 changes: 49 additions & 0 deletions chainerrl/agents/categorical_double_dqn.py
@@ -0,0 +1,49 @@
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from future import standard_library
standard_library.install_aliases() # NOQA

import chainer

from chainerrl.agents import categorical_dqn
from chainerrl.agents.categorical_dqn import _apply_categorical_projection
from chainerrl.recurrent import state_kept


class CategoricalDoubleDQN(categorical_dqn.CategoricalDQN):
"""Categorical Double DQN.

"""

def _compute_target_values(self, exp_batch):
"""Compute a batch of target return distributions."""

batch_next_state = exp_batch['next_state']

with chainer.using_config('train', False), state_kept(self.q_function):
next_qout = self.q_function(batch_next_state)

target_next_qout = self.target_q_function(batch_next_state)

next_q_max = target_next_qout.evaluate_actions(
next_qout.greedy_actions)

batch_rewards = exp_batch['reward']
batch_terminal = exp_batch['is_state_terminal']
discount = exp_batch['discount']

batch_size = exp_batch['reward'].shape[0]
z_values = target_next_qout.z_values
n_atoms = z_values.size

# next_q_max: (batch_size, n_atoms)
next_q_max = target_next_qout.max_as_distribution.array
assert next_q_max.shape == (batch_size, n_atoms), next_q_max.shape

# Tz: (batch_size, n_atoms)
Tz = (batch_rewards[..., None]
+ (1.0 - batch_terminal[..., None]) * discount[..., None]
* z_values[None])
return _apply_categorical_projection(Tz, next_q_max, z_values)
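
The target built above follows the C51 recipe: shift the fixed support z by the n-step reward and per-example discount, then project the shifted distribution back onto the original atoms. A small NumPy sketch of that projection step (the job done by _apply_categorical_projection; the plain loops and names are illustrative, not the library's vectorized code):

import numpy as np

def project_onto_support(Tz, probs, z):
    # Tz, probs: (batch_size, n_atoms); z: (n_atoms,) fixed, evenly spaced atoms
    v_min, v_max = z[0], z[-1]
    delta_z = z[1] - z[0]
    Tz = np.clip(Tz, v_min, v_max)
    b = (Tz - v_min) / delta_z               # fractional atom index
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)
    out = np.zeros_like(probs)
    for i in range(Tz.shape[0]):
        for j in range(Tz.shape[1]):
            l, u = lower[i, j], upper[i, j]
            if l == u:                        # Tz landed exactly on an atom
                out[i, l] += probs[i, j]
            else:                             # split mass between neighbours
                out[i, l] += probs[i, j] * (u - b[i, j])
                out[i, u] += probs[i, j] * (b[i, j] - l)
    return out
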
78 changes: 68 additions & 10 deletions chainerrl/agents/categorical_dqn.py
100644 → 100755
@@ -6,6 +6,7 @@
standard_library.install_aliases() # NOQA

import chainer
from chainer import cuda
import chainer.functions as F
import numpy as np

@@ -72,6 +73,51 @@ def _apply_categorical_projection(y, y_probs, z):
return z_probs


def compute_value_loss(y, t, batch_accumulator='mean'):
"""Compute a loss for value prediction problem.

Args:
y (Variable or ndarray): Predicted values.
t (Variable or ndarray): Target values.
batch_accumulator (str): 'mean' or 'sum'. 'mean' will use the mean of
the loss values in a batch. 'sum' will use the sum.
Returns:
(Variable) scalar loss
"""
assert batch_accumulator in ('mean', 'sum')

eltwise_loss = -t * F.log(F.clip(y, 1e-10, 1.))

if batch_accumulator == 'sum':
loss = F.sum(eltwise_loss)
else:
loss = F.mean(F.sum(eltwise_loss, axis=1))
return loss


def compute_weighted_value_loss(y, t, weights, batch_accumulator='mean'):
"""Compute a loss for value prediction problem.

Args:
y (Variable or ndarray): Predicted values.
t (Variable or ndarray): Target values.
weights (ndarray): Weights for y, t.
batch_accumulator (str): 'mean' or 'sum'. 'mean' divides the loss by the batch size; 'sum' sums it.
Returns:
(Variable) scalar loss
"""
assert batch_accumulator in ('mean', 'sum')

eltwise_loss = -t * F.log(F.clip(y, 1e-10, 1.))

# Weight each example's loss by its prioritized-replay weight before
# accumulating over the batch
loss = F.sum(eltwise_loss, axis=1) * weights
if batch_accumulator == 'sum':
loss = F.sum(loss)
else:
loss = F.mean(loss)
return loss


class CategoricalDQN(dqn.DQN):
"""Categorical DQN.

@@ -81,7 +127,7 @@ class CategoricalDQN(dqn.DQN):
DistributionalDiscreteActionValue and clip_delta is ignored.
"""

def _compute_target_values(self, exp_batch, gamma):
def _compute_target_values(self, exp_batch):
"""Compute a batch of target return distributions."""

batch_next_state = exp_batch['next_state']
@@ -100,10 +146,12 @@ def _compute_target_values(self, exp_batch, gamma):

# Tz: (batch_size, n_atoms)
Tz = (batch_rewards[..., None]
+ (1.0 - batch_terminal[..., None]) * gamma * z_values[None])
+ (1.0 - batch_terminal[..., None])
* self.xp.expand_dims(exp_batch['discount'], 1)
* z_values[None])
return _apply_categorical_projection(Tz, next_q_max, z_values)

def _compute_y_and_t(self, exp_batch, gamma):
def _compute_y_and_t(self, exp_batch):
"""Compute a batch of predicted/target return distributions."""

batch_size = exp_batch['reward'].shape[0]
@@ -120,19 +168,29 @@ def _compute_y_and_t(self, exp_batch, gamma):
assert batch_q.shape == (batch_size, n_atoms)

with chainer.no_backprop_mode():
batch_q_target = self._compute_target_values(exp_batch, gamma)
batch_q_target = self._compute_target_values(exp_batch)
assert batch_q_target.shape == (batch_size, n_atoms)

return batch_q, batch_q_target

def _compute_loss(self, exp_batch, gamma, errors_out=None):
def _compute_loss(self, exp_batch, errors_out=None):
"""Compute a loss of categorical DQN."""
y, t = self._compute_y_and_t(exp_batch, gamma)
y, t = self._compute_y_and_t(exp_batch)
# Minimize the cross entropy
# y is clipped to avoid log(0)
eltwise_loss = -t * F.log(F.clip(y, 1e-10, 1.))
if self.batch_accumulator == 'sum':
loss = F.sum(eltwise_loss)

if errors_out is not None:
del errors_out[:]
delta = F.sum(eltwise_loss, axis=1)
delta = cuda.to_cpu(delta.array)
for e in delta:
errors_out.append(e)

if 'weights' in exp_batch:
return compute_weighted_value_loss(
y, t, exp_batch['weights'],
batch_accumulator=self.batch_accumulator)
else:
loss = F.mean(F.sum(eltwise_loss, axis=1))
return loss
return compute_value_loss(y, t,
batch_accumulator=self.batch_accumulator)
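
Relative to the old inline loss, the refactor above routes prioritized-replay importance weights into the categorical cross-entropy. In plain NumPy the intended computation is roughly the following, assuming weights holds one importance weight per batch element:

import numpy as np

def categorical_ce_loss(pred, target, weights=None, batch_accumulator='mean'):
    # pred, target: (batch_size, n_atoms) return distributions;
    # pred is clipped to avoid log(0), as in the code above.
    eltwise = -target * np.log(np.clip(pred, 1e-10, 1.0))
    per_example = eltwise.sum(axis=1)
    if weights is not None:     # importance weights from prioritized replay
        per_example = per_example * weights
    if batch_accumulator == 'sum':
        return per_example.sum()
    return per_example.mean()
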
4 changes: 2 additions & 2 deletions chainerrl/agents/ddpg.py
100644 → 100755
@@ -255,7 +255,7 @@ def compute_actor_loss(self, batch):
def update(self, experiences, errors_out=None):
"""Update the model from experiences"""

batch = batch_experiences(experiences, self.xp, self.phi)
batch = batch_experiences(experiences, self.xp, self.phi, self.gamma)
self.critic_optimizer.update(lambda: self.compute_critic_loss(batch))
self.actor_optimizer.update(lambda: self.compute_actor_loss(batch))

@@ -273,7 +273,7 @@ def update_from_episodes(self, episodes, errors_out=None):
break
transitions.append(ep[i])
batch = batch_experiences(
transitions, xp=self.xp, phi=self.phi)
transitions, xp=self.xp, phi=self.phi, gamma=self.gamma)
batches.append(batch)

with self.model.state_reset(), self.target_model.state_reset():
5 changes: 3 additions & 2 deletions chainerrl/agents/double_dqn.py
100644 → 100755
@@ -17,7 +17,7 @@ class DoubleDQN(dqn.DQN):
See: http://arxiv.org/abs/1509.06461.
"""

def _compute_target_values(self, exp_batch, gamma):
def _compute_target_values(self, exp_batch):

batch_next_state = exp_batch['next_state']

@@ -31,5 +31,6 @@ def _compute_target_values(self, exp_batch, gamma):

batch_rewards = exp_batch['reward']
batch_terminal = exp_batch['is_state_terminal']
discount = exp_batch['discount']

return batch_rewards + self.gamma * (1.0 - batch_terminal) * next_q_max
return batch_rewards + discount * (1.0 - batch_terminal) * next_q_max
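
For double_dqn.py the only functional change is swapping self.gamma for the per-example discount; the surrounding logic is the usual Double DQN decoupling, where the online network chooses the next action and the target network scores it. A compact NumPy restatement of the target computed here, with assumed argument names:

import numpy as np

def double_dqn_target(reward, terminal, discount, q_online_next, q_target_next):
    # reward, terminal, discount: shape (batch_size,)
    # q_online_next, q_target_next: shape (batch_size, n_actions)
    greedy = q_online_next.argmax(axis=1)                   # action choice: online net
    next_q = q_target_next[np.arange(len(greedy)), greedy]  # evaluation: target net
    return reward + discount * (1.0 - terminal) * next_q
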
4 changes: 2 additions & 2 deletions chainerrl/agents/double_pal.py
100644 → 100755
@@ -15,7 +15,7 @@

class DoublePAL(pal.PAL):

def _compute_y_and_t(self, exp_batch, gamma):
def _compute_y_and_t(self, exp_batch):

batch_state = exp_batch['state']
batch_size = len(exp_batch['reward'])
@@ -45,7 +45,7 @@ def _compute_y_and_t(self, exp_batch, gamma):
batch_terminal = exp_batch['is_state_terminal']

# T Q: Bellman operator
t_q = batch_rewards + self.gamma * \
t_q = batch_rewards + exp_batch['discount'] * \
(1.0 - batch_terminal) * next_q_max

# T_PAL Q: persistent advantage learning operator
8 changes: 4 additions & 4 deletions chainerrl/agents/dpp.py
100644 → 100755
@@ -26,7 +26,7 @@ class AbstractDPP(with_metaclass(ABCMeta, DQN)):
def _l_operator(self, qout):
raise NotImplementedError()

def _compute_target_values(self, exp_batch, gamma):
def _compute_target_values(self, exp_batch):

batch_next_state = exp_batch['next_state']

@@ -37,9 +37,9 @@ def _compute_target_values(self, exp_batch, gamma):
batch_terminal = exp_batch['is_state_terminal']

return (batch_rewards +
self.gamma * (1 - batch_terminal) * next_q_expect)
exp_batch['discount'] * (1 - batch_terminal) * next_q_expect)

def _compute_y_and_t(self, exp_batch, gamma):
def _compute_y_and_t(self, exp_batch):

batch_state = exp_batch['state']
batch_size = len(exp_batch['reward'])
@@ -65,7 +65,7 @@ def _compute_y_and_t(self, exp_batch, gamma):

# r + g * LQ'(s_{t+1},a)
batch_q_target = F.reshape(
self._compute_target_values(exp_batch, gamma), (batch_size, 1))
self._compute_target_values(exp_batch), (batch_size, 1))

# Q'(s_t,a_t) + r + g * LQ'(s_{t+1},a) - LQ'(s_t,a)
t = target_q + batch_q_target - target_q_expect