
Commit 5a7bfae

Merge branch 'master' into rainbow_scores
2 parents de02800 + 63dad78 commit 5a7bfae

8 files changed: +260 -0 lines changed

.pfnci/config.pbtxt

Lines changed: 9 additions & 0 deletions
@@ -10,6 +10,9 @@ configs {
      memory: 30
      gpu: 1
    }
+   time_limit {
+     seconds: 1200
+   }
    environment_variables { key: "GPU" value: "1" }
    command: "bash .pfnci/script.sh py3.gpu"
  }
@@ -24,6 +27,9 @@ configs {
      cpu: 10
      memory: 10
    }
+   time_limit {
+     seconds: 1200
+   }
    command: "bash .pfnci/script.sh py3.cpu"
  }
}
@@ -53,6 +59,9 @@ configs {
      cpu: 10
      memory: 10
    }
+   time_limit {
+     seconds: 1200
+   }
    command: "bash .pfnci/script.sh py3.chainer4"
  }
}
examples/mujoco/reproduction/trpo/README.md

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
# TRPO on MuJoCo benchmarks

This example trains a TRPO agent ([Trust Region Policy Optimization](https://arxiv.org/abs/1502.05477)) on MuJoCo benchmarks from OpenAI Gym.

We follow the training and evaluation settings of [Deep Reinforcement Learning that Matters](https://arxiv.org/abs/1709.06560), which provides thorough, highly tuned benchmark results.

## Requirements

- MuJoCo Pro 1.5
- mujoco_py>=1.50, <2.1

## Running the Example

```
python train_trpo.py [options]
```

### Useful Options

- `--gpu`. Specifies the GPU device ID. If you do not have a GPU on your machine, run the example with the option `--gpu -1`, e.g. `python train_trpo.py --gpu -1`.
- `--env`. Specifies the environment, e.g. `python train_trpo.py --env HalfCheetah-v2`.
- `--render`. Add this option to render the states in a GUI window.
- `--seed`. Specifies the random seed used.
- `--outdir`. Specifies the output directory to which the results are written.

To view the full list of options, either view the code or run the example with the `--help` option.

## Results

Each score is the average return +/- standard error over 100 evaluation episodes after 2M training steps (a minimal computation sketch follows the table below).

Reported scores are taken from Table 1 of [Deep Reinforcement Learning that Matters](https://arxiv.org/abs/1709.06560).
Here we try to reproduce the TRPO (Schulman et al. 2017) results of the (64, 64) column, which corresponds to the default settings.

| Environment    | ChainerRL Score | Reported Score |
| -------------- |:---------------:|:--------------:|
| HalfCheetah-v2 | **1474**+/-112  | 205+/-256      |
| Hopper-v2      | **3056**+/-44   | 2828+/-70      |
| Walker2d-v2    | 3073+/-59       | N/A            |
| Swimmer-v2     | 200+/-25        | N/A            |
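For reference, here is a minimal sketch of how per-episode evaluation returns can be turned into the "mean +/- standard error" figures used above. The `returns` array is a hypothetical stand-in for the 100 evaluation-episode returns, not data from these runs.

```python
import numpy as np

# Hypothetical per-episode returns from 100 evaluation episodes.
returns = np.random.normal(loc=3000.0, scale=400.0, size=100)

mean = returns.mean()
# Standard error = sample standard deviation / sqrt(number of episodes).
stderr = returns.std(ddof=1) / np.sqrt(len(returns))

print('{:.0f}+/-{:.0f}'.format(mean, stderr))
```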
### Learning Curves

The shaded region represents one standard deviation of the average evaluation over 20 trials (a minimal plotting sketch follows the figures).

![HalfCheetah-v2](assets/HalfCheetah-v2.png)
![Hopper-v2](assets/Hopper-v2.png)
![Walker2d-v2](assets/Walker2d-v2.png)
![Swimmer-v2](assets/Swimmer-v2.png)
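As a rough illustration only, the sketch below averages the evaluation logs of several trials and shades +/- one standard deviation. The `results/trial*/scores.txt` layout is a hypothetical assumption, and the column names follow the tab-separated scores.txt format that ChainerRL's training loop writes.

```python
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Hypothetical layout: one output directory per trial, each holding the
# tab-separated scores.txt written during training.
paths = sorted(glob.glob('results/trial*/scores.txt'))
trials = [pd.read_csv(path, sep='\t') for path in paths]

# Assumes every trial was evaluated at the same step counts.
steps = trials[0]['steps'].values
scores = np.stack([t['mean'].values for t in trials])  # (n_trials, n_evals)

avg = scores.mean(axis=0)
std = scores.std(axis=0)

plt.plot(steps, avg)
plt.fill_between(steps, avg - std, avg + std, alpha=0.3)  # +/- 1 std dev band
plt.xlabel('steps')
plt.ylabel('average return')
plt.savefig('learning_curve.png')
```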
Binary assets added (the learning-curve plots referenced above): four PNG files of 13.9 KB, 14.5 KB, 12.3 KB, and 15.2 KB.
examples/mujoco/reproduction/trpo/train_trpo.py

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
"""A training script of TRPO on OpenAI Gym Mujoco environments.

This script follows the settings of https://arxiv.org/abs/1709.06560 as much
as possible.
"""
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from builtins import *  # NOQA
from future import standard_library
standard_library.install_aliases()  # NOQA

import argparse
import logging
import os

import chainer
from chainer import functions as F
from chainer import links as L
import gym
import gym.spaces
import gym.wrappers
import numpy as np

import chainerrl


def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU device ID. Set to -1 to use CPUs only.')
    parser.add_argument('--env', type=str, default='Hopper-v2',
                        help='Gym Env ID')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 32)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--steps', type=int, default=2 * 10 ** 6,
                        help='Total time steps for training.')
    parser.add_argument('--eval-interval', type=int, default=100000,
                        help='Interval between evaluation phases in steps.')
    parser.add_argument('--eval-n-runs', type=int, default=100,
                        help='Number of episodes ran in an evaluation phase')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render the env')
    parser.add_argument('--demo', action='store_true', default=False,
                        help='Run demo episodes, not training')
    parser.add_argument('--load', type=str, default='',
                        help='Directory path to load a saved agent data from'
                             ' if it is a non-empty string.')
    parser.add_argument('--trpo-update-interval', type=int, default=5000,
                        help='Interval steps of TRPO iterations.')
    parser.add_argument('--logger-level', type=int, default=logging.INFO,
                        help='Level of the root logger.')
    parser.add_argument('--monitor', action='store_true',
                        help='Monitor the env by gym.wrappers.Monitor.'
                             ' Videos and additional log will be saved.')
    args = parser.parse_args()

    logging.basicConfig(level=args.logger_level)

    # Set random seed
    chainerrl.misc.set_random_seed(args.seed, gpus=(args.gpu,))

    args.outdir = chainerrl.experiments.prepare_output_dir(args, args.outdir)

    def make_env(test):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = 2 ** 32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = chainerrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = gym.wrappers.Monitor(env, args.outdir)
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.tags.get(
        'wrapper_config.TimeLimit.max_episode_steps')
    obs_space = env.observation_space
    action_space = env.action_space
    print('Observation space:', obs_space)
    print('Action space:', action_space)

    assert isinstance(obs_space, gym.spaces.Box)

    # Normalize observations based on their empirical mean and variance
    obs_normalizer = chainerrl.links.EmpiricalNormalization(
        obs_space.low.size, clip_threshold=5)

    # Orthogonal weight initialization is used as OpenAI Baselines does
    winit = chainerrl.initializers.Orthogonal(1.)
    winit_last = chainerrl.initializers.Orthogonal(1e-2)

    action_size = action_space.low.size
    policy = chainer.Sequential(
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, action_size, initialW=winit_last),
        chainerrl.policies.GaussianHeadWithStateIndependentCovariance(
            action_size=action_size,
            var_type='diagonal',
            var_func=lambda x: F.exp(2 * x),  # Parameterize log std
            var_param_init=0,  # log std = 0 => std = 1
        ),
    )

    vf = chainer.Sequential(
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 64, initialW=winit),
        F.tanh,
        L.Linear(None, 1, initialW=winit),
    )

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        policy.to_gpu(args.gpu)
        vf.to_gpu(args.gpu)
        obs_normalizer.to_gpu(args.gpu)

    # TRPO's policy is optimized via CG and line search, so it doesn't require
    # a chainer.Optimizer. Only the value function needs it.
    vf_opt = chainer.optimizers.Adam()
    vf_opt.setup(vf)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        policy.xp.zeros_like(obs_space.low, dtype=np.float32)[None],
        name='observation')
    chainerrl.misc.draw_computational_graph(
        [policy(fake_obs)], os.path.join(args.outdir, 'policy'))
    chainerrl.misc.draw_computational_graph(
        [vf(fake_obs)], os.path.join(args.outdir, 'vf'))

    # Hyperparameters in http://arxiv.org/abs/1709.06560
    agent = chainerrl.agents.TRPO(
        policy=policy,
        vf=vf,
        vf_optimizer=vf_opt,
        obs_normalizer=obs_normalizer,
        update_interval=args.trpo_update_interval,
        max_kl=0.01,
        conjugate_gradient_max_iter=20,
        conjugate_gradient_damping=1e-1,
        gamma=0.995,
        lambd=0.97,
        vf_epochs=5,
        entropy_coef=0,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        env = make_env(test=True)
        eval_stats = chainerrl.experiments.eval_performance(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:

        chainerrl.experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            eval_env=make_env(test=True),
            outdir=args.outdir,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            train_max_episode_len=timestep_limit,
        )


if __name__ == '__main__':
    main()
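As a usage note, here is a minimal sketch of re-loading a trained agent outside of `--demo` mode and rolling out one evaluation episode. It assumes `agent` and `make_env` are constructed exactly as in the script above, relies on the generic ChainerRL agent interface (`act`, `stop_episode`), and uses a hypothetical saved-model directory name.

```python
# Minimal sketch (assumptions noted above): load a saved model and run one
# episode with the trained policy.
agent.load('results/2000000_finish')  # hypothetical <steps>_finish directory

env = make_env(test=True)
obs = env.reset()
done = False
episode_return = 0.0
while not done:
    action = agent.act(obs)  # evaluation-time action from the trained policy
    obs, reward, done, _ = env.step(action)
    episode_return += reward
agent.stop_episode()  # clear the agent's per-episode state
print('Return:', episode_return)
```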
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
#!/bin/bash

set -Ceu

outdir=$(mktemp -d)

gpu="$1"

# mujoco/reproduction/trpo (specify non-mujoco env to test without mujoco)
python examples/mujoco/reproduction/trpo/train_trpo.py --steps 10 --trpo-update-interval 5 --outdir $outdir/mujoco/reproduction/trpo --env Pendulum-v0 --gpu $gpu
model=$(find $outdir/mujoco/reproduction/trpo -name "*_finish")
python examples/mujoco/reproduction/trpo/train_trpo.py --demo --load $model --eval-n-runs 1 --env Pendulum-v0 --outdir $outdir/temp --gpu $gpu

0 commit comments
