Skip to content

Commit ac4b1a4

Browse files
author
Jonathan Harper
committed
Remove the --num-runs option
The `--num-runs` command-line option lets a single invocation of mlagents-learn launch multiple identically configured training runs in separate processes. This is a rarely used ML-Agents feature, but it adds complexity to other parts of the system, which must support multiprocessing and manage ports for the parallel training runs. It also doesn't provide truly reproducible experiments, since there is no guarantee of resource isolation between the trials. This commit removes the --num-runs option, with the idea that users will manage parallel or sequential runs of the same experiment themselves in the future.
1 parent b9194a5 commit ac4b1a4

File tree

3 files changed

+16
-73
lines changed

3 files changed

+16
-73
lines changed

docs/Training-ML-Agents.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,10 +114,6 @@ environment, you can set the following command line options when invoking
114114
the oldest checkpoint is deleted when saving a new checkpoint. Defaults to 5.
115115
* `--lesson=<n>`: Specify which lesson to start with when performing curriculum
116116
training. Defaults to 0.
117-
* `--num-runs=<n>`: Sets the number of concurrent training sessions to perform.
118-
Default is set to 1. Set to higher values when benchmarking performance and
119-
multiple training sessions are desired. Training sessions are independent, and
120-
do not improve learning performance.
121117
* `--num-envs=<n>`: Specifies the number of concurrent Unity environment instances to
122118
collect experiences from when training. Defaults to 1.
123119
* `--run-id=<path>`: Specifies an identifier for each training run. This

ml-agents/mlagents/trainers/learn.py

Lines changed: 11 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import logging
33
import argparse
44

5-
from multiprocessing import Process, Queue
65
import os
76
import glob
87
import shutil
@@ -14,7 +13,6 @@
1413
import mlagents_envs
1514
from mlagents import tf_utils
1615
from mlagents.trainers.trainer_controller import TrainerController
17-
from mlagents.trainers.exception import TrainerError
1816
from mlagents.trainers.meta_curriculum import MetaCurriculum
1917
from mlagents.trainers.trainer_util import load_config, TrainerFactory
2018
from mlagents.trainers.stats import TensorboardWriter, CSVWriter, StatsReporter
@@ -29,7 +27,6 @@
2927

3028
class CommandLineOptions(NamedTuple):
3129
debug: bool
32-
num_runs: int
3330
seed: int
3431
env_path: str
3532
run_id: str
@@ -109,9 +106,6 @@ def parse_command_line(argv: Optional[List[str]] = None) -> CommandLineOptions:
109106
default="ppo",
110107
help="The directory name for model and summary statistics",
111108
)
112-
parser.add_argument(
113-
"--num-runs", default=1, type=int, help="Number of concurrent training sessions"
114-
)
115109
parser.add_argument(
116110
"--save-freq", default=50000, type=int, help="Frequency at which to save model"
117111
)
@@ -209,13 +203,9 @@ def parse_command_line(argv: Optional[List[str]] = None) -> CommandLineOptions:
209203
return CommandLineOptions.from_argparse(args)
210204

211205

212-
def run_training(
213-
sub_id: int, run_seed: int, options: CommandLineOptions, process_queue: Queue
214-
) -> None:
206+
def run_training(run_seed: int, options: CommandLineOptions) -> None:
215207
"""
216208
Launches training session.
217-
:param process_queue: Queue used to send signal back to main.
218-
:param sub_id: Unique id for training session.
219209
:param options: parsed command line arguments
220210
:param run_seed: Random seed used for training.
221211
:param run_options: Command line arguments for training.
@@ -225,30 +215,16 @@ def run_training(
225215
curriculum_folder = options.curriculum_folder
226216
# Recognize and use docker volume if one is passed as an argument
227217
if not options.docker_target_name:
228-
model_path = "./models/{run_id}-{sub_id}".format(
229-
run_id=options.run_id, sub_id=sub_id
230-
)
218+
model_path = f"./models/{options.run_id}"
231219
summaries_dir = "./summaries"
232220
else:
233-
trainer_config_path = "/{docker_target_name}/{trainer_config_path}".format(
234-
docker_target_name=options.docker_target_name,
235-
trainer_config_path=trainer_config_path,
236-
)
221+
trainer_config_path = f"/{options.docker_target_name}/{trainer_config_path}"
237222
if curriculum_folder is not None:
238-
curriculum_folder = "/{docker_target_name}/{curriculum_folder}".format(
239-
docker_target_name=options.docker_target_name,
240-
curriculum_folder=curriculum_folder,
241-
)
242-
model_path = "/{docker_target_name}/models/{run_id}-{sub_id}".format(
243-
docker_target_name=options.docker_target_name,
244-
run_id=options.run_id,
245-
sub_id=sub_id,
246-
)
247-
summaries_dir = "/{docker_target_name}/summaries".format(
248-
docker_target_name=options.docker_target_name
249-
)
223+
curriculum_folder = f"/{options.docker_target_name}/{curriculum_folder}"
224+
model_path = f"/{options.docker_target_name}/models/{options.run_id}"
225+
summaries_dir = f"/{options.docker_target_name}/summaries"
250226
trainer_config = load_config(trainer_config_path)
251-
port = options.base_port + (sub_id * options.num_envs)
227+
port = options.base_port
252228

253229
# Configure CSV, Tensorboard Writers and StatsReporter
254230
# We assume reward and episode length are needed in the CSV.
@@ -301,16 +277,14 @@ def run_training(
301277
trainer_factory,
302278
model_path,
303279
summaries_dir,
304-
options.run_id + "-" + str(sub_id),
280+
options.run_id,
305281
options.save_freq,
306282
maybe_meta_curriculum,
307283
options.train_model,
308284
run_seed,
309285
sampler_manager,
310286
resampling_interval,
311287
)
312-
# Signal that environment has been launched.
313-
process_queue.put(True)
314288
# Begin training
315289
try:
316290
tc.start_learning(env_manager)
@@ -461,40 +435,14 @@ def main():
461435
else:
462436
# disable noisy warnings from tensorflow.
463437
tf_utils.set_warnings_enabled(False)
464-
if options.env_path is None and options.num_runs > 1:
465-
raise TrainerError(
466-
"It is not possible to launch more than one concurrent training session "
467-
"when training from the editor."
468-
)
469438

470-
jobs = []
471439
run_seed = options.seed
472440
if options.cpu:
473441
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
474442

475-
if options.num_runs == 1:
476-
if options.seed == -1:
477-
run_seed = np.random.randint(0, 10000)
478-
run_training(0, run_seed, options, Queue())
479-
else:
480-
for i in range(options.num_runs):
481-
if options.seed == -1:
482-
run_seed = np.random.randint(0, 10000)
483-
process_queue = Queue()
484-
p = Process(target=run_training, args=(i, run_seed, options, process_queue))
485-
jobs.append(p)
486-
p.start()
487-
# Wait for signal that environment has successfully launched
488-
while process_queue.get() is not True:
489-
continue
490-
491-
# Wait for jobs to complete. Otherwise we'll have an extra
492-
# unhandled KeyboardInterrupt if we end early.
493-
try:
494-
for job in jobs:
495-
job.join()
496-
except KeyboardInterrupt:
497-
pass
443+
if options.seed == -1:
444+
run_seed = np.random.randint(0, 10000)
445+
run_training(run_seed, options)
498446

499447

500448
# For python debugger to directly run this script

ml-agents/mlagents/trainers/tests/test_learn.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,12 @@ def test_run_training(
3535
mock_init = MagicMock(return_value=None)
3636
with patch.object(TrainerController, "__init__", mock_init):
3737
with patch.object(TrainerController, "start_learning", MagicMock()):
38-
learn.run_training(0, 0, basic_options(), MagicMock())
38+
learn.run_training(0, basic_options())
3939
mock_init.assert_called_once_with(
4040
trainer_factory_mock.return_value,
41-
"./models/ppo-0",
41+
"./models/ppo",
4242
"./summaries",
43-
"ppo-0",
43+
"ppo",
4444
50000,
4545
None,
4646
False,
@@ -69,9 +69,9 @@ def test_docker_target_path(
6969
mock_init = MagicMock(return_value=None)
7070
with patch.object(TrainerController, "__init__", mock_init):
7171
with patch.object(TrainerController, "start_learning", MagicMock()):
72-
learn.run_training(0, 0, options_with_docker_target, MagicMock())
72+
learn.run_training(0, options_with_docker_target)
7373
mock_init.assert_called_once()
74-
assert mock_init.call_args[0][1] == "/dockertarget/models/ppo-0"
74+
assert mock_init.call_args[0][1] == "/dockertarget/models/ppo"
7575
assert mock_init.call_args[0][2] == "/dockertarget/summaries"
7676

7777

@@ -111,7 +111,6 @@ def test_commandline_args():
111111
"--lesson=3",
112112
"--load",
113113
"--run-id=myawesomerun",
114-
"--num-runs=3",
115114
"--save-freq=123456",
116115
"--seed=7890",
117116
"--train",

0 commit comments

Comments
 (0)