From b19e7fd28281c2dd909c26dd39f014294bb6f964 Mon Sep 17 00:00:00 2001
From: vincentpierre
Date: Tue, 18 Feb 2020 16:23:19 -0800
Subject: [PATCH 1/5] Fixing #3460

---
 gym-unity/gym_unity/envs/__init__.py | 69 ++++++++++++++++++----------
 1 file changed, 46 insertions(+), 23 deletions(-)

diff --git a/gym-unity/gym_unity/envs/__init__.py b/gym-unity/gym_unity/envs/__init__.py
index f9cf21eab3..fbe8864e30 100644
--- a/gym-unity/gym_unity/envs/__init__.py
+++ b/gym-unity/gym_unity/envs/__init__.py
@@ -1,7 +1,7 @@
 import logging
 import itertools
 import numpy as np
-from typing import Any, Dict, List, Optional, Tuple, Union, Set
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import gym
 from gym import error, spaces
@@ -74,7 +74,10 @@ def __init__(
 
         self.visual_obs = None
         self._n_agents = -1
-        self._done_agents: Set[int] = set()
+
+        self._gym_id_order: List[int] = []
+        self._done_agents_index_to_last_reward: Dict[int, float] = {}
+
         # Save the step result from the last time all Agents requested decisions.
         self._previous_step_result: BatchedStepResult = None
         self._multiagent = multiagent
@@ -121,6 +124,7 @@ def __init__(
         step_result = self._env.get_step_result(self.brain_name)
         self._check_agents(step_result.n_agents())
         self._previous_step_result = step_result
+        self._gym_id_order = list(self._previous_step_result.agent_id)
 
         # Set observation and action spaces
         if self.group_spec.is_action_discrete():
@@ -368,11 +372,18 @@ def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
                 "The number of agents in the scene does not match the expected number."
             )
 
-        # remove the done Agents
-        indices_to_keep: List[int] = []
-        for index, is_done in enumerate(step_result.done):
-            if not is_done:
-                indices_to_keep.append(index)
+        if step_result.n_agents() - sum(step_result.done) != self._n_agents:
+            raise UnityGymException(
+                "The number of agents in the scene does not match the expected number."
+            )
+
+        for index, agent_id in enumerate(step_result.agent_id):
+            if step_result.done[index]:
+                gym_index = self._gym_id_order.index(agent_id)
+                self._done_agents_index_to_last_reward[gym_index] = step_result.reward[
+                    index
+                ]
+                self._gym_id_order[gym_index] = -1  # no agent at that index
 
         # Set the new AgentDone flags to True
         # Note that the corresponding agent_id that gets marked done will be different
         # from the original agent that was done, but the gym interface
         # only cares about the ordering.
         for index, agent_id in enumerate(step_result.agent_id):
             if not self._previous_step_result.contains_agent(agent_id):
+                # insert the id in the current id list:
+                self._gym_id_order[self._gym_id_order.index(-1)] = agent_id
+                # This is a new agent
                 step_result.done[index] = True
-            if agent_id in self._done_agents:
-                step_result.done[index] = True
-        self._done_agents = set()
+                # The index of the agent among not-done agents is
+                original_index = self._gym_id_order.index(agent_id)
+                step_result.reward[index] = self._done_agents_index_to_last_reward[
+                    original_index
+                ]
+        self._done_agents_index_to_last_reward = {}
+
         self._previous_step_result = step_result  # store the new original
 
+        new_id_order = []
+        for agent_id in self._gym_id_order:
+            new_id_order.append(list(step_result.agent_id).index(agent_id))
+
         _mask: Optional[List[np.array]] = None
         if step_result.action_mask is not None:
             _mask = []
             for mask_index in range(len(step_result.action_mask)):
-                _mask.append(step_result.action_mask[mask_index][indices_to_keep])
+                _mask.append(step_result.action_mask[mask_index][new_id_order])
         new_obs: List[np.array] = []
         for obs_index in range(len(step_result.obs)):
-            new_obs.append(step_result.obs[obs_index][indices_to_keep])
+            new_obs.append(step_result.obs[obs_index][new_id_order])
         return BatchedStepResult(
             obs=new_obs,
-            reward=step_result.reward[indices_to_keep],
-            done=step_result.done[indices_to_keep],
-            max_step=step_result.max_step[indices_to_keep],
-            agent_id=step_result.agent_id[indices_to_keep],
+            reward=step_result.reward[new_id_order],
+            done=step_result.done[new_id_order],
+            max_step=step_result.max_step[new_id_order],
+            agent_id=step_result.agent_id[new_id_order],
             action_mask=_mask,
         )
 
     def _sanitize_action(self, action: np.array) -> np.array:
-        if self._previous_step_result.n_agents() == self._n_agents:
-            return action
         sanitized_action = np.zeros(
             (self._previous_step_result.n_agents(), self.group_spec.action_size)
         )
-        input_index = 0
-        for index in range(self._previous_step_result.n_agents()):
+        for index, agent_id in enumerate(self._previous_step_result.agent_id):
             if not self._previous_step_result.done[index]:
-                sanitized_action[index, :] = action[input_index, :]
-                input_index = input_index + 1
+                array_index = self._gym_id_order.index(agent_id)
+                sanitized_action[index, :] = action[array_index, :]
         return sanitized_action
 
     def _step(self, needs_reset: bool = False) -> BatchedStepResult:
@@ -432,7 +451,11 @@ def _step(self, needs_reset: bool = False) -> BatchedStepResult:
                     "The environment does not have the expected amount of agents."
                     + "Some agents did not request decisions at the same time."
                 )
-            self._done_agents.update(list(info.agent_id))
+            for agent_id, reward in zip(info.agent_id, info.reward):
+                gym_index = self._gym_id_order.index(agent_id)
+                self._done_agents_index_to_last_reward[gym_index] = reward
+                self._gym_id_order[gym_index] = -1  # no agent at that index
+
             self._env.step()
             info = self._env.get_step_result(self.brain_name)
         return self._sanitize_info(info)

From 322ce3eef398e98b06988a26f58d9827849b4d36 Mon Sep 17 00:00:00 2001
From: vincentpierre
Date: Tue, 18 Feb 2020 17:08:25 -0800
Subject: [PATCH 2/5] Addressing comments

---
 gym-unity/gym_unity/envs/__init__.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/gym-unity/gym_unity/envs/__init__.py b/gym-unity/gym_unity/envs/__init__.py
index fbe8864e30..46cee347bc 100644
--- a/gym-unity/gym_unity/envs/__init__.py
+++ b/gym-unity/gym_unity/envs/__init__.py
@@ -392,7 +392,8 @@ def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
         for index, agent_id in enumerate(step_result.agent_id):
             if not self._previous_step_result.contains_agent(agent_id):
                 # insert the id in the current id list:
-                self._gym_id_order[self._gym_id_order.index(-1)] = agent_id
+                original_index = self._gym_id_order.index(-1)
+                self._gym_id_order[original_index] = agent_id
                 # This is a new agent
                 step_result.done[index] = True
                 # The index of the agent among not-done agents is
@@ -405,8 +406,9 @@ def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
         self._previous_step_result = step_result  # store the new original
 
         new_id_order = []
+        agent_id_list = list(step_result.agent_id)
         for agent_id in self._gym_id_order:
-            new_id_order.append(list(step_result.agent_id).index(agent_id))
+            new_id_order.append(agent_id_list.index(agent_id))
 
         _mask: Optional[List[np.array]] = None
         if step_result.action_mask is not None:

From 8cceca047ee6c431313c927fa971a42f5e5f793f Mon Sep 17 00:00:00 2001
From: vincentpierre
Date: Wed, 19 Feb 2020 11:13:41 -0800
Subject: [PATCH 3/5] Added 2 tests

---
 gym-unity/gym_unity/envs/__init__.py  |  6 ++--
 gym-unity/gym_unity/tests/test_gym.py | 40 +++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/gym-unity/gym_unity/envs/__init__.py b/gym-unity/gym_unity/envs/__init__.py
index 46cee347bc..3f6edd2ae1 100644
--- a/gym-unity/gym_unity/envs/__init__.py
+++ b/gym-unity/gym_unity/envs/__init__.py
@@ -397,7 +397,6 @@ def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
                 # This is a new agent
                 step_result.done[index] = True
                 # The index of the agent among not-done agents is
-                original_index = self._gym_id_order.index(agent_id)
                 step_result.reward[index] = self._done_agents_index_to_last_reward[
                     original_index
                 ]
@@ -431,9 +430,12 @@ def _sanitize_action(self, action: np.array) -> np.array:
         sanitized_action = np.zeros(
             (self._previous_step_result.n_agents(), self.group_spec.action_size)
         )
+        agent_id_to_gym_index = {
+            agent_id: gym_index for gym_index, agent_id in enumerate(self._gym_id_order)
+        }
         for index, agent_id in enumerate(self._previous_step_result.agent_id):
             if not self._previous_step_result.done[index]:
-                array_index = self._gym_id_order.index(agent_id)
+                array_index = agent_id_to_gym_index[agent_id]
                 sanitized_action[index, :] = action[array_index, :]
         return sanitized_action
 
diff --git a/gym-unity/gym_unity/tests/test_gym.py b/gym-unity/gym_unity/tests/test_gym.py
index 59ff05a92c..29f6182222 100644
--- a/gym-unity/gym_unity/tests/test_gym.py
+++ b/gym-unity/gym_unity/tests/test_gym.py
@@ -84,6 +84,46 @@ def test_gym_wrapper_visual(mock_env, use_uint8):
     assert isinstance(info, dict)
 
 
+@mock.patch("gym_unity.envs.UnityEnvironment")
+def test_sanitize_action_shuffled_id(mock_env):
+    mock_spec = create_mock_group_spec(
+        vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
+    )
+    mock_step = create_mock_vector_step_result(num_agents=5)
+    mock_step.agent_id = np.array(range(5))
+    setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
+    env = UnityEnv(" ", use_visual=False, multiagent=True)
+
+    shuffled_step_result = create_mock_vector_step_result(num_agents=5)
+    shuffled_order = [4, 2, 3, 1, 0]
+    shuffled_step_result.reward = np.array(shuffled_order)
+    shuffled_step_result.agent_id = np.array(shuffled_order)
+    sanitized_result = env._sanitize_info(shuffled_step_result)
+    for expected_reward, reward in zip(range(5), sanitized_result.reward):
+        assert expected_reward == reward
+    for expected_agent_id, agent_id in zip(range(5), sanitized_result.agent_id):
+        assert expected_agent_id == agent_id
+
+
+@mock.patch("gym_unity.envs.UnityEnvironment")
+def test_sanitize_action_one_agent_done(mock_env):
+    mock_spec = create_mock_group_spec(
+        vector_action_space_type="discrete", vector_action_space_size=[2, 2, 3]
+    )
+    mock_step = create_mock_vector_step_result(num_agents=5)
+    mock_step.agent_id = np.array(range(5))
+    setup_mock_unityenvironment(mock_env, mock_spec, mock_step)
+    env = UnityEnv(" ", use_visual=False, multiagent=True)
+
+    received_step_result = create_mock_vector_step_result(num_agents=6)
+    received_step_result.agent_id = np.array(range(6))
+    # agent #3 (id = 2) is Done
+    received_step_result.done = np.array([False] * 2 + [True] + [False] * 3)
+    sanitized_result = env._sanitize_info(received_step_result)
+    for expected_agent_id, agent_id in zip([0, 1, 5, 3, 4], sanitized_result.agent_id):
+        assert expected_agent_id == agent_id
+
+
 # Helper methods

From 488592067561992c0f99d97fda7df277fa261ace Mon Sep 17 00:00:00 2001
From: Chris Elion
Date: Mon, 24 Feb 2020 12:07:46 -0800
Subject: [PATCH 4/5] encapsulate the agent mapping operations (#3481)

* encapsulate the agent mapping operations
* rename, linear time impl
* cleanup
* dict.popitem
* update comments
---
 gym-unity/gym_unity/envs/__init__.py  | 128 ++++++++++++++++++++------
 gym-unity/gym_unity/tests/test_gym.py |  37 +++++++-
 2 files changed, 136 insertions(+), 29 deletions(-)

diff --git a/gym-unity/gym_unity/envs/__init__.py b/gym-unity/gym_unity/envs/__init__.py
index 3f6edd2ae1..c7da3af9b9 100644
--- a/gym-unity/gym_unity/envs/__init__.py
+++ b/gym-unity/gym_unity/envs/__init__.py
@@ -75,8 +75,7 @@ def __init__(
 
         self.visual_obs = None
         self._n_agents = -1
-        self._gym_id_order: List[int] = []
-        self._done_agents_index_to_last_reward: Dict[int, float] = {}
+        self.agent_mapper = AgentIdIndexMapper()
 
         # Save the step result from the last time all Agents requested decisions.
         self._previous_step_result: BatchedStepResult = None
         self._multiagent = multiagent
@@ -124,7 +123,7 @@ def __init__(
         step_result = self._env.get_step_result(self.brain_name)
         self._check_agents(step_result.n_agents())
         self._previous_step_result = step_result
-        self._gym_id_order = list(self._previous_step_result.agent_id)
+        self.agent_mapper.set_initial_agents(list(self._previous_step_result.agent_id))
 
         # Set observation and action spaces
         if self.group_spec.is_action_discrete():
@@ -379,11 +378,7 @@ def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
 
         for index, agent_id in enumerate(step_result.agent_id):
             if step_result.done[index]:
-                gym_index = self._gym_id_order.index(agent_id)
-                self._done_agents_index_to_last_reward[gym_index] = step_result.reward[
-                    index
-                ]
-                self._gym_id_order[gym_index] = -1  # no agent at that index
+                self.agent_mapper.mark_agent_done(agent_id, step_result.reward[index])
 
         # Set the new AgentDone flags to True
         # Note that the corresponding agent_id that gets marked done will be different
@@ -391,23 +386,17 @@ def _sanitize_info(self, step_result: BatchedStepResult) -> BatchedStepResult:
         # only cares about the ordering.
         for index, agent_id in enumerate(step_result.agent_id):
             if not self._previous_step_result.contains_agent(agent_id):
-                # insert the id in the current id list:
-                original_index = self._gym_id_order.index(-1)
-                self._gym_id_order[original_index] = agent_id
-                # This is a new agent
+                # Register this agent, and get the reward of the previous agent that
+                # was in its index, so that we can return it to the gym.
+                last_reward = self.agent_mapper.register_new_agent_id(agent_id)
                 step_result.done[index] = True
-                # The index of the agent among not-done agents is
-                step_result.reward[index] = self._done_agents_index_to_last_reward[
-                    original_index
-                ]
-        self._done_agents_index_to_last_reward = {}
+                step_result.reward[index] = last_reward
 
         self._previous_step_result = step_result  # store the new original
 
-        new_id_order = []
-        agent_id_list = list(step_result.agent_id)
-        for agent_id in self._gym_id_order:
-            new_id_order.append(agent_id_list.index(agent_id))
+
+        # Get a permutation of the agent IDs so that a given ID stays in the same
+        # index as where it was first seen.
+        new_id_order = self.agent_mapper.get_id_permutation(list(step_result.agent_id))
 
         _mask: Optional[List[np.array]] = None
         if step_result.action_mask is not None:
@@ -430,12 +419,9 @@ def _sanitize_action(self, action: np.array) -> np.array:
         sanitized_action = np.zeros(
             (self._previous_step_result.n_agents(), self.group_spec.action_size)
         )
-        agent_id_to_gym_index = {
-            agent_id: gym_index for gym_index, agent_id in enumerate(self._gym_id_order)
-        }
         for index, agent_id in enumerate(self._previous_step_result.agent_id):
             if not self._previous_step_result.done[index]:
-                array_index = agent_id_to_gym_index[agent_id]
+                array_index = self.agent_mapper.get_gym_index(agent_id)
                 sanitized_action[index, :] = action[array_index, :]
         return sanitized_action
 
@@ -456,9 +442,7 @@ def _step(self, needs_reset: bool = False) -> BatchedStepResult:
                     + "Some agents did not request decisions at the same time."
                 )
             for agent_id, reward in zip(info.agent_id, info.reward):
-                gym_index = self._gym_id_order.index(agent_id)
-                self._done_agents_index_to_last_reward[gym_index] = reward
-                self._gym_id_order[gym_index] = -1  # no agent at that index
+                self.agent_mapper.mark_agent_done(agent_id, reward)
 
             self._env.step()
             info = self._env.get_step_result(self.brain_name)
         return self._sanitize_info(info)
@@ -526,3 +510,91 @@ def lookup_action(self, action):
         :return: The List containing the branched actions.
         """
         return self.action_lookup[action]
+
+
+class AgentIdIndexMapper:
+    def __init__(self) -> None:
+        self._agent_id_to_gym_index: Dict[int, int] = {}
+        self._done_agents_index_to_last_reward: Dict[int, float] = {}
+
+    def set_initial_agents(self, agent_ids: List[int]) -> None:
+        """
+        Provide the initial list of agent ids for the mapper
+        """
+        for idx, agent_id in enumerate(agent_ids):
+            self._agent_id_to_gym_index[agent_id] = idx
+
+    def mark_agent_done(self, agent_id: int, reward: float) -> None:
+        """
+        Declare the agent done with the corresponding final reward.
+        """
+        gym_index = self._agent_id_to_gym_index.pop(agent_id)
+        self._done_agents_index_to_last_reward[gym_index] = reward
+
+    def register_new_agent_id(self, agent_id: int) -> float:
+        """
+        Adds the new agent ID and returns the reward to use for the previous agent in this index
+        """
+        # Any free index is OK here.
+        free_index, last_reward = self._done_agents_index_to_last_reward.popitem()
+        self._agent_id_to_gym_index[agent_id] = free_index
+        return last_reward
+
+    def get_id_permutation(self, agent_ids: List[int]) -> List[int]:
+        """
+        Get the permutation from new agent ids to the order that preserves the positions of previous agents.
+        The result is a list with each integer from 0 to len(agent_ids)-1 appearing exactly once.
+        """
+        # Map the new agent ids to their index
+        new_agent_ids_to_index = {
+            agent_id: idx for idx, agent_id in enumerate(agent_ids)
+        }
+
+        # Make the output list. We don't write to it sequentially, so start with dummy values.
+        new_permutation = [-1] * len(agent_ids)
+
+        # For each agent ID, find its new index and write it at the agent's original index.
+        for agent_id, original_index in self._agent_id_to_gym_index.items():
+            new_permutation[original_index] = new_agent_ids_to_index[agent_id]
+        return new_permutation
+
+    def get_gym_index(self, agent_id: int) -> int:
+        """
+        Get the gym index for the current agent.
+        """
+        return self._agent_id_to_gym_index[agent_id]
+
+
+class AgentIdIndexMapperSlow:
+    """
+    Reference implementation of AgentIdIndexMapper.
+    The operations are O(N^2) so it shouldn't be used for large numbers of agents.
+    See AgentIdIndexMapper for method descriptions
+    """
+
+    def __init__(self) -> None:
+        self._gym_id_order: List[int] = []
+        self._done_agents_index_to_last_reward: Dict[int, float] = {}
+
+    def set_initial_agents(self, agent_ids: List[int]) -> None:
+        self._gym_id_order = list(agent_ids)
+
+    def mark_agent_done(self, agent_id: int, reward: float) -> None:
+        gym_index = self._gym_id_order.index(agent_id)
+        self._done_agents_index_to_last_reward[gym_index] = reward
+        self._gym_id_order[gym_index] = -1
+
+    def register_new_agent_id(self, agent_id: int) -> float:
+        original_index = self._gym_id_order.index(-1)
+        self._gym_id_order[original_index] = agent_id
+        reward = self._done_agents_index_to_last_reward.pop(original_index)
+        return reward
+
+    def get_id_permutation(self, agent_ids):
+        new_id_order = []
+        for agent_id in self._gym_id_order:
+            new_id_order.append(agent_ids.index(agent_id))
+        return new_id_order
+
+    def get_gym_index(self, agent_id: int) -> int:
+        return self._gym_id_order.index(agent_id)
diff --git a/gym-unity/gym_unity/tests/test_gym.py b/gym-unity/gym_unity/tests/test_gym.py
index 29f6182222..e2015dc64c 100644
--- a/gym-unity/gym_unity/tests/test_gym.py
+++ b/gym-unity/gym_unity/tests/test_gym.py
@@ -3,7 +3,12 @@
 import numpy as np
 
 from gym import spaces
-from gym_unity.envs import UnityEnv, UnityGymException
+from gym_unity.envs import (
+    UnityEnv,
+    UnityGymException,
+    AgentIdIndexMapper,
+    AgentIdIndexMapperSlow,
+)
 
 from mlagents_envs.base_env import AgentGroupSpec, ActionType, BatchedStepResult
@@ -183,3 +188,33 @@ def setup_mock_unityenvironment(mock_env, mock_spec, mock_result):
     mock_env.return_value.get_agent_groups.return_value = ["MockBrain"]
     mock_env.return_value.get_agent_group_spec.return_value = mock_spec
     mock_env.return_value.get_step_result.return_value = mock_result
+
+
+@pytest.mark.parametrize("mapper_cls", [AgentIdIndexMapper, AgentIdIndexMapperSlow])
+def test_agent_id_index_mapper(mapper_cls):
+    mapper = mapper_cls()
+    initial_agent_ids = [1001, 1002, 1003, 1004]
+    mapper.set_initial_agents(initial_agent_ids)
+
+    # Mark some agents as done with their last rewards.
+    mapper.mark_agent_done(1001, 42.0)
+    mapper.mark_agent_done(1004, 1337.0)
+
+    # Now add new agents, and get the rewards of the agent they replaced.
+    old_reward1 = mapper.register_new_agent_id(2001)
+    old_reward2 = mapper.register_new_agent_id(2002)
+
+    # Order of the rewards doesn't matter
+    assert {old_reward1, old_reward2} == {42.0, 1337.0}
+
+    new_agent_ids = [1002, 1003, 2001, 2002]
+    permutation = mapper.get_id_permutation(new_agent_ids)
+    # Make sure it's actually a permutation - needs to contain 0..N-1 with no repeats.
+    assert set(permutation) == set(range(0, 4))
+
+    # For initial agents that were in the initial group, they need to be in the same slot.
+    # Agents that were added later can be anywhere.
+    permuted_ids = [new_agent_ids[i] for i in permutation]
+    for idx, agent_id in enumerate(initial_agent_ids):
+        if agent_id in permuted_ids:
+            assert permuted_ids[idx] == agent_id

From 2076fd08bc50b6aedcc170c0153abd0402918257 Mon Sep 17 00:00:00 2001
From: Chris Elion
Date: Mon, 24 Feb 2020 12:10:44 -0800
Subject: [PATCH 5/5] Update gym-unity/gym_unity/tests/test_gym.py

---
 gym-unity/gym_unity/tests/test_gym.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gym-unity/gym_unity/tests/test_gym.py b/gym-unity/gym_unity/tests/test_gym.py
index e2015dc64c..3ee6e67913 100644
--- a/gym-unity/gym_unity/tests/test_gym.py
+++ b/gym-unity/gym_unity/tests/test_gym.py
@@ -213,7 +213,7 @@ def test_agent_id_index_mapper(mapper_cls):
     assert set(permutation) == set(range(0, 4))
 
     # For initial agents that were in the initial group, they need to be in the same slot.
-    # Agents that were added later can be anywhere.
+    # Agents that were added later can appear in any free slot.
     permuted_ids = [new_agent_ids[i] for i in permutation]
     for idx, agent_id in enumerate(initial_agent_ids):
         if agent_id in permuted_ids:
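
For reviewers who want to try the change directly, below is a minimal usage sketch of the AgentIdIndexMapper introduced in PATCH 4/5, mirroring the parametrized test above. It is illustrative only and not part of the patch series; it assumes gym_unity from this branch is installed (so AgentIdIndexMapper is importable from gym_unity.envs), and the agent ids (10, 11, 12, 20) and rewards are made up for the example.

from gym_unity.envs import AgentIdIndexMapper

mapper = AgentIdIndexMapper()
mapper.set_initial_agents([10, 11, 12])  # agent ids take gym indices 0, 1, 2

# Agent 11 finishes its episode with a final reward of 3.0, freeing gym index 1.
mapper.mark_agent_done(11, 3.0)

# A new agent (id 20) appears: it takes over a freed index, and the mapper hands
# back the final reward that still needs to be reported for that gym slot.
last_reward = mapper.register_new_agent_id(20)
assert last_reward == 3.0
assert mapper.get_gym_index(20) == 1

# get_id_permutation reorders a freshly received batch so that surviving agents
# keep the gym index they were first seen at.
permutation = mapper.get_id_permutation([12, 20, 10])
reordered = [[12, 20, 10][i] for i in permutation]
assert reordered == [10, 20, 12]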