Skip to content

Commit fe0c2a0

Browse files
Enrico Usai (enrico-usai)
authored and committed
Force nodes to down and power_save when stopping the cluster
If a cluster is stopped while a node is powering up (alloc#/idle#), the node is kept in the powering-up state on cluster start. This makes the node unavailable for the entire ResumeTimeout, which is 60 minutes. Slurm ignores the transition to power_down if we don't put the node to down first.

From @demartinofra

## Manual test
* Created a cluster and submitted a job on it.
* When the node was powering up, stopped the cluster and verified the node is correctly marked as power down.
* Restarted the cluster and verified the node is back to the power-save state (after about 2 minutes).
* The job ran correctly on the new node.

Signed-off-by: Enrico Usai <[email protected]>
1 parent 3489e16 commit fe0c2a0

File tree

3 files changed

+15
-7
lines changed

3 files changed

+15
-7
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ aws-parallelcluster-node CHANGELOG
33

44
This file is used to list changes made in each version of the aws-parallelcluster-node package.
55

6+
2.11.2
7+
-----
8+
9+
**BUG FIXES**
10+
- Slurm: fix issue that was causing powering up nodes to not be correctly reset after a stop and start of the cluster.
11+
612
2.11.1
713
-----
814

src/common/schedulers/slurm_commands.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ def update_all_partitions(state, reset_node_addrs_hostname):
235235
logging.info(f"Setting partition {part.name} state from {part.state} to {state}")
236236
if reset_node_addrs_hostname:
237237
logging.info(f"Resetting partition nodes {part.nodes}")
238-
reset_nodes(part.nodes, state="power_down", reason="stopping cluster")
238+
set_nodes_down_and_power_save(part.nodes, reason="stopping cluster")
239239
partition_to_update.append(part.name)
240240
succeeded_partitions = update_partitions(partition_to_update, state)
241241
return succeeded_partitions == partition_to_update

tests/common/schedulers/test_slurm_commands.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,7 @@ def test_update_partitions(
614614
],
615615
PartitionStatus.INACTIVE,
616616
True,
617-
[call("node-3,node-4", reason="stopping cluster", state="power_down")],
617+
[call("node-3,node-4", reason="stopping cluster")],
618618
["part-2"],
619619
["part-2"],
620620
True,
@@ -627,8 +627,8 @@ def test_update_partitions(
627627
PartitionStatus.INACTIVE,
628628
True,
629629
[
630-
call("node-1,node-2", reason="stopping cluster", state="power_down"),
631-
call("node-3,node-4", reason="stopping cluster", state="power_down"),
630+
call("node-1,node-2", reason="stopping cluster"),
631+
call("node-3,node-4", reason="stopping cluster"),
632632
],
633633
["part-1", "part-2"],
634634
["part-1", "part-2"],
@@ -682,7 +682,9 @@ def test_update_all_partitions(
682682
expected_results,
683683
mocker,
684684
):
685-
reset_node_spy = mocker.patch("common.schedulers.slurm_commands.reset_nodes", auto_spec=True)
685+
set_nodes_down_and_power_save_spy = mocker.patch(
686+
"common.schedulers.slurm_commands.set_nodes_down_and_power_save", auto_spec=True
687+
)
686688
update_partitions_spy = mocker.patch(
687689
"common.schedulers.slurm_commands.update_partitions", return_value=mock_succeeded_partitions, auto_spec=True
688690
)
@@ -692,7 +694,7 @@ def test_update_all_partitions(
692694
assert_that(update_all_partitions(state, reset_node_addrs_hostname=reset_node_info)).is_equal_to(expected_results)
693695
get_part_spy.assert_called_with(get_all_nodes=True)
694696
if expected_reset_nodes_calls:
695-
reset_node_spy.assert_has_calls(expected_reset_nodes_calls)
697+
set_nodes_down_and_power_save_spy.assert_has_calls(expected_reset_nodes_calls)
696698
else:
697-
reset_node_spy.assert_not_called()
699+
set_nodes_down_and_power_save_spy.assert_not_called()
698700
update_partitions_spy.assert_called_with(partitions_to_update, state)

0 commit comments

Comments (0)