Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,6 @@ Thumbs.db
install/
results/
.*

# conda
env/
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ These schemas enable CloudAI to be flexible and compatible with different system
|Sleep|✅|✅|✅|
|UCC|✅|❌|❌|
|SlurmContainer|✅|❌|❌|
|SlurmRayContainer|✅|❌|❌|
|MegatronRun (experimental)|✅|❌|❌|


Expand Down
23 changes: 23 additions & 0 deletions conf/common/test/slurm_ray_container_vllm.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For new files please set a single year value (diagnostic we have today is misleading)

# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "slurm_ray_container_vllm"
description = "Run example script with vLLM"
test_template_name = "SlurmRayContainer"

[cmd_args]
docker_image_url = "vllm/vllm-openai:latest"
cmd = "python3 examples/offline_inference/llm_engine_example.py -tp 8 -pp 2"
2 changes: 1 addition & 1 deletion conf/common/test_scenario/sleep.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
22 changes: 22 additions & 0 deletions conf/common/test_scenario/slurm_ray_container.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "slurm_ray_container_example"

[[Tests]]
id = "Tests.1"
test_name = "slurm_ray_container_vllm"
num_nodes = "2"
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dependencies = [
"toml==0.10.2",
"kubernetes==30.1.0",
"pydantic==2.8.2",
"jinja2==3.1.6",
]
[project.scripts]
cloudai = "cloudai.__main__:main"
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ tbparse==0.0.8
toml==0.10.2
kubernetes==30.1.0
pydantic==2.8.2
jinja2==3.1.6
7 changes: 7 additions & 0 deletions src/cloudai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@
SleepTestDefinition,
)
from .workloads.slurm_container import SlurmContainerCommandGenStrategy, SlurmContainerTestDefinition
from .workloads.slurm_ray_container import SlurmRayContainerCommandGenStrategy, SlurmRayContainerTestDefinition
from .workloads.ucc_test import (
UCCTestDefinition,
UCCTestGradingStrategy,
Expand Down Expand Up @@ -156,6 +157,7 @@
SleepTestDefinition,
NeMoRunTestDefinition,
SlurmContainerTestDefinition,
SlurmRayContainerTestDefinition,
MegatronRunTestDefinition,
],
SlurmJobIdRetrievalStrategy,
Expand Down Expand Up @@ -191,6 +193,7 @@
SleepTestDefinition,
NeMoRunTestDefinition,
SlurmContainerTestDefinition,
SlurmRayContainerTestDefinition,
MegatronRunTestDefinition,
],
DefaultJobStatusRetrievalStrategy,
Expand All @@ -207,6 +210,9 @@
Registry().add_strategy(
CommandGenStrategy, [SlurmSystem], [SlurmContainerTestDefinition], SlurmContainerCommandGenStrategy
)
Registry().add_strategy(
CommandGenStrategy, [SlurmSystem], [SlurmRayContainerTestDefinition], SlurmRayContainerCommandGenStrategy
)

Registry().add_installer("slurm", SlurmInstaller)
Registry().add_installer("standalone", StandaloneInstaller)
Expand All @@ -226,6 +232,7 @@
Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition)
Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition)
Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition)
Registry().add_test_definition("SlurmRayContainer", SlurmRayContainerTestDefinition)
Registry().add_test_definition("MegatronRun", MegatronRunTestDefinition)

Registry().add_agent("grid_search", GridSearchAgent)
Expand Down
5 changes: 5 additions & 0 deletions src/cloudai/_core/test_scenario_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@
from cloudai.workloads.nemo_run import NeMoRunReportGenerationStrategy, NeMoRunTestDefinition
from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition
from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition
from cloudai.workloads.slurm_ray_container import (
SlurmRayContainerReportGenerationStrategy,
SlurmRayContainerTestDefinition,
)
from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy

from .exceptions import TestScenarioParsingError, format_validation_error
Expand All @@ -54,6 +58,7 @@
NemotronTestDefinition: {JaxToolboxReportGenerationStrategy},
SleepTestDefinition: {SleepReportGenerationStrategy},
SlurmContainerTestDefinition: {SlurmContainerReportGenerationStrategy},
SlurmRayContainerTestDefinition: {SlurmRayContainerReportGenerationStrategy},
UCCTestDefinition: {UCCTestReportGenerationStrategy},
}

Expand Down
58 changes: 41 additions & 17 deletions src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,37 +310,61 @@ def _write_sbatch_script(

return f"sbatch {batch_script_path}"

def _append_sbatch_directives(
self, batch_script_content: List[str], args: Dict[str, Any], output_path: Path
) -> None:
def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
    """
    Get the Slurm batch script directives.

    Every key is the literal sbatch long-option name (without the leading ``--``), because
    ``_append_sbatch_directives`` renders each entry verbatim as ``#SBATCH --<key>=<value>``.
    Keys therefore use dashes, not underscores (``gpus-per-node``, ``ntasks-per-node``), and the
    time limit is stored under ``time``, the option name the pre-refactor code emitted.

    Args:
        args (Dict[str, Any]): Slurm-specific arguments.
        output_path (Path): Output directory for script and logs.

    Returns:
        Dict[str, str]: Dictionary of Slurm batch script directives.
    """
    sbatch_directives: Dict[str, str] = {}

    # Only supply default log locations when the caller did not pick its own.
    if "output" not in args:
        sbatch_directives["output"] = str(output_path / "stdout.txt")
    if "error" not in args:
        sbatch_directives["error"] = str(output_path / "stderr.txt")

    sbatch_directives["partition"] = self.system.default_partition

    if args.get("node_list_str"):
        sbatch_directives["nodelist"] = args["node_list_str"]
    if self.system.account:
        sbatch_directives["account"] = self.system.account
    if self.system.distribution:
        sbatch_directives["distribution"] = self.system.distribution
    if self.system.gpus_per_node:
        # Both spellings of the GPU request are emitted, as before the refactor.
        sbatch_directives["gpus-per-node"] = str(self.system.gpus_per_node)
        sbatch_directives["gres"] = f"gpu:{self.system.gpus_per_node}"
    if self.system.ntasks_per_node:
        sbatch_directives["ntasks-per-node"] = str(self.system.ntasks_per_node)
    if "time_limit" in args:
        # The args key is ``time_limit`` but the sbatch option is ``--time``.
        sbatch_directives["time"] = args["time_limit"]

    return sbatch_directives

def _append_sbatch_directives(
self, batch_script_content: List[str], args: Dict[str, Any], output_path: Path
) -> None:
"""
Append SBATCH directives to the batch script content.

Args:
batch_script_content (List[str]): The list of script lines to append to.
args (Dict[str, Any]): Arguments including job settings.
output_path (Path): Output directory for script and logs.
"""
batch_script_content = self._add_reservation(batch_script_content)
sbatch_directives = self._get_sbatch_directives(args, output_path)

for key, value in sbatch_directives.items():
if value:
batch_script_content.append(f"#SBATCH --{key}={value}")
else:
batch_script_content.append(f"#SBATCH --{key}")

for arg in self.system.extra_sbatch_args:
batch_script_content.append(f"#SBATCH {arg}")
Expand Down
26 changes: 26 additions & 0 deletions src/cloudai/workloads/slurm_ray_container/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .report_generation_strategy import SlurmRayContainerReportGenerationStrategy
from .slurm_command_gen_strategy import SlurmRayContainerCommandGenStrategy
from .slurm_ray_container import SlurmRayContainerCmdArgs, SlurmRayContainerTestDefinition

__all__ = [
"SlurmRayContainerCmdArgs",
"SlurmRayContainerCommandGenStrategy",
"SlurmRayContainerReportGenerationStrategy",
"SlurmRayContainerTestDefinition",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from cloudai import ReportGenerationStrategy


class SlurmRayContainerReportGenerationStrategy(ReportGenerationStrategy):
    """Placeholder report strategy for Slurm Ray container tests; it never produces a report."""

    def can_handle_directory(self) -> bool:
        """Always decline: this strategy does not claim any output directory."""
        handles_directory = False
        return handles_directory

    def generate_report(self) -> None:
        """Do nothing; no report is generated for this workload."""
        return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Any, Dict, List, Union, cast

from jinja2 import Template

from cloudai import TestRun
from cloudai.workloads.slurm_container import SlurmContainerCommandGenStrategy

from .slurm_ray_container import SlurmRayContainerTestDefinition


class SlurmRayContainerCommandGenStrategy(SlurmContainerCommandGenStrategy):
    """Command generation strategy for Ray-based Slurm container tests."""

    def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
        """
        Get the Slurm batch script directives, adjusted for running Ray.

        Starts from the parent strategy's directives, replaces the tasks-per-node setting and
        requests exclusive node access.

        Args:
            args (Dict[str, Any]): Slurm-specific arguments.
            output_path (Path): Output directory for script and logs.

        Returns:
            Dict[str, str]: Dictionary of Slurm batch script directives.
        """
        sbatch_directives = super()._get_sbatch_directives(args, output_path)

        # TODO(Amey): We probably need to figure out what to do with cpus-per-task, mem-per-cpu.
        # NOTE(review): these could be set through SlurmSystem.extra_sbatch_args, but that is a
        # per-system setting and would apply to every test in a scenario; the intent here is to
        # derive them dynamically as a fraction of the total resources.

        # Override tasks per node. Remove whatever the parent set (under any spelling) first, so the
        # script does not end up with two conflicting tasks-per-node directives.
        for inherited_key in ("ntasks_per_node", "ntasks-per-node", "tasks-per-node"):
            sbatch_directives.pop(inherited_key, None)
        sbatch_directives["tasks-per-node"] = "2"

        # NOTE(review): per the PR discussion, the Ray process must have all of the node's
        # resources available to it. An empty value is meant to be rendered as a bare flag
        # (``#SBATCH --exclusive``) by the directive writer.
        sbatch_directives["exclusive"] = ""

        return sbatch_directives

    def _gen_srun_command(
        self,
        slurm_args: Dict[str, Any],
        env_vars: Dict[str, str],
        cmd_args: Dict[str, Union[str, List[str]]],
        tr: TestRun,
    ) -> str:
        """
        Build the full command text for the batch script.

        The srun prefix (plus any nsys wrapper) is not prepended here; it is handed to
        ``generate_test_command`` under the ``srun_command_prefix`` key so the template can place it.

        Args:
            slurm_args (Dict[str, Any]): Slurm-specific arguments used to build the srun prefix.
            env_vars (Dict[str, str]): Environment variables for the test.
            cmd_args (Dict[str, Union[str, List[str]]]): Command-line arguments; not modified.
            tr (TestRun): Test run being generated.

        Returns:
            str: The rendered command text.
        """
        srun_command_parts = self.gen_srun_prefix(slurm_args, tr)
        nsys_command_parts = super().gen_nsys_command(tr)
        # Work on a copy so the caller's cmd_args dict is not polluted with an internal key.
        templated_args: Dict[str, Union[str, List[str]]] = {
            **cmd_args,
            "srun_command_prefix": " ".join(srun_command_parts + nsys_command_parts),
        }
        return " ".join(self.generate_test_command(env_vars, templated_args, tr))

    def generate_test_command(
        self, env_vars: dict[str, str], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun
    ) -> list[str]:
        """
        Render the Ray launch script from the Jinja template shipped next to this module.

        Args:
            env_vars (dict[str, str]): Environment variables for the test (unused here).
            cmd_args (Dict[str, Union[str, List[str]]]): Command-line arguments; the optional
                ``srun_command_prefix`` entry is passed to the template (empty string if absent).
            tr (TestRun): Test run whose test definition supplies the command and conda env.

        Returns:
            list[str]: A single-element list holding the rendered template.
        """
        tdef: SlurmRayContainerTestDefinition = cast(SlurmRayContainerTestDefinition, tr.test.test_definition)

        command_parts: list[str] = [tdef.cmd_args.cmd]
        if tr.test.extra_cmd_args:
            command_parts.append(tr.test.extra_cmd_args)

        # The template file lives in the same directory as this module.
        template_path = Path(__file__).parent / "slurm_ray_container_template.sh.jinja"
        template = Template(template_path.read_text())

        # NOTE(review): conda_env is optional. Per the PR discussion it exists for containers that
        # ship several environments (e.g. vllm, vajra, sglang) and need one activated explicitly.
        conda_env = tdef.cmd_args.conda_env
        conda_activate_command = f"conda activate {conda_env} && " if conda_env else ""

        rendered_template = template.render(
            {
                "conda_activate_command": conda_activate_command,
                "command": " ".join(command_parts),
                # Tolerate direct calls that did not go through _gen_srun_command.
                "srun_command_prefix": cmd_args.get("srun_command_prefix", ""),
            }
        )

        return [rendered_template]
Loading