NVIDIA · AgrawalAmey · Mar 12, 2025 · Mar 12, 2025 · Mar 12, 2025 · Mar 12, 2025
@@ -89,3 +89,6 @@ Thumbs.db
 install/
 results/
 .*
+
+# conda
+env/
@@ -28,6 +28,7 @@ These schemas enable CloudAI to be flexible and compatible with different system
 |Sleep|✅|✅|✅|
 |UCC|✅|❌|❌|
 |SlurmContainer|✅|❌|❌|
+|SlurmRayContainer|✅|❌|❌|
 |MegatronRun (experimental)|✅|❌|❌|
 
 

@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "slurm_ray_container_vllm"
+description = "Run example script with vLLM"
+test_template_name = "SlurmRayContainer"
+
+[cmd_args]
+docker_image_url = "vllm/vllm-openai:latest"
+cmd = "python3 examples/offline_inference/llm_engine_example.py -tp 8 -pp 2"
@@ -1,5 +1,5 @@
 # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "slurm_ray_container_example"
+
+[[Tests]]
+id = "Tests.1"
+test_name = "slurm_ray_container_vllm"
+num_nodes = "2"
@@ -24,6 +24,7 @@ dependencies = [
   "toml==0.10.2",
   "kubernetes==30.1.0",
   "pydantic==2.8.2",
+  "jinja2==3.1.6",
 ]
   [project.scripts]
   cloudai = "cloudai.__main__:main"

@@ -4,3 +4,4 @@ tbparse==0.0.8
 toml==0.10.2
 kubernetes==30.1.0
 pydantic==2.8.2
+jinja2==3.1.6
@@ -97,6 +97,7 @@
     SleepTestDefinition,
 )
 from .workloads.slurm_container import SlurmContainerCommandGenStrategy, SlurmContainerTestDefinition
+from .workloads.slurm_ray_container import SlurmRayContainerCommandGenStrategy, SlurmRayContainerTestDefinition
 from .workloads.ucc_test import (
     UCCTestDefinition,
     UCCTestGradingStrategy,
@@ -156,6 +157,7 @@
         SleepTestDefinition,
         NeMoRunTestDefinition,
         SlurmContainerTestDefinition,
+        SlurmRayContainerTestDefinition,
         MegatronRunTestDefinition,
     ],
     SlurmJobIdRetrievalStrategy,
@@ -191,6 +193,7 @@
         SleepTestDefinition,
         NeMoRunTestDefinition,
         SlurmContainerTestDefinition,
+        SlurmRayContainerTestDefinition,
         MegatronRunTestDefinition,
     ],
     DefaultJobStatusRetrievalStrategy,
@@ -207,6 +210,9 @@
 Registry().add_strategy(
     CommandGenStrategy, [SlurmSystem], [SlurmContainerTestDefinition], SlurmContainerCommandGenStrategy
 )
+Registry().add_strategy(
+    CommandGenStrategy, [SlurmSystem], [SlurmRayContainerTestDefinition], SlurmRayContainerCommandGenStrategy
+)
 
 Registry().add_installer("slurm", SlurmInstaller)
 Registry().add_installer("standalone", StandaloneInstaller)
@@ -226,6 +232,7 @@
 Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition)
 Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition)
 Registry().add_test_definition("SlurmContainer", SlurmContainerTestDefinition)
+Registry().add_test_definition("SlurmRayContainer", SlurmRayContainerTestDefinition)
 Registry().add_test_definition("MegatronRun", MegatronRunTestDefinition)
 
 Registry().add_agent("grid_search", GridSearchAgent)

@@ -36,6 +36,10 @@
 from cloudai.workloads.nemo_run import NeMoRunReportGenerationStrategy, NeMoRunTestDefinition
 from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition
 from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition
+from cloudai.workloads.slurm_ray_container import (
+    SlurmRayContainerReportGenerationStrategy,
+    SlurmRayContainerTestDefinition,
+)
 from cloudai.workloads.ucc_test import UCCTestDefinition, UCCTestReportGenerationStrategy
 
 from .exceptions import TestScenarioParsingError, format_validation_error
@@ -54,6 +58,7 @@
     NemotronTestDefinition: {JaxToolboxReportGenerationStrategy},
     SleepTestDefinition: {SleepReportGenerationStrategy},
     SlurmContainerTestDefinition: {SlurmContainerReportGenerationStrategy},
+    SlurmRayContainerTestDefinition: {SlurmRayContainerReportGenerationStrategy},
     UCCTestDefinition: {UCCTestReportGenerationStrategy},
 }
 

@@ -310,37 +310,61 @@ def _write_sbatch_script(
 
         return f"sbatch {batch_script_path}"
 
-    def _append_sbatch_directives(
-        self, batch_script_content: List[str], args: Dict[str, Any], output_path: Path
-    ) -> None:
+    def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
         """
-        Append SBATCH directives to the batch script content.
+        Build the SBATCH directives as a mapping of sbatch option name to value.
 
         Args:
-            batch_script_content (List[str]): The list of script lines to append to.
-            args (Dict[str, Any]): Arguments including job settings.
+            args (Dict[str, Any]): Job settings; "output", "error", "node_list_str", "time_limit" are read.
             output_path (Path): Output directory for script and logs.
+
+        Returns:
+            Dict[str, str]: sbatch long-option names (spelled as they follow "--") mapped to their values.
         """
-        batch_script_content = self._add_reservation(batch_script_content)
+        sbatch_directives: Dict[str, str] = {}  # keys are written verbatim after "--", so they use hyphens
 
         if "output" not in args:
-            batch_script_content.append(f"#SBATCH --output={output_path / 'stdout.txt'}")
+            sbatch_directives["output"] = f"{output_path / 'stdout.txt'}"
         if "error" not in args:
-            batch_script_content.append(f"#SBATCH --error={output_path / 'stderr.txt'}")
-        batch_script_content.append(f"#SBATCH --partition={self.system.default_partition}")
+            sbatch_directives["error"] = f"{output_path / 'stderr.txt'}"
+
+        sbatch_directives["partition"] = self.system.default_partition
+
         if args["node_list_str"]:
-            batch_script_content.append(f"#SBATCH --nodelist={args['node_list_str']}")
+            sbatch_directives["nodelist"] = args["node_list_str"]
         if self.system.account:
-            batch_script_content.append(f"#SBATCH --account={self.system.account}")
+            sbatch_directives["account"] = self.system.account
         if self.system.distribution:
-            batch_script_content.append(f"#SBATCH --distribution={self.system.distribution}")
+            sbatch_directives["distribution"] = self.system.distribution
         if self.system.gpus_per_node:
-            batch_script_content.append(f"#SBATCH --gpus-per-node={self.system.gpus_per_node}")
-            batch_script_content.append(f"#SBATCH --gres=gpu:{self.system.gpus_per_node}")
+            sbatch_directives["gpus-per-node"] = str(self.system.gpus_per_node)
+            sbatch_directives["gres"] = f"gpu:{self.system.gpus_per_node}"
         if self.system.ntasks_per_node:
-            batch_script_content.append(f"#SBATCH --ntasks-per-node={self.system.ntasks_per_node}")
+            sbatch_directives["ntasks-per-node"] = str(self.system.ntasks_per_node)
         if "time_limit" in args:
-            batch_script_content.append(f"#SBATCH --time={args['time_limit']}")
+            sbatch_directives["time"] = args["time_limit"]  # the sbatch option is --time, not --time_limit
+
+        return sbatch_directives
+
+    def _append_sbatch_directives(
+        self, batch_script_content: List[str], args: Dict[str, Any], output_path: Path
+    ) -> None:
+        """
+        Append SBATCH directives to the batch script content.
+
+        Args:
+            batch_script_content (List[str]): The list of script lines to append to.
+            args (Dict[str, Any]): Arguments including job settings.
+            output_path (Path): Output directory for script and logs.
+        """
+        batch_script_content = self._add_reservation(batch_script_content)
+        sbatch_directives = self._get_sbatch_directives(args, output_path)
+
+        for key, value in sbatch_directives.items():
+            if value:
+                batch_script_content.append(f"#SBATCH --{key}={value}")
+            else:
+                batch_script_content.append(f"#SBATCH --{key}")
 
         for arg in self.system.extra_sbatch_args:
             batch_script_content.append(f"#SBATCH {arg}")

@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .report_generation_strategy import SlurmRayContainerReportGenerationStrategy
+from .slurm_command_gen_strategy import SlurmRayContainerCommandGenStrategy
+from .slurm_ray_container import SlurmRayContainerCmdArgs, SlurmRayContainerTestDefinition
+
+__all__ = [
+    "SlurmRayContainerCmdArgs",
+    "SlurmRayContainerCommandGenStrategy",
+    "SlurmRayContainerReportGenerationStrategy",
+    "SlurmRayContainerTestDefinition",
+]
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from cloudai import ReportGenerationStrategy
+
+
+class SlurmRayContainerReportGenerationStrategy(ReportGenerationStrategy):
+    """Placeholder report generation strategy for Slurm Ray container tests; it produces no report."""
+
+    def can_handle_directory(self) -> bool:
+        """Return False unconditionally."""
+        return False
+
+    def generate_report(self) -> None:
+        """Do nothing."""
+        return None
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Any, Dict, List, Union, cast
+
+from jinja2 import Template
+
+from cloudai import TestRun
+from cloudai.workloads.slurm_container import SlurmContainerCommandGenStrategy
+
+from .slurm_ray_container import SlurmRayContainerTestDefinition
+
+
+class SlurmRayContainerCommandGenStrategy(SlurmContainerCommandGenStrategy):
+    """Command generation strategy for Ray workloads run inside Slurm containers."""
+
+    def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]:
+        """
+        Extend the inherited SBATCH directives with the settings this workload forces.
+
+        Args:
+            args (Dict[str, Any]): Slurm-specific arguments, forwarded to the parent implementation.
+            output_path (Path): Output directory, forwarded to the parent implementation.
+
+        Returns:
+            Dict[str, str]: The parent's directives with tasks per node set to "2" and "exclusive" added.
+        """
+        sbatch_directives = super()._get_sbatch_directives(args, output_path)
+        # TODO(Amey): We probably need to figure out what to do with cpus-per-task, mem-per-cpu
+        # Override tasks per node: discard any entry inherited under another spelling so the script
+        # carries exactly one value, keyed by sbatch's documented option name.
+        sbatch_directives.pop("ntasks_per_node", None)
+        sbatch_directives.pop("tasks-per-node", None)
+        sbatch_directives["ntasks-per-node"] = "2"
+        # An empty value marks a flag-style directive, i.e. one written without "=value".
+        sbatch_directives["exclusive"] = ""
+
+        return sbatch_directives
+
+    def _gen_srun_command(
+        self,
+        slurm_args: Dict[str, Any],
+        env_vars: Dict[str, str],
+        cmd_args: Dict[str, Union[str, List[str]]],
+        tr: TestRun,
+    ) -> str:
+        """
+        Build the command text by rendering the job template around the srun/nsys prefix.
+
+        Args:
+            slurm_args (Dict[str, Any]): Slurm arguments used to build the srun prefix.
+            env_vars (Dict[str, str]): Environment variables, forwarded to ``generate_test_command``.
+            cmd_args (Dict[str, Union[str, List[str]]]): Command arguments. This dictionary is modified:
+                the joined prefix is stored under "srun_command_prefix".
+            tr (TestRun): The test run being generated.
+
+        Returns:
+            str: The parts returned by ``generate_test_command`` joined with single spaces.
+        """
+        srun_command_parts = self.gen_srun_prefix(slurm_args, tr)
+        # Dispatch through self (like gen_srun_prefix above) so a subclass override is honoured.
+        nsys_command_parts = self.gen_nsys_command(tr)
+        # Hand the prefix to generate_test_command through cmd_args; its signature has no slot for it.
+        cmd_args["srun_command_prefix"] = " ".join(srun_command_parts + nsys_command_parts)
+        test_command_parts = self.generate_test_command(env_vars, cmd_args, tr)
+        return " ".join(test_command_parts)
+
+    def generate_test_command(
+        self, env_vars: dict[str, str], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun
+    ) -> list[str]:
+        """
+        Render the job template for the test's command.
+
+        Args:
+            env_vars (dict[str, str]): Environment variables (not used by this implementation).
+            cmd_args (Dict[str, Union[str, List[str]]]): Must contain "srun_command_prefix",
+                which ``_gen_srun_command`` sets before calling this method.
+            tr (TestRun): The test run; its test definition supplies ``cmd`` and ``conda_env``.
+
+        Returns:
+            list[str]: A single-element list holding the rendered template text.
+
+        Raises:
+            KeyError: If "srun_command_prefix" is missing from ``cmd_args``.
+        """
+        tdef: SlurmRayContainerTestDefinition = cast(SlurmRayContainerTestDefinition, tr.test.test_definition)
+
+        command_parts: list[str] = [tdef.cmd_args.cmd]
+        if tr.test.extra_cmd_args:
+            command_parts.append(tr.test.extra_cmd_args)
+
+        # Load the Jinja template that sits in the same directory as this file.
+        script_dir = Path(__file__).parent
+        template_path = script_dir / "slurm_ray_container_template.sh.jinja"
+        template = Template(template_path.read_text())
+
+        # Only emit a "conda activate" step when the test definition names an environment.
+        conda_activate_command = f"conda activate {tdef.cmd_args.conda_env} && " if tdef.cmd_args.conda_env else ""
+
+        # Render the template with the three variables passed to it here.
+        rendered_template = template.render(
+            {
+                "conda_activate_command": conda_activate_command,
+                "command": " ".join(command_parts),
+                "srun_command_prefix": cmd_args["srun_command_prefix"],
+            }
+        )
+
+        return [rendered_template]
-Original file line number
+Diff line change
@@ Expand Up / @@ -89,3 +89,6 @@ Thumbs.db @@
     install/
     results/
     .*
+    # conda
+    env/