-
Notifications
You must be signed in to change notification settings - Fork 35
[DRAFT][RAY] Add slurm ray tests #409
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
77b559f
2c352e8
0978f5e
a41de4e
a2321aa
7d13c4c
9185db9
a01f053
e408026
f13a597
7b982ad
6c5e9dd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -89,3 +89,6 @@ Thumbs.db | |
| install/ | ||
| results/ | ||
| .* | ||
|
|
||
| # conda | ||
| env/ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES | ||
| # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| name = "slurm_ray_container_vllm" | ||
| description = "Run example script with vLLM" | ||
| test_template_name = "SlurmRayContainer" | ||
|
|
||
| [cmd_args] | ||
| docker_image_url = "vllm/vllm-openai:latest" | ||
| cmd = "python3 examples/offline_inference/llm_engine_example.py -tp 8 -pp 2" | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES | ||
| # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| name = "slurm_ray_container_example" | ||
|
|
||
| [[Tests]] | ||
| id = "Tests.1" | ||
| test_name = "slurm_ray_container_vllm" | ||
| num_nodes = "2" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,3 +4,4 @@ tbparse==0.0.8 | |
| toml==0.10.2 | ||
| kubernetes==30.1.0 | ||
| pydantic==2.8.2 | ||
| jinja2==3.1.6 | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES | ||
| # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| from .report_generation_strategy import SlurmRayContainerReportGenerationStrategy | ||
| from .slurm_command_gen_strategy import SlurmRayContainerCommandGenStrategy | ||
| from .slurm_ray_container import SlurmRayContainerCmdArgs, SlurmRayContainerTestDefinition | ||
|
|
||
| __all__ = [ | ||
| "SlurmRayContainerCmdArgs", | ||
| "SlurmRayContainerCommandGenStrategy", | ||
| "SlurmRayContainerReportGenerationStrategy", | ||
| "SlurmRayContainerTestDefinition", | ||
| ] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES | ||
| # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
|
|
||
| from cloudai import ReportGenerationStrategy | ||
|
|
||
|
|
||
| class SlurmRayContainerReportGenerationStrategy(ReportGenerationStrategy): | ||
| """Report generation strategy for a generic Slurm ray container test.""" | ||
|
|
||
| def can_handle_directory(self) -> bool: | ||
| return False | ||
|
|
||
| def generate_report(self) -> None: | ||
| pass |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,78 @@ | ||
| # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES | ||
| # Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| from pathlib import Path | ||
| from typing import Any, Dict, List, Union, cast | ||
|
|
||
| from jinja2 import Template | ||
|
|
||
| from cloudai import TestRun | ||
| from cloudai.workloads.slurm_container import SlurmContainerCommandGenStrategy | ||
|
|
||
| from .slurm_ray_container import SlurmRayContainerTestDefinition | ||
|
|
||
|
|
||
| class SlurmRayContainerCommandGenStrategy(SlurmContainerCommandGenStrategy): | ||
| """Command generation strategy for generic Slurm Ray container tests.""" | ||
|
|
||
| def _get_sbatch_directives(self, args: Dict[str, Any], output_path: Path) -> Dict[str, str]: | ||
| sbatch_directives = super()._get_sbatch_directives(args, output_path) | ||
| # TODO(Amey): We probably need to figure out what to do with cpus-per-task, mem-per-cpu | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This can be set with There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Basically, I want this to be dynamic, as a fraction of the total resources. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since we have to set the tasks per worker to 1 for Ray, we need to ensure that all the resources are made available to the process. |
||
| # override tasks per node | ||
| sbatch_directives["tasks-per-node"] = "2" | ||
| sbatch_directives["exclusive"] = "" | ||
|
|
||
| return sbatch_directives | ||
|
|
||
| def _gen_srun_command( | ||
| self, | ||
| slurm_args: Dict[str, Any], | ||
| env_vars: Dict[str, str], | ||
| cmd_args: Dict[str, Union[str, List[str]]], | ||
| tr: TestRun, | ||
| ) -> str: | ||
| srun_command_parts = self.gen_srun_prefix(slurm_args, tr) | ||
| nsys_command_parts = super().gen_nsys_command(tr) | ||
| cmd_args["srun_command_prefix"] = " ".join(srun_command_parts + nsys_command_parts) | ||
| test_command_parts = self.generate_test_command(env_vars, cmd_args, tr) | ||
| return " ".join(test_command_parts) | ||
|
|
||
| def generate_test_command( | ||
| self, env_vars: dict[str, str], cmd_args: Dict[str, Union[str, List[str]]], tr: TestRun | ||
| ) -> list[str]: | ||
| tdef: SlurmRayContainerTestDefinition = cast(SlurmRayContainerTestDefinition, tr.test.test_definition) | ||
|
|
||
| command_parts: list[str] = [tdef.cmd_args.cmd] | ||
| if tr.test.extra_cmd_args: | ||
| command_parts.append(tr.test.extra_cmd_args) | ||
|
|
||
| # load the jinja template file which is placed at the same directory as this file | ||
| script_dir = Path(__file__).parent | ||
| template_path = script_dir / "slurm_ray_container_template.sh.jinja" | ||
| template = Template(template_path.read_text()) | ||
|
|
||
| conda_activate_command = f"conda activate {tdef.cmd_args.conda_env} && " if tdef.cmd_args.conda_env else "" | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please help me understand this part. Isn't env for In CloudAI we have a concept of There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Essentially, this is supposed to be an optional parameter to activate a specific environment if required. For instance, in the Vajra nightly perf test container, we have multiple envs for vLLM, Vajra, SGLang, etc. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm concerned that |
||
|
|
||
| # render the template | ||
| rendered_template = template.render( | ||
| { | ||
| "conda_activate_command": conda_activate_command, | ||
| "command": " ".join(command_parts), | ||
| "srun_command_prefix": cmd_args["srun_command_prefix"], | ||
| } | ||
| ) | ||
|
|
||
| return [rendered_template] | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For new files, please set a single year value (the diagnostic we have today is misleading).