From 8ac4b6c2311971810eac91da2a559c0ebc9e2c92 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Fri, 17 Oct 2025 00:23:29 +0300 Subject: [PATCH 01/15] Add DDLB workload --- conf/common/test/ddlb_test.toml | 22 ++++ conf/common/test_scenario/ddlb_test.toml | 26 +++++ src/cloudai/registration.py | 6 + src/cloudai/workloads/ddlb/__init__.py | 24 ++++ src/cloudai/workloads/ddlb/ddlb.py | 106 ++++++++++++++++++ .../ddlb/slurm_command_gen_strategy.py | 41 +++++++ 6 files changed, 225 insertions(+) create mode 100644 conf/common/test/ddlb_test.toml create mode 100644 conf/common/test_scenario/ddlb_test.toml create mode 100644 src/cloudai/workloads/ddlb/__init__.py create mode 100644 src/cloudai/workloads/ddlb/ddlb.py create mode 100644 src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py diff --git a/conf/common/test/ddlb_test.toml b/conf/common/test/ddlb_test.toml new file mode 100644 index 00000000..bb117b99 --- /dev/null +++ b/conf/common/test/ddlb_test.toml @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "ddlb_test" +description = "DDLB test configuration" +test_template_name = "DDLBTest" + +[cmd_args] +docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest" diff --git a/conf/common/test_scenario/ddlb_test.toml b/conf/common/test_scenario/ddlb_test.toml new file mode 100644 index 00000000..5945fc99 --- /dev/null +++ b/conf/common/test_scenario/ddlb_test.toml @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "ddlb-test" + +#pre_test = "ddlb_test" +#post_test = "ddlb_test" + +[[Tests]] +id = "Tests.ddlb" +test_name = "ddlb_test" +num_nodes = 1 +time_limit = "00:10:00" diff --git a/src/cloudai/registration.py b/src/cloudai/registration.py index 288226de..579e292e 100644 --- a/src/cloudai/registration.py +++ b/src/cloudai/registration.py @@ -92,6 +92,10 @@ def register_all(): NcclTestRunAIJsonGenStrategy, NcclTestSlurmCommandGenStrategy, ) + from cloudai.workloads.ddlb import ( + DDLBTestDefinition, + DDLBTestSlurmCommandGenStrategy, + ) from cloudai.workloads.nemo_launcher import ( NeMoLauncherGradingStrategy, NeMoLauncherReportGenerationStrategy, @@ -155,6 +159,7 @@ def register_all(): Registry().add_command_gen_strategy(SlurmSystem, MegatronRunTestDefinition, MegatronRunSlurmCommandGenStrategy) Registry().add_command_gen_strategy(SlurmSystem, NCCLTestDefinition, NcclTestSlurmCommandGenStrategy) + Registry().add_command_gen_strategy(SlurmSystem, DDLBTestDefinition, DDLBTestSlurmCommandGenStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [SleepTestDefinition], SleepGradingStrategy) Registry().add_command_gen_strategy(SlurmSystem, NeMoLauncherTestDefinition, NeMoLauncherSlurmCommandGenStrategy) @@ -204,6 +209,7 @@ def register_all(): Registry().add_test_definition("UCCTest", UCCTestDefinition) Registry().add_test_definition("NcclTest", NCCLTestDefinition) + Registry().add_test_definition("DDLBTest", DDLBTestDefinition) Registry().add_test_definition("ChakraReplay", ChakraReplayTestDefinition) Registry().add_test_definition("Sleep", SleepTestDefinition) Registry().add_test_definition("NeMoLauncher", NeMoLauncherTestDefinition) diff --git a/src/cloudai/workloads/ddlb/__init__.py b/src/cloudai/workloads/ddlb/__init__.py new file mode 100644 index 00000000..f93a56a8 --- /dev/null +++ b/src/cloudai/workloads/ddlb/__init__.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .ddlb import DDLBCmdArgs, DDLBTestDefinition +from .slurm_command_gen_strategy import DDLBTestSlurmCommandGenStrategy + +__all__ = [ + "DDLBCmdArgs", + "DDLBTestDefinition", + "DDLBTestSlurmCommandGenStrategy", +] diff --git a/src/cloudai/workloads/ddlb/ddlb.py b/src/cloudai/workloads/ddlb/ddlb.py new file mode 100644 index 00000000..cd722d75 --- /dev/null +++ b/src/cloudai/workloads/ddlb/ddlb.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Literal, Optional, Union + +from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun +from cloudai.models.workload import CmdArgs, TestDefinition + + +class DDLBCmdArgs(CmdArgs): + """DDLB test command arguments.""" + + docker_image_url: str + +class DDLBTestDefinition(TestDefinition): + """Test object for DDLB.""" + + cmd_args: DDLBCmdArgs + _docker_image: Optional[DockerImage] = None + + @property + def extra_args_str(self) -> str: + parts = [] + for k, v in self.extra_cmd_args.items(): + parts.append(f"{k} {v}" if v else k) + return " ".join(parts) + + @property + def docker_image(self) -> DockerImage: + if not self._docker_image: + self._docker_image = DockerImage(url=self.cmd_args.docker_image_url) + return self._docker_image + + @property + def installables(self) -> list[Installable]: + return [self.docker_image] + + def was_run_successful(self, tr: TestRun) -> JobStatusResult: + stdout_path = tr.output_path / "stdout.txt" + if stdout_path.is_file(): + with stdout_path.open("r") as file: + content = file.read() + + # Check for specific error patterns + if "Error" in content: + return JobStatusResult( + is_successful=False, + error_message=( + f"DDLB test failure detected in {stdout_path}. " + "Possible reasons include network errors or remote process exits. " + "Please review the DDLB test output and errors in the file first. " + "If the issue persists, contact the system administrator." + ), + ) + if "Error" in content: + return JobStatusResult( + is_successful=False, + error_message=( + f"Test failure detected in {stdout_path}. " + "Please review the specific test failure messages in the file. " + "Ensure that the DDLB test environment is correctly set up and configured. " + "If the issue persists, contact the system administrator." 
+ ), + ) + + # Identify missing success indicators + missing_indicators = [] + if "Benchmark Results" not in content: + missing_indicators.append("'Benchmark Results'") + + error_message = ( + f"Missing success indicators in {stdout_path}: {', '.join(missing_indicators)}. " + "These keywords are expected to be present in stdout.txt, usually towards the end of the file. " + "Please review the DDLB test output and errors in the file. " + "Ensure the DDLB test ran to completion. You can run the generated sbatch script manually " + f"and check if {stdout_path} is created and contains the expected keywords. " + "If the issue persists, contact the system administrator." + ) + + return JobStatusResult(is_successful=False, error_message=error_message) + + return JobStatusResult(is_successful=True) + + return JobStatusResult( + is_successful=False, + error_message=( + f"stdout.txt file not found in the specified output directory {tr.output_path}. " + "This file is expected to be created as a result of the DDLB test run. " + "Please ensure the DDLB test was executed properly and that stdout.txt is generated. " + f"You can run the generated DDLB test command manually and verify the creation of {stdout_path}. " + "If the issue persists, contact the system administrator." + ), + ) diff --git a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py new file mode 100644 index 00000000..c6dfc57b --- /dev/null +++ b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, cast + +from cloudai.systems.slurm import SlurmCommandGenStrategy + +from .ddlb import DDLBTestDefinition + + +class DDLBTestSlurmCommandGenStrategy(SlurmCommandGenStrategy): + """Command generation strategy for DDLB tests on Slurm systems.""" + + def _container_mounts(self) -> List[str]: + return [] + + def image_path(self) -> str | None: + tdef: DDLBTestDefinition = cast(DDLBTestDefinition, self.test_run.test.test_definition) + return str(tdef.docker_image.installed_path) + + def generate_test_command(self) -> List[str]: + tdef: DDLBTestDefinition = cast(DDLBTestDefinition, self.test_run.test.test_definition) + srun_command_parts = ["python scripts/run_benchmark.py"] + return srun_command_parts + + def gen_srun_success_check(self) -> str: + output_file = self.test_run.output_path / "stdout.txt" + return f'grep -q "Benchmark Results" {output_file} && echo 1 || echo 0' From ef4256cc8ee51b058afbc400f8daafc39a313100 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Mon, 27 Oct 2025 23:00:07 +0200 Subject: [PATCH 02/15] Update copyright, remove comments --- conf/common/test_scenario/ddlb_test.toml | 5 +---- src/cloudai/workloads/ddlb/__init__.py | 2 +- src/cloudai/workloads/ddlb/ddlb.py | 2 +- src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/conf/common/test_scenario/ddlb_test.toml b/conf/common/test_scenario/ddlb_test.toml index 5945fc99..959a40e1 100644 --- a/conf/common/test_scenario/ddlb_test.toml +++ b/conf/common/test_scenario/ddlb_test.toml @@ 
-16,11 +16,8 @@ name = "ddlb-test" -#pre_test = "ddlb_test" -#post_test = "ddlb_test" - [[Tests]] id = "Tests.ddlb" test_name = "ddlb_test" num_nodes = 1 -time_limit = "00:10:00" +time_limit = "00:30:00" diff --git a/src/cloudai/workloads/ddlb/__init__.py b/src/cloudai/workloads/ddlb/__init__.py index f93a56a8..12bdebbc 100644 --- a/src/cloudai/workloads/ddlb/__init__.py +++ b/src/cloudai/workloads/ddlb/__init__.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/cloudai/workloads/ddlb/ddlb.py b/src/cloudai/workloads/ddlb/ddlb.py index cd722d75..56080076 100644 --- a/src/cloudai/workloads/ddlb/ddlb.py +++ b/src/cloudai/workloads/ddlb/ddlb.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py index c6dfc57b..b325d7dc 100644 --- a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From 2bf56e4047734c0b86b2e13920c2e9ff6b971f41 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Mon, 27 Oct 2025 23:06:21 +0200 Subject: [PATCH 03/15] Greptile feedback --- src/cloudai/workloads/ddlb/ddlb.py | 16 ++-------------- .../workloads/ddlb/slurm_command_gen_strategy.py | 3 +-- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/src/cloudai/workloads/ddlb/ddlb.py b/src/cloudai/workloads/ddlb/ddlb.py index 56080076..00fd70ef 100644 --- a/src/cloudai/workloads/ddlb/ddlb.py +++ b/src/cloudai/workloads/ddlb/ddlb.py @@ -25,6 +25,7 @@ class DDLBCmdArgs(CmdArgs): docker_image_url: str + class DDLBTestDefinition(TestDefinition): """Test object for DDLB.""" @@ -65,24 +66,11 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: "If the issue persists, contact the system administrator." ), ) - if "Error" in content: - return JobStatusResult( - is_successful=False, - error_message=( - f"Test failure detected in {stdout_path}. " - "Please review the specific test failure messages in the file. " - "Ensure that the DDLB test environment is correctly set up and configured. " - "If the issue persists, contact the system administrator." - ), - ) # Identify missing success indicators - missing_indicators = [] if "Benchmark Results" not in content: - missing_indicators.append("'Benchmark Results'") - error_message = ( - f"Missing success indicators in {stdout_path}: {', '.join(missing_indicators)}. " + f"Missing success indicators in {stdout_path}: 'Benchmark Results'. " "These keywords are expected to be present in stdout.txt, usually towards the end of the file. " "Please review the DDLB test output and errors in the file. " "Ensure the DDLB test ran to completion. 
You can run the generated sbatch script manually " diff --git a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py index b325d7dc..a07e9415 100644 --- a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py @@ -33,8 +33,7 @@ def image_path(self) -> str | None: def generate_test_command(self) -> List[str]: tdef: DDLBTestDefinition = cast(DDLBTestDefinition, self.test_run.test.test_definition) - srun_command_parts = ["python scripts/run_benchmark.py"] - return srun_command_parts + return ["python scripts/run_benchmark.py"] def gen_srun_success_check(self) -> str: output_file = self.test_run.output_path / "stdout.txt" From eda5d0e7930445be9a363145714ae328ba059a85 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Fri, 31 Oct 2025 19:41:14 +0200 Subject: [PATCH 04/15] Add mpirun --- conf/common/test/ddlb_test.toml | 4 +++- src/cloudai/workloads/ddlb/ddlb.py | 1 + src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/conf/common/test/ddlb_test.toml b/conf/common/test/ddlb_test.toml index bb117b99..a26073cb 100644 --- a/conf/common/test/ddlb_test.toml +++ b/conf/common/test/ddlb_test.toml @@ -19,4 +19,6 @@ description = "DDLB test configuration" test_template_name = "DDLBTest" [cmd_args] -docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest" +docker_image_url = "/mnt/lustre/gaia/nsarkauskas/gaia/Cloudai/nsarkauskas+ddlb.sqsh" +# Number of MPI ranks passed to mpirun -np +np = 8 diff --git a/src/cloudai/workloads/ddlb/ddlb.py b/src/cloudai/workloads/ddlb/ddlb.py index 00fd70ef..012cacb9 100644 --- a/src/cloudai/workloads/ddlb/ddlb.py +++ b/src/cloudai/workloads/ddlb/ddlb.py @@ -24,6 +24,7 @@ class DDLBCmdArgs(CmdArgs): """DDLB test command arguments.""" docker_image_url: str + np: int class DDLBTestDefinition(TestDefinition): diff --git 
a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py index a07e9415..5f11bd43 100644 --- a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py @@ -33,7 +33,10 @@ def image_path(self) -> str | None: def generate_test_command(self) -> List[str]: tdef: DDLBTestDefinition = cast(DDLBTestDefinition, self.test_run.test.test_definition) - return ["python scripts/run_benchmark.py"] + cmd = ["mpirun -np "] + cmd.append(str(tdef.cmd_args.np)) + cmd.append("python scripts/run_benchmark.py") + return cmd def gen_srun_success_check(self) -> str: output_file = self.test_run.output_path / "stdout.txt" From 920a023e7160c13f96015fb6344613b5124a98f2 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Fri, 31 Oct 2025 20:39:40 +0200 Subject: [PATCH 05/15] Revert "Add mpirun" This reverts commit eda5d0e7930445be9a363145714ae328ba059a85. --- conf/common/test/ddlb_test.toml | 4 +--- src/cloudai/workloads/ddlb/ddlb.py | 1 - src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py | 5 +---- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/conf/common/test/ddlb_test.toml b/conf/common/test/ddlb_test.toml index a26073cb..bb117b99 100644 --- a/conf/common/test/ddlb_test.toml +++ b/conf/common/test/ddlb_test.toml @@ -19,6 +19,4 @@ description = "DDLB test configuration" test_template_name = "DDLBTest" [cmd_args] -docker_image_url = "/mnt/lustre/gaia/nsarkauskas/gaia/Cloudai/nsarkauskas+ddlb.sqsh" -# Number of MPI ranks passed to mpirun -np -np = 8 +docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest" diff --git a/src/cloudai/workloads/ddlb/ddlb.py b/src/cloudai/workloads/ddlb/ddlb.py index 012cacb9..00fd70ef 100644 --- a/src/cloudai/workloads/ddlb/ddlb.py +++ b/src/cloudai/workloads/ddlb/ddlb.py @@ -24,7 +24,6 @@ class DDLBCmdArgs(CmdArgs): """DDLB test command arguments.""" docker_image_url: str - np: int class 
DDLBTestDefinition(TestDefinition): diff --git a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py index 5f11bd43..a07e9415 100644 --- a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py @@ -33,10 +33,7 @@ def image_path(self) -> str | None: def generate_test_command(self) -> List[str]: tdef: DDLBTestDefinition = cast(DDLBTestDefinition, self.test_run.test.test_definition) - cmd = ["mpirun -np "] - cmd.append(str(tdef.cmd_args.np)) - cmd.append("python scripts/run_benchmark.py") - return cmd + return ["python scripts/run_benchmark.py"] def gen_srun_success_check(self) -> str: output_file = self.test_run.output_path / "stdout.txt" From 8a16a4540ae707ea18ecf65aa436fcba9130e383 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Wed, 5 Nov 2025 22:54:14 +0200 Subject: [PATCH 06/15] Use new CLI args --- conf/common/test/ddlb_test.toml | 9 ++++++++ src/cloudai/workloads/ddlb/ddlb.py | 8 +++++++ .../ddlb/slurm_command_gen_strategy.py | 23 ++++++++++++++++++- 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/conf/common/test/ddlb_test.toml b/conf/common/test/ddlb_test.toml index bb117b99..6160fdaf 100644 --- a/conf/common/test/ddlb_test.toml +++ b/conf/common/test/ddlb_test.toml @@ -20,3 +20,12 @@ test_template_name = "DDLBTest" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest" +primitive = "tp_columnwise" +m = [1024,8192] +n = 128 +k = 1024 +dtype = "float16" +num_iterations = 50 +num_warmups = 10 +# Make sure to specify only one configuration per --impl argument. 
i.e., do not write in one impl "order=AG_before,AG_after" +impl = ["pytorch;backend=nccl;order=AG_before", "fuser;algorithm=p2p_pipeline;backend=cuda;order=AG_before"] diff --git a/src/cloudai/workloads/ddlb/ddlb.py b/src/cloudai/workloads/ddlb/ddlb.py index 00fd70ef..9f6e45cb 100644 --- a/src/cloudai/workloads/ddlb/ddlb.py +++ b/src/cloudai/workloads/ddlb/ddlb.py @@ -24,6 +24,14 @@ class DDLBCmdArgs(CmdArgs): """DDLB test command arguments.""" docker_image_url: str + primitive: str + m: Union[int, list[int]] = 1024 + n: Union[int, list[int]] = 128 + k: Union[int, list[int]] = 1024 + dtype: str + num_iterations: int = 50 + num_warmups: int = 5 + impl: Union[str, list[str]] = "pytorch;backend=nccl;order=AG_before,AG_after" class DDLBTestDefinition(TestDefinition): diff --git a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py index a07e9415..58294eb7 100644 --- a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py @@ -33,7 +33,28 @@ def image_path(self) -> str | None: def generate_test_command(self) -> List[str]: tdef: DDLBTestDefinition = cast(DDLBTestDefinition, self.test_run.test.test_definition) - return ["python scripts/run_benchmark.py"] + srun_command_parts = ["python ddlb/cli/benchmark.py"] + ddlb_test_args = tdef.cmd_args.model_dump().keys() + for arg in ddlb_test_args: + if arg is "docker_image_url": + continue + + value = getattr(tdef.cmd_args, arg) + if value is None: + continue + + match arg: + case "m" | "n" | "k": + srun_command_parts.append(f"-{arg} {value}") + case "num_iterations" | "num_warmups": + srun_command_parts.append(f"--{arg.replace('_', '-')} {value}") + case _: + srun_command_parts.append(f"--{arg} {value}") + + if self.test_run.test.extra_cmd_args: + srun_command_parts.append(self.test_run.test.extra_cmd_args) + + return srun_command_parts def gen_srun_success_check(self) -> str: output_file = 
self.test_run.output_path / "stdout.txt" From e5f2e61b9bb876465e793688f1160d757b4d5281 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Wed, 5 Nov 2025 23:52:27 +0200 Subject: [PATCH 07/15] Move to experimental, sbatch acceptance test --- .../test/ddlb_test.toml | 0 src/cloudai/workloads/ddlb/ddlb.py | 2 +- tests/ref_data/ddlb.sbatch | 17 +++++++++++++++++ tests/test_acceptance.py | 16 ++++++++++++++++ tests/test_init.py | 8 +++++++- 5 files changed, 41 insertions(+), 2 deletions(-) rename conf/{common => experimental}/test/ddlb_test.toml (100%) create mode 100644 tests/ref_data/ddlb.sbatch diff --git a/conf/common/test/ddlb_test.toml b/conf/experimental/test/ddlb_test.toml similarity index 100% rename from conf/common/test/ddlb_test.toml rename to conf/experimental/test/ddlb_test.toml diff --git a/src/cloudai/workloads/ddlb/ddlb.py b/src/cloudai/workloads/ddlb/ddlb.py index 9f6e45cb..77cb81e4 100644 --- a/src/cloudai/workloads/ddlb/ddlb.py +++ b/src/cloudai/workloads/ddlb/ddlb.py @@ -31,7 +31,7 @@ class DDLBCmdArgs(CmdArgs): dtype: str num_iterations: int = 50 num_warmups: int = 5 - impl: Union[str, list[str]] = "pytorch;backend=nccl;order=AG_before,AG_after" + impl: Union[str, list[str]] = "pytorch;backend=nccl;order=AG_before" class DDLBTestDefinition(TestDefinition): diff --git a/tests/ref_data/ddlb.sbatch b/tests/ref_data/ddlb.sbatch new file mode 100644 index 00000000..c6b1a14c --- /dev/null +++ b/tests/ref_data/ddlb.sbatch @@ -0,0 +1,17 @@ +#!/bin/bash +# generated by CloudAI@__CLOUDAI_VERSION__ +#SBATCH --job-name=__JOB_NAME__ +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt +#SBATCH --partition=main +#SBATCH -N 1 +#SBATCH --gpus-per-node=8 +#SBATCH --gres=gpu:8 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest 
--container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." + +srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh + +srun --export=ALL --mpi=pmix --container-image=gitlab-master.nvidia.com/nsarkauskas/ddlb:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output bash -c "source __OUTPUT_DIR__/output/env_vars.sh; python ddlb/cli/benchmark.py --primitive tp_columnwise -m 1024 -n 128 -k 1024 --dtype float16 --num-iterations 50 --num-warmups 5 --impl pytorch;backend=nccl;order=AG_before" diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 08791655..227b0a02 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -67,6 +67,7 @@ TritonInferenceTestDefinition, ) from cloudai.workloads.ucc_test import UCCCmdArgs, UCCTestDefinition +from cloudai.workloads.ddlb import DDLBCmdArgs, DDLBTestDefinition SLURM_TEST_SCENARIOS = [ {"path": Path("conf/common/test_scenario/sleep.toml"), "expected_dirs_number": 4, "log_file": "sleep_debug.log"}, @@ -248,6 +249,7 @@ def build_special_test_run( @pytest.fixture( params=[ "ucc", + "ddlb", "nccl", "sleep", "gpt-pre-test", @@ -291,6 +293,20 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - cmd_args=NCCLCmdArgs(docker_image_url="nvcr.io#nvidia/pytorch:24.02-py3"), ), ), + "ddlb": lambda: create_test_run( + partial_tr, + 
slurm_system, + "ddlb", + DDLBTestDefinition( + name="ddlb", + description="ddlb", + test_template_name="ddlb", + cmd_args=DDLBCmdArgs(docker_image_url="gitlab-master.nvidia.com/nsarkauskas/ddlb:latest", + primitive="tp_columnwise", m=1024, n=128, k=1024, dtype="float16", + num_iterations=50, num_warmups=5, + impl="pytorch;backend=nccl;order=AG_before"), + ), + ), "sleep": lambda: create_test_run( partial_tr, slurm_system, diff --git a/tests/test_init.py b/tests/test_init.py index 12f3aaa7..8f6eda49 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -83,6 +83,10 @@ UCCTestGradingStrategy, UCCTestSlurmCommandGenStrategy, ) +from cloudai.workloads.ddlb import ( + DDLBTestDefinition, + DDLBTestSlurmCommandGenStrategy, +) def test_systems(): @@ -116,6 +120,7 @@ def test_runners(): (SlurmSystem, SleepTestDefinition): SleepSlurmCommandGenStrategy, (SlurmSystem, SlurmContainerTestDefinition): SlurmContainerCommandGenStrategy, (SlurmSystem, UCCTestDefinition): UCCTestSlurmCommandGenStrategy, + (SlurmSystem, DDLBTestDefinition): DDLBTestSlurmCommandGenStrategy, (SlurmSystem, MegatronRunTestDefinition): MegatronRunSlurmCommandGenStrategy, (StandaloneSystem, SleepTestDefinition): SleepStandaloneCommandGenStrategy, (LSFSystem, SleepTestDefinition): SleepLSFCommandGenStrategy, @@ -184,9 +189,10 @@ def test_installers(): def test_definitions(): test_defs = Registry().test_definitions_map - assert len(test_defs) == 17 + assert len(test_defs) == 18 for tdef in [ ("UCCTest", UCCTestDefinition), + ("DDLBTest", DDLBTestDefinition), ("NcclTest", NCCLTestDefinition), ("ChakraReplay", ChakraReplayTestDefinition), ("Sleep", SleepTestDefinition), From aa01c79e6116de45445a5258d1e956d688cf7532 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Wed, 5 Nov 2025 16:53:10 -0500 Subject: [PATCH 08/15] Update src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- 
src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py index 58294eb7..c9b427e1 100644 --- a/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/ddlb/slurm_command_gen_strategy.py @@ -36,7 +36,7 @@ def generate_test_command(self) -> List[str]: srun_command_parts = ["python ddlb/cli/benchmark.py"] ddlb_test_args = tdef.cmd_args.model_dump().keys() for arg in ddlb_test_args: - if arg is "docker_image_url": + if arg == "docker_image_url": continue value = getattr(tdef.cmd_args, arg) From e307fda3104cf1ab66be86aeb3a7536b8d7af326 Mon Sep 17 00:00:00 2001 From: Nicholas Sarkauskas Date: Wed, 5 Nov 2025 17:03:22 -0500 Subject: [PATCH 09/15] Add docs --- doc/workloads/ddlb.rst | 77 ++++++++++++++++++++++++++++++++++++++++++ doc/workloads/index.md | 1 + 2 files changed, 78 insertions(+) create mode 100644 doc/workloads/ddlb.rst diff --git a/doc/workloads/ddlb.rst b/doc/workloads/ddlb.rst new file mode 100644 index 00000000..1eb449a6 --- /dev/null +++ b/doc/workloads/ddlb.rst @@ -0,0 +1,77 @@ +DDLB +==== + +This workload (`test_template_name` is ``DDLBTest``) allows you to execute DDLB (Distributed Deep Learning Benchmarks) within the CloudAI framework. Please find the DDLB README at https://github.com/samnordmann/ddlb. + +Usage Example +------------- + +Test TOML example: + +.. code-block:: toml + + name = "my_ddlb_test" + description = "Example DDLB test" + test_template_name = "DDLBTest" + + [cmd_args] + docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest" + primitive = "tp_columnwise" + dtype = "float16" + +Test Scenario example: + +.. code-block:: toml + + name = "ddlb-test" + + [[Tests]] + id = "ddlb.1" + num_nodes = 1 + time_limit = "00:10:00" + + test_name = "my_ddlb_test" + +Test-in-Scenario example: + +.. 
code-block:: toml + + name = "ddlb-test" + + [[Tests]] + id = "ddlb.1" + num_nodes = 1 + time_limit = "00:10:00" + + name = "my_ddlb_test" + description = "Example DDLB test" + test_template_name = "DDLBTest" + + [Tests.cmd_args] + docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest" + primitive = "tp_columnwise" + m = 1024 + n = 128 + k = 1024 + dtype = "float16" + num_iterations = 50 + num_warmups = 5 + impl = "pytorch;backend=nccl;order=AG_before" + +API Documentation +--------------------------------- + +Command Arguments +~~~~~~~~~~~~~~~~~ + +.. autoclass:: cloudai.workloads.ddlb.ddlb.DDLBCmdArgs + :members: + :show-inheritance: + +Test Definition +~~~~~~~~~~~~~~~ + +.. autoclass:: cloudai.workloads.ddlb.ddlb.DDLBTestDefinition + :members: + :show-inheritance: + diff --git a/doc/workloads/index.md b/doc/workloads/index.md index 1aef89c9..cfc6170e 100644 --- a/doc/workloads/index.md +++ b/doc/workloads/index.md @@ -12,6 +12,7 @@ ai_dynamo bash_cmd chakra_replay nccl +ddlb nemo_run nixl_bench nixl_kvbench From 7bbf993005701e6c1c78c52d7418f90f9a4a0a60 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Wed, 5 Nov 2025 17:05:53 -0500 Subject: [PATCH 10/15] Update src/cloudai/workloads/ddlb/ddlb.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- src/cloudai/workloads/ddlb/ddlb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/workloads/ddlb/ddlb.py b/src/cloudai/workloads/ddlb/ddlb.py index 77cb81e4..e85f6259 100644 --- a/src/cloudai/workloads/ddlb/ddlb.py +++ b/src/cloudai/workloads/ddlb/ddlb.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Literal, Optional, Union +from typing import Optional from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun from cloudai.models.workload import CmdArgs, TestDefinition From 04b385d11193c356b14ef326c6d6666977007436 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Thu, 6 Nov 2025 00:10:23 +0200 Subject: [PATCH 11/15] Readd import --- src/cloudai/workloads/ddlb/ddlb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/workloads/ddlb/ddlb.py b/src/cloudai/workloads/ddlb/ddlb.py index e85f6259..e62ae376 100644 --- a/src/cloudai/workloads/ddlb/ddlb.py +++ b/src/cloudai/workloads/ddlb/ddlb.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Optional, Union from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun from cloudai.models.workload import CmdArgs, TestDefinition From 2df0d989a116eb36d1fdf93ef468957b32ceaa8f Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Thu, 6 Nov 2025 00:12:02 +0200 Subject: [PATCH 12/15] Linter --- src/cloudai/registration.py | 8 ++++---- tests/test_acceptance.py | 17 ++++++++++++----- tests/test_init.py | 8 ++++---- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/cloudai/registration.py b/src/cloudai/registration.py index 579e292e..76ade7eb 100644 --- a/src/cloudai/registration.py +++ b/src/cloudai/registration.py @@ -69,6 +69,10 @@ def register_all(): ChakraReplaySlurmCommandGenStrategy, ChakraReplayTestDefinition, ) + from cloudai.workloads.ddlb import ( + DDLBTestDefinition, + DDLBTestSlurmCommandGenStrategy, + ) from cloudai.workloads.jax_toolbox import ( GPTTestDefinition, GrokTestDefinition, @@ -92,10 +96,6 @@ def register_all(): NcclTestRunAIJsonGenStrategy, NcclTestSlurmCommandGenStrategy, ) - from cloudai.workloads.ddlb import ( - DDLBTestDefinition, - DDLBTestSlurmCommandGenStrategy, - ) from 
cloudai.workloads.nemo_launcher import ( NeMoLauncherGradingStrategy, NeMoLauncherReportGenerationStrategy, diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 227b0a02..4b28f151 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -37,6 +37,7 @@ GenAIPerfArgs, PrefillWorkerArgs, ) +from cloudai.workloads.ddlb import DDLBCmdArgs, DDLBTestDefinition from cloudai.workloads.jax_toolbox import ( GPTCmdArgs, GPTTestDefinition, @@ -67,7 +68,6 @@ TritonInferenceTestDefinition, ) from cloudai.workloads.ucc_test import UCCCmdArgs, UCCTestDefinition -from cloudai.workloads.ddlb import DDLBCmdArgs, DDLBTestDefinition SLURM_TEST_SCENARIOS = [ {"path": Path("conf/common/test_scenario/sleep.toml"), "expected_dirs_number": 4, "log_file": "sleep_debug.log"}, @@ -301,10 +301,17 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - name="ddlb", description="ddlb", test_template_name="ddlb", - cmd_args=DDLBCmdArgs(docker_image_url="gitlab-master.nvidia.com/nsarkauskas/ddlb:latest", - primitive="tp_columnwise", m=1024, n=128, k=1024, dtype="float16", - num_iterations=50, num_warmups=5, - impl="pytorch;backend=nccl;order=AG_before"), + cmd_args=DDLBCmdArgs( + docker_image_url="gitlab-master.nvidia.com/nsarkauskas/ddlb:latest", + primitive="tp_columnwise", + m=1024, + n=128, + k=1024, + dtype="float16", + num_iterations=50, + num_warmups=5, + impl="pytorch;backend=nccl;order=AG_before", + ), ), ), "sleep": lambda: create_test_run( diff --git a/tests/test_init.py b/tests/test_init.py index 8f6eda49..7e486db1 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -33,6 +33,10 @@ ChakraReplaySlurmCommandGenStrategy, ChakraReplayTestDefinition, ) +from cloudai.workloads.ddlb import ( + DDLBTestDefinition, + DDLBTestSlurmCommandGenStrategy, +) from cloudai.workloads.jax_toolbox import ( GPTTestDefinition, GrokTestDefinition, @@ -83,10 +87,6 @@ UCCTestGradingStrategy, UCCTestSlurmCommandGenStrategy, ) -from 
cloudai.workloads.ddlb import ( - DDLBTestDefinition, - DDLBTestSlurmCommandGenStrategy, -) def test_systems(): From 20ffb5f1dc5e19d47badebc92a8339a94b2bec58 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Thu, 6 Nov 2025 00:14:33 +0200 Subject: [PATCH 13/15] Linter --- conf/experimental/test/ddlb_test.toml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/conf/experimental/test/ddlb_test.toml b/conf/experimental/test/ddlb_test.toml index 6160fdaf..1beda7c1 100644 --- a/conf/experimental/test/ddlb_test.toml +++ b/conf/experimental/test/ddlb_test.toml @@ -21,11 +21,14 @@ test_template_name = "DDLBTest" [cmd_args] docker_image_url = "gitlab-master.nvidia.com/nsarkauskas/ddlb:latest" primitive = "tp_columnwise" -m = [1024,8192] +m = [1024, 8192] n = 128 k = 1024 dtype = "float16" num_iterations = 50 num_warmups = 10 # Maker sure to specify only one configuration per --impl argument. i.e., do not write in one impl "order=AG_before,AG_after" -impl = ["pytorch;backend=nccl;order=AG_before", "fuser;algorithm=p2p_pipeline;backend=cuda;order=AG_before"] +impl = [ + "pytorch;backend=nccl;order=AG_before", + "fuser;algorithm=p2p_pipeline;backend=cuda;order=AG_before", +] From 2068f5fca6a0a0e667aacabff1063d1fae7b2c3b Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Wed, 5 Nov 2025 17:18:08 -0500 Subject: [PATCH 14/15] Update conf/experimental/test/ddlb_test.toml Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- conf/experimental/test/ddlb_test.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/experimental/test/ddlb_test.toml b/conf/experimental/test/ddlb_test.toml index 1beda7c1..8f82066b 100644 --- a/conf/experimental/test/ddlb_test.toml +++ b/conf/experimental/test/ddlb_test.toml @@ -27,7 +27,7 @@ k = 1024 dtype = "float16" num_iterations = 50 num_warmups = 10 -# Maker sure to specify only one configuration per --impl argument. 
i.e., do not write in one impl "order=AG_before,AG_after" +# Make sure to specify only one configuration per --impl argument. i.e., do not write in one impl "order=AG_before,AG_after" impl = [ "pytorch;backend=nccl;order=AG_before", "fuser;algorithm=p2p_pipeline;backend=cuda;order=AG_before", From 1f1e6cda68bc42e620f7c3cf93ef0592390d72d1 Mon Sep 17 00:00:00 2001 From: Nick Sarkauskas Date: Fri, 7 Nov 2025 16:38:12 +0200 Subject: [PATCH 15/15] Move test_scenario/ddlb_test.toml to experimental --- conf/{common => experimental}/test_scenario/ddlb_test.toml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename conf/{common => experimental}/test_scenario/ddlb_test.toml (100%) diff --git a/conf/common/test_scenario/ddlb_test.toml b/conf/experimental/test_scenario/ddlb_test.toml similarity index 100% rename from conf/common/test_scenario/ddlb_test.toml rename to conf/experimental/test_scenario/ddlb_test.toml