Skip to content

Commit b3f62c7

Browse files
committed
multigpu ci test
1 parent f758a21 commit b3f62c7

File tree

2 files changed

+125
-5
lines changed

2 files changed

+125
-5
lines changed

.github/scripts/filter-matrix.py

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,41 @@ def filter_matrix_item(
6060
return True
6161

6262

63+
def create_distributed_config(
    item: Dict[str, Any],
    *,
    runner: str = "linux.g4dn.12xlarge.nvidia.gpu",
    num_gpus: int = 2,
) -> Dict[str, Any]:
    """Create a distributed test configuration from a regular config.

    The input config is left untouched; a shallow copy is returned with
    three top-level keys set:

    - ``validation_runner`` is overridden with ``runner``
    - ``num_gpus`` is set to ``num_gpus``
    - ``config`` is set to the marker string ``"distributed"``

    Args:
        item: A build-matrix entry. ``python_version``, ``desired_cuda`` and
            ``validation_runner`` are read (via ``.get``) for debug output
            only, so missing keys are tolerated.
        runner: Runner label written to ``validation_runner``. Defaults to
            the previously hard-coded multi-GPU instance.
        num_gpus: Value written to the ``num_gpus`` field. Defaults to the
            previously hard-coded ``2``.

    Returns:
        A new dict containing every key of ``item`` plus the overrides above.
        The copy is shallow: nested values are shared with ``item``.
    """
    # Local import keeps the helper self-contained; diagnostics go to stderr
    # so they never mix with whatever the script writes to stdout.
    import sys

    def _debug(message: str) -> None:
        print(message, file=sys.stderr)

    # Show the config we start from.
    _debug("[DEBUG] Creating distributed config from:")
    _debug(f"[DEBUG] Python: {item.get('python_version')}")
    _debug(f"[DEBUG] CUDA: {item.get('desired_cuda')}")
    _debug(f"[DEBUG] Original runner: {item.get('validation_runner')}")

    # Build a new dict instead of mutating the caller's entry: only top-level
    # keys are overridden, so a shallow copy is sufficient.
    dist_item = item.copy()
    dist_item["validation_runner"] = runner
    dist_item["num_gpus"] = num_gpus
    dist_item["config"] = "distributed"

    # Show the resulting overrides.
    _debug(f"[DEBUG] New runner: {dist_item['validation_runner']}")
    _debug(f"[DEBUG] GPUs: {dist_item['num_gpus']}")

    return dist_item
96+
97+
6398
def main(args: list[str]) -> None:
6499
parser = argparse.ArgumentParser()
65100
parser.add_argument(
@@ -99,16 +134,69 @@ def main(args: list[str]) -> None:
99134

100135
includes = matrix_dict["include"]
101136
filtered_includes = []
137+
distributed_includes = [] # NEW: separate list for distributed configs
138+
139+
print(f"[DEBUG] Processing {len(includes)} input configs", file=sys.stderr)
102140

103141
for item in includes:
142+
py_ver = item.get("python_version", "unknown")
143+
cuda_ver = item.get("desired_cuda", "unknown")
144+
145+
print(f"[DEBUG] Checking config: py={py_ver}, cuda={cuda_ver}", file=sys.stderr)
146+
104147
if filter_matrix_item(
105148
item,
106149
options.jetpack == "true",
107150
options.limit_pr_builds == "true",
108151
):
152+
print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
109153
filtered_includes.append(item)
110154

111-
filtered_matrix_dict = {"include": filtered_includes}
155+
# NEW: Create distributed variant for specific configs
156+
# Only Python 3.10 + CUDA 13.0 for now
157+
if item["python_version"] == "3.10" and item["desired_cuda"] == "cu130":
158+
print(
159+
f"[DEBUG] Creating distributed config for py3.10+cu130",
160+
file=sys.stderr,
161+
)
162+
distributed_includes.append(create_distributed_config(item))
163+
else:
164+
print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
165+
166+
# Debug: Show summary
167+
print(f"[DEBUG] Final counts:", file=sys.stderr)
168+
print(f"[DEBUG] Regular configs: {len(filtered_includes)}", file=sys.stderr)
169+
print(
170+
f"[DEBUG] Distributed configs: {len(distributed_includes)}", file=sys.stderr
171+
)
172+
173+
# Debug: Show which configs will be built
174+
print(
175+
f"[DEBUG] Configs that will be BUILT (in filtered_includes):", file=sys.stderr
176+
)
177+
for item in filtered_includes:
178+
print(
179+
f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}",
180+
file=sys.stderr,
181+
)
182+
183+
print(
184+
f"[DEBUG] Configs for DISTRIBUTED TESTS (in distributed_includes):",
185+
file=sys.stderr,
186+
)
187+
for item in distributed_includes:
188+
print(
189+
f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}, gpus={item.get('num_gpus')}",
190+
file=sys.stderr,
191+
)
192+
193+
# NEW: Output both regular and distributed configs
194+
filtered_matrix_dict = {
195+
"include": filtered_includes,
196+
"distributed_include": distributed_includes, # NEW field
197+
}
198+
199+
# Output to stdout (consumed by GitHub Actions)
112200
print(json.dumps(filtered_matrix_dict))
113201

114202

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -480,18 +480,50 @@ jobs:
480480
ref: ""
481481
test-infra-repository: pytorch/test-infra
482482
test-infra-ref: main
483-
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
483+
# Extract the distributed_include array from filter-matrix output
484+
build-matrix: |
485+
{
486+
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).distributed_include) }}
487+
}
484488
pre-script: ${{ matrix.pre-script }}
485489
script: |
486490
set -euo pipefail
491+
492+
# Debug: Show what config we're using
493+
echo "=========================================="
494+
echo "DISTRIBUTED TEST CONFIGURATION"
495+
echo "=========================================="
496+
echo "Python version: ${PYTHON_VERSION}"
497+
echo "CUDA version: ${CU_VERSION}"
498+
echo "Runner: ${{ matrix.validation_runner }}"
499+
echo "Num GPUs: ${{ matrix.num_gpus }}"
500+
echo "Config: ${{ matrix.config }}"
501+
echo "=========================================="
502+
503+
# Verify GPUs are available
504+
echo "Checking GPU availability:"
505+
nvidia-smi
506+
echo "GPU count: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
507+
echo "=========================================="
508+
487509
export USE_HOST_DEPS=1
488510
export CI_BUILD=1
489511
export USE_TRTLLM_PLUGINS=1
512+
513+
# Install MPI (required for TensorRT-LLM plugins)
514+
echo "Installing MPI..."
490515
dnf install -y mpich mpich-devel openmpi openmpi-devel
516+
517+
# Run distributed tests
491518
pushd .
492-
cd tests/py
493-
cd dynamo
494-
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
519+
cd tests/py/dynamo
520+
521+
echo "Running distributed tests with mpirun..."
522+
mpirun --allow-run-as-root -n ${{ matrix.num_gpus }} \
523+
python -m pytest -ra \
524+
--junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml \
525+
distributed/test_nccl_ops.py
526+
495527
popd
496528
497529
concurrency:

0 commit comments

Comments
 (0)