Clarify multinode NCCL report thresholds

This commit is contained in:
cs 2026-05-23 19:33:01 +08:00
parent 4f38b3a2a0
commit 63c32fd75d
3 changed files with 35 additions and 24 deletions

View File

@ -463,7 +463,7 @@ class MultiNodeNCCLTest:
topo.get("label", ""),
f"{topo.get('peak_busbw_gbps', 0):.2f} GB/s",
str(topo.get("peak_size", "")),
f">= {topo.get('min_required_gbps', 0):.0f} GB/s" if topo.get("min_required_gbps") else "-",
f">= {_format_gbps(topo.get('min_required_gbps', 0))} GB/s" if topo.get("min_required_gbps") else "-",
topo.get("status", "?"),
)
c.print(table)
@ -475,3 +475,13 @@ def _format_size(size_bytes: int) -> str:
if size_bytes >= factor and size_bytes % factor == 0:
return f"{size_bytes // factor}{suffix}"
return str(size_bytes)
def _format_gbps(value) -> str:
try:
numeric = float(value)
except (TypeError, ValueError):
return str(value)
if numeric.is_integer():
return f"{numeric:.0f}"
return f"{numeric:.2f}"

View File

@ -439,7 +439,7 @@ class ReportGenerator:
if row.get("status") != "PASS"
]
failed_sizes_text = ", ".join(failed_sizes) if failed_sizes else "-"
lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {req:.0f} | {status} |")
lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {_format_gbps(req)} | {status} |")
elif isinstance(data, dict) and data.get("error"):
lines.append(f"| {op} | - | - | - | ERROR: {data['error']} |")
lines.append("")
@ -457,7 +457,7 @@ class ReportGenerator:
f"{row.get('worst_busbw_gbps', 0):.1f} | "
f"{row.get('mean_busbw_gbps', 0):.1f} | "
f"{row.get('stddev_pct', 0):.2f}% | "
f">= {data.get('min_required_gbps', 0):.0f} | "
f">= {_format_gbps(data.get('min_required_gbps', 0))} | "
f"{row.get('status', '?')} |"
)
lines.append("")
@ -485,7 +485,7 @@ class ReportGenerator:
lines.append("|----------|----------------------|-------------|-----------|------------|-----------|--------|")
for topo in data.get("topologies", []):
threshold = topo.get("min_required_gbps", 0) or 0
threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-"
threshold_text = f">= {_format_gbps(threshold)} GB/s" if threshold else "-"
cuda_visible = topo.get("cuda_visible_devices") or "-"
lines.append(
f"| {topo.get('label', '')} | {cuda_visible} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
@ -956,3 +956,13 @@ class ReportGenerator:
items.append(("Training", f"{status} ({detail})"))
return items
def _format_gbps(value) -> str:
try:
numeric = float(value)
except (TypeError, ValueError):
return str(value)
if numeric.is_integer():
return f"{numeric:.0f}"
return f"{numeric:.2f}"

View File

@ -7,17 +7,8 @@
**Result: FAIL**
Missing required evidence:
- GPU Info
- Health Check
- Memory Bandwidth
- Compute Throughput
- NVLink/NVSwitch
- NCCL
- Stress Test
- RDMA
- DCGM
- Training
Failed or unverified items:
- Multi-node NCCL: FAIL
## Summary
@ -36,10 +27,10 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|----------|----------------------|-------------|-----------|------------|-----------|--------|
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.15 GB/s | 16G | 47.18 GB/s | >= 49 GB/s | FAIL |
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.62 GB/s | 16G | 136.67 GB/s | >= 137 GB/s | FAIL |
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.19 GB/s | 16G | 334.85 GB/s | >= 335 GB/s | FAIL |
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.56 GB/s | 16G | 354.21 GB/s | >= 492 GB/s | FAIL |
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.15 GB/s | 16G | 47.18 GB/s | >= 48.90 GB/s | FAIL |
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.62 GB/s | 16G | 136.67 GB/s | >= 136.93 GB/s | FAIL |
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.19 GB/s | 16G | 334.85 GB/s | >= 335.48 GB/s | FAIL |
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.56 GB/s | 16G | 354.21 GB/s | >= 491.84 GB/s | FAIL |
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|----------|--------------|-----------------|------------------|-------------------|
@ -59,10 +50,10 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|----------|----------------------|-------------|-----------|------------|-----------|--------|
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.92 GB/s | >= 27 GB/s | FAIL |
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.71 GB/s | 16G | 47.93 GB/s | >= 54 GB/s | FAIL |
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.63 GB/s | 16G | 72.67 GB/s | >= 74 GB/s | FAIL |
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.82 GB/s | 16G | 36.86 GB/s | >= 77 GB/s | FAIL |
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.92 GB/s | >= 27.25 GB/s | FAIL |
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.71 GB/s | 16G | 47.93 GB/s | >= 54.41 GB/s | FAIL |
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.63 GB/s | 16G | 72.67 GB/s | >= 73.73 GB/s | FAIL |
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.82 GB/s | 16G | 36.86 GB/s | >= 76.54 GB/s | FAIL |
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|----------|--------------|-----------------|------------------|-------------------|
@ -81,4 +72,4 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
**Overall: FAIL**
---
*Generated by GPU Test Suite v0.2.0*
*Generated by GPU Test Suite v0.2.0*