Clarify multinode NCCL report thresholds
This commit is contained in:
parent
4f38b3a2a0
commit
63c32fd75d
@ -463,7 +463,7 @@ class MultiNodeNCCLTest:
|
||||
topo.get("label", ""),
|
||||
f"{topo.get('peak_busbw_gbps', 0):.2f} GB/s",
|
||||
str(topo.get("peak_size", "")),
|
||||
f">= {topo.get('min_required_gbps', 0):.0f} GB/s" if topo.get("min_required_gbps") else "-",
|
||||
f">= {_format_gbps(topo.get('min_required_gbps', 0))} GB/s" if topo.get("min_required_gbps") else "-",
|
||||
topo.get("status", "?"),
|
||||
)
|
||||
c.print(table)
|
||||
@ -475,3 +475,13 @@ def _format_size(size_bytes: int) -> str:
|
||||
if size_bytes >= factor and size_bytes % factor == 0:
|
||||
return f"{size_bytes // factor}{suffix}"
|
||||
return str(size_bytes)
|
||||
|
||||
|
||||
def _format_gbps(value) -> str:
|
||||
try:
|
||||
numeric = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
if numeric.is_integer():
|
||||
return f"{numeric:.0f}"
|
||||
return f"{numeric:.2f}"
|
||||
|
||||
@ -439,7 +439,7 @@ class ReportGenerator:
|
||||
if row.get("status") != "PASS"
|
||||
]
|
||||
failed_sizes_text = ", ".join(failed_sizes) if failed_sizes else "-"
|
||||
lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {req:.0f} | {status} |")
|
||||
lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {_format_gbps(req)} | {status} |")
|
||||
elif isinstance(data, dict) and data.get("error"):
|
||||
lines.append(f"| {op} | - | - | - | ERROR: {data['error']} |")
|
||||
lines.append("")
|
||||
@ -457,7 +457,7 @@ class ReportGenerator:
|
||||
f"{row.get('worst_busbw_gbps', 0):.1f} | "
|
||||
f"{row.get('mean_busbw_gbps', 0):.1f} | "
|
||||
f"{row.get('stddev_pct', 0):.2f}% | "
|
||||
f">= {data.get('min_required_gbps', 0):.0f} | "
|
||||
f">= {_format_gbps(data.get('min_required_gbps', 0))} | "
|
||||
f"{row.get('status', '?')} |"
|
||||
)
|
||||
lines.append("")
|
||||
@ -485,7 +485,7 @@ class ReportGenerator:
|
||||
lines.append("|----------|----------------------|-------------|-----------|------------|-----------|--------|")
|
||||
for topo in data.get("topologies", []):
|
||||
threshold = topo.get("min_required_gbps", 0) or 0
|
||||
threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-"
|
||||
threshold_text = f">= {_format_gbps(threshold)} GB/s" if threshold else "-"
|
||||
cuda_visible = topo.get("cuda_visible_devices") or "-"
|
||||
lines.append(
|
||||
f"| {topo.get('label', '')} | {cuda_visible} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
|
||||
@ -956,3 +956,13 @@ class ReportGenerator:
|
||||
items.append(("Training", f"{status} ({detail})"))
|
||||
|
||||
return items
|
||||
|
||||
|
||||
def _format_gbps(value) -> str:
|
||||
try:
|
||||
numeric = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
if numeric.is_integer():
|
||||
return f"{numeric:.0f}"
|
||||
return f"{numeric:.2f}"
|
||||
|
||||
@ -7,17 +7,8 @@
|
||||
|
||||
**Result: FAIL**
|
||||
|
||||
Missing required evidence:
|
||||
- GPU Info
|
||||
- Health Check
|
||||
- Memory Bandwidth
|
||||
- Compute Throughput
|
||||
- NVLink/NVSwitch
|
||||
- NCCL
|
||||
- Stress Test
|
||||
- RDMA
|
||||
- DCGM
|
||||
- Training
|
||||
Failed or unverified items:
|
||||
- Multi-node NCCL: FAIL
|
||||
|
||||
## Summary
|
||||
|
||||
@ -36,10 +27,10 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
|
||||
|
||||
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||
|----------|----------------------|-------------|-----------|------------|-----------|--------|
|
||||
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.15 GB/s | 16G | 47.18 GB/s | >= 49 GB/s | FAIL |
|
||||
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.62 GB/s | 16G | 136.67 GB/s | >= 137 GB/s | FAIL |
|
||||
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.19 GB/s | 16G | 334.85 GB/s | >= 335 GB/s | FAIL |
|
||||
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.56 GB/s | 16G | 354.21 GB/s | >= 492 GB/s | FAIL |
|
||||
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.15 GB/s | 16G | 47.18 GB/s | >= 48.90 GB/s | FAIL |
|
||||
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.62 GB/s | 16G | 136.67 GB/s | >= 136.93 GB/s | FAIL |
|
||||
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.19 GB/s | 16G | 334.85 GB/s | >= 335.48 GB/s | FAIL |
|
||||
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.56 GB/s | 16G | 354.21 GB/s | >= 491.84 GB/s | FAIL |
|
||||
|
||||
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|
||||
|----------|--------------|-----------------|------------------|-------------------|
|
||||
@ -59,10 +50,10 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
|
||||
|
||||
| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |
|
||||
|----------|----------------------|-------------|-----------|------------|-----------|--------|
|
||||
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.92 GB/s | >= 27 GB/s | FAIL |
|
||||
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.71 GB/s | 16G | 47.93 GB/s | >= 54 GB/s | FAIL |
|
||||
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.63 GB/s | 16G | 72.67 GB/s | >= 74 GB/s | FAIL |
|
||||
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.82 GB/s | 16G | 36.86 GB/s | >= 77 GB/s | FAIL |
|
||||
| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.92 GB/s | >= 27.25 GB/s | FAIL |
|
||||
| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.71 GB/s | 16G | 47.93 GB/s | >= 54.41 GB/s | FAIL |
|
||||
| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.63 GB/s | 16G | 72.67 GB/s | >= 73.73 GB/s | FAIL |
|
||||
| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.82 GB/s | 16G | 36.86 GB/s | >= 76.54 GB/s | FAIL |
|
||||
|
||||
| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |
|
||||
|----------|--------------|-----------------|------------------|-------------------|
|
||||
@ -81,4 +72,4 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7
|
||||
**Overall: FAIL**
|
||||
|
||||
---
|
||||
*Generated by GPU Test Suite v0.2.0*
|
||||
*Generated by GPU Test Suite v0.2.0*
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user