diff --git a/modules/multinode_nccl_test.py b/modules/multinode_nccl_test.py index 737d58c..4e72525 100644 --- a/modules/multinode_nccl_test.py +++ b/modules/multinode_nccl_test.py @@ -463,7 +463,7 @@ class MultiNodeNCCLTest: topo.get("label", ""), f"{topo.get('peak_busbw_gbps', 0):.2f} GB/s", str(topo.get("peak_size", "")), - f">= {topo.get('min_required_gbps', 0):.0f} GB/s" if topo.get("min_required_gbps") else "-", + f">= {_format_gbps(topo.get('min_required_gbps', 0))} GB/s" if topo.get("min_required_gbps") else "-", topo.get("status", "?"), ) c.print(table) @@ -475,3 +475,13 @@ def _format_size(size_bytes: int) -> str: if size_bytes >= factor and size_bytes % factor == 0: return f"{size_bytes // factor}{suffix}" return str(size_bytes) + + +def _format_gbps(value) -> str: + try: + numeric = float(value) + except (TypeError, ValueError): + return str(value) + if numeric.is_integer(): + return f"{numeric:.0f}" + return f"{numeric:.2f}" diff --git a/modules/report.py b/modules/report.py index c905d0b..79640c7 100644 --- a/modules/report.py +++ b/modules/report.py @@ -439,7 +439,7 @@ class ReportGenerator: if row.get("status") != "PASS" ] failed_sizes_text = ", ".join(failed_sizes) if failed_sizes else "-" - lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {req:.0f} | {status} |") + lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {_format_gbps(req)} | {status} |") elif isinstance(data, dict) and data.get("error"): lines.append(f"| {op} | - | - | - | ERROR: {data['error']} |") lines.append("") @@ -457,7 +457,7 @@ class ReportGenerator: f"{row.get('worst_busbw_gbps', 0):.1f} | " f"{row.get('mean_busbw_gbps', 0):.1f} | " f"{row.get('stddev_pct', 0):.2f}% | " - f">= {data.get('min_required_gbps', 0):.0f} | " + f">= {_format_gbps(data.get('min_required_gbps', 0))} | " f"{row.get('status', '?')} |" ) lines.append("") @@ -485,7 +485,7 @@ class ReportGenerator: lines.append("|----------|----------------------|-------------|-----------|------------|-----------|--------|") for topo in data.get("topologies", []): threshold = topo.get("min_required_gbps", 0) or 0 - threshold_text = f">= {threshold:.0f} GB/s" if threshold else "-" + threshold_text = f">= {_format_gbps(threshold)} GB/s" if threshold else "-" cuda_visible = topo.get("cuda_visible_devices") or "-" lines.append( f"| {topo.get('label', '')} | {cuda_visible} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | " @@ -956,3 +956,13 @@ class ReportGenerator: items.append(("Training", f"{status} ({detail})")) return items + + +def _format_gbps(value) -> str: + try: + numeric = float(value) + except (TypeError, ValueError): + return str(value) + if numeric.is_integer(): + return f"{numeric:.0f}" + return f"{numeric:.2f}" diff --git a/reports_multinode_nccl_pdf_matrix_20260523_112247.md b/reports_multinode_nccl_pdf_matrix_20260523_112247.md index e67c8a4..8d07aef 100644 --- a/reports_multinode_nccl_pdf_matrix_20260523_112247.md +++ b/reports_multinode_nccl_pdf_matrix_20260523_112247.md @@ -7,17 +7,8 @@ **Result: FAIL** -Missing required evidence: -- GPU Info -- Health Check -- Memory Bandwidth -- Compute Throughput -- NVLink/NVSwitch -- NCCL -- Stress Test -- RDMA -- DCGM -- Training +Failed or unverified items: +- Multi-node NCCL: FAIL ## Summary @@ -36,10 +27,10 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7 | Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | |----------|----------------------|-------------|-----------|------------|-----------|--------| -| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.15 GB/s | 16G | 47.18 GB/s | >= 49 GB/s | FAIL | -| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.62 GB/s | 16G | 136.67 GB/s | >= 137 GB/s | FAIL | -| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.19 GB/s | 16G | 334.85 GB/s | >= 335 GB/s | FAIL | -| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.56 GB/s | 16G | 354.21 GB/s | >= 492 GB/s | FAIL | +| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 47.15 GB/s | 16G | 47.18 GB/s | >= 48.90 GB/s | FAIL | +| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 136.62 GB/s | 16G | 136.67 GB/s | >= 136.93 GB/s | FAIL | +| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 335.19 GB/s | 16G | 334.85 GB/s | >= 335.48 GB/s | FAIL | +| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 354.56 GB/s | 16G | 354.21 GB/s | >= 491.84 GB/s | FAIL | | Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs | |----------|--------------|-----------------|------------------|-------------------| @@ -59,10 +50,10 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7 | Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status | |----------|----------------------|-------------|-----------|------------|-----------|--------| -| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.92 GB/s | >= 27 GB/s | FAIL | -| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.71 GB/s | 16G | 47.93 GB/s | >= 54 GB/s | FAIL | -| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.63 GB/s | 16G | 72.67 GB/s | >= 74 GB/s | FAIL | -| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.82 GB/s | 16G | 36.86 GB/s | >= 77 GB/s | FAIL | +| 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) | - | 24.85 GB/s | 16G | 24.92 GB/s | >= 27.25 GB/s | FAIL | +| 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) | - | 47.71 GB/s | 16G | 47.93 GB/s | >= 54.41 GB/s | FAIL | +| 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) | 0,1,4,5 | 72.63 GB/s | 16G | 72.67 GB/s | >= 73.73 GB/s | FAIL | +| 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) | - | 36.82 GB/s | 16G | 36.86 GB/s | >= 76.54 GB/s | FAIL | | Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs | |----------|--------------|-----------------|------------------|-------------------| @@ -81,4 +72,4 @@ Source: nccl-tests-mpirun | Mode: cross-leaf-pdf-matrix-nccl-2.27.7 **Overall: FAIL** --- -*Generated by GPU Test Suite v0.2.0* \ No newline at end of file +*Generated by GPU Test Suite v0.2.0*