871 lines
43 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Report generation module - export test results to JSON/HTML/Markdown."""
import json
import os
from datetime import datetime
from pathlib import Path
from typing import Optional
# Resolve the package version from the installed distribution metadata.
# Any failure of the import or the lookup falls back to a hard-coded version
# string so that importing this module never fails because of it.
try:
    from importlib.metadata import version as _pkg_version
    __version__ = _pkg_version("gpu-server-test-suite")
except Exception:
    __version__ = "0.2.0"
from rich.console import Console
from rich.panel import Panel
# Page skeleton for the HTML report, filled in with ``str.format``.
# Placeholders: ``{timestamp}`` (page title and header), ``{hostname}`` (header)
# and ``{content}`` (the pre-rendered report sections). Literal CSS braces are
# doubled (``{{`` / ``}}``) so that ``str.format`` leaves them in the output.
HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>GPU Test Report - {timestamp}</title>
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, monospace;
background: #0d1117; color: #c9d1d9; padding: 2rem; }}
.header {{ background: linear-gradient(135deg, #1a1a2e, #16213e);
padding: 2rem; border-radius: 8px; margin-bottom: 2rem;
border: 1px solid #30363d; }}
.header h1 {{ color: #58a6ff; font-size: 1.5rem; }}
.header .meta {{ color: #8b949e; margin-top: 0.5rem; }}
.section {{ background: #161b22; border: 1px solid #30363d;
border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; }}
.section h2 {{ color: #58a6ff; margin-bottom: 1rem; font-size: 1.2rem;
border-bottom: 1px solid #30363d; padding-bottom: 0.5rem; }}
table {{ width: 100%; border-collapse: collapse; margin: 0.5rem 0; }}
th {{ background: #21262d; color: #8b949e; text-align: left;
padding: 0.5rem; font-weight: 600; font-size: 0.85rem; }}
td {{ padding: 0.5rem; border-bottom: 1px solid #21262d; font-size: 0.9rem; }}
.pass {{ color: #3fb950; }} .warn {{ color: #d29922; }} .fail {{ color: #f85149; }}
.metric {{ display: inline-block; background: #21262d; padding: 0.75rem 1.5rem;
border-radius: 6px; margin: 0.25rem; text-align: center; min-width: 120px; }}
.metric .value {{ font-size: 1.3rem; font-weight: bold; color: #58a6ff; }}
.metric .label {{ font-size: 0.75rem; color: #8b949e; margin-top: 0.25rem; }}
.verdict {{ padding: 1rem; border-radius: 6px; text-align: center; font-size: 1.1rem;
font-weight: bold; margin: 1rem 0; }}
.verdict.pass {{ background: #0d2818; color: #3fb950; border: 1px solid #238636; }}
.verdict.fail {{ background: #2d0b0b; color: #f85149; border: 1px solid #da3633; }}
</style>
</head>
<body>
<div class="header">
<h1>GPU Training Server Test Report</h1>
<div class="meta">Generated: {timestamp} | Server: {hostname}</div>
</div>
{content}
</body>
</html>"""
class ReportGenerator:
    """Export GPU test-suite results as JSON, HTML or Markdown reports.

    The ``report`` section of the configuration supplies the default
    ``format`` and ``output_dir``; the ``rdma`` section supplies the RDMA
    acceptance thresholds used when re-evaluating bandwidth/latency rows.
    """

    # Alternative spellings accepted by ``generate`` and their canonical name.
    _FORMAT_ALIASES = {"markdown": "md", "htm": "html"}

    def __init__(self, config: dict):
        """Keep the configuration and create the console used for status lines."""
        self.config = config
        self.console = Console()
        self.report_cfg = config.get("report", {}) or {}

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _num(value, default: float = 0.0) -> float:
        """Return ``value`` as a float, or ``default`` when it is not numeric.

        Result dictionaries may carry ``None`` or placeholder strings such as
        ``"N/A"`` where a number is expected; formatting those with ``:.1f``
        would otherwise abort the whole report.
        """
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _pass_fail_item(label: str, data: dict) -> tuple[str, str]:
        """Build a summary row from a result dict with ``error``/``passed`` keys."""
        data = data or {}
        if data.get("error"):
            return label, f"ERROR: {data['error']}"
        return label, "PASS" if data.get("passed") else "FAIL"

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------
    def generate(self, results: dict, fmt: Optional[str] = None, output: Optional[str] = None) -> str:
        """Write ``results`` as a report and return the path that was written.

        Args:
            results: Collected test results.
            fmt: ``"json"``, ``"html"`` or ``"md"`` (case-insensitive; a leading
                dot and the aliases ``"markdown"``/``"htm"`` are accepted).
                Defaults to the configured ``report.format`` or ``"json"``.
            output: Destination path. When omitted, a timestamped file name is
                created inside the configured ``report.output_dir``.

        Returns:
            The output path, or ``""`` when the format is not supported.
        """
        requested = fmt or self.report_cfg.get("format", "json")
        fmt = str(requested).strip().lower().lstrip(".")
        fmt = self._FORMAT_ALIASES.get(fmt, fmt)

        writers = {
            "json": self._generate_json,
            "html": self._generate_html,
            "md": self._generate_markdown,
        }
        writer = writers.get(fmt)
        if writer is None:
            self.console.print(f"[red]Unsupported format: {requested}[/red]")
            return ""

        output_dir = self.report_cfg.get("output_dir", "./reports")
        os.makedirs(output_dir, exist_ok=True)
        if not output:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output = os.path.join(output_dir, f"gpu_report_{timestamp}.{fmt}")
        else:
            # An explicit destination may live outside output_dir.
            parent = os.path.dirname(output)
            if parent:
                os.makedirs(parent, exist_ok=True)
        return writer(results, output)

    # ------------------------------------------------------------------
    # JSON report
    # ------------------------------------------------------------------
    def _generate_json(self, results: dict, output: str) -> str:
        """Dump ``results`` as indented JSON; non-serializable values become ``str``."""
        with open(output, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, default=str)
        self.console.print(f"[green]JSON report saved to: {output}[/green]")
        return output

    # ------------------------------------------------------------------
    # HTML report
    # ------------------------------------------------------------------
    def _generate_html(self, results: dict, output: str) -> str:
        """Render the HTML report into ``output`` and return the path."""
        import socket
        from html import escape

        hostname = results.get("hostname") or socket.gethostname()
        timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        content = "\n".join(self._html_sections(results))
        page = HTML_TEMPLATE.format(
            timestamp=escape(str(timestamp)),
            hostname=escape(str(hostname)),
            content=content,
        )
        # The template declares UTF-8 and the GPU table contains a degree sign,
        # so the file must not be written with the platform default encoding.
        with open(output, "w", encoding="utf-8") as f:
            f.write(page)
        self.console.print(f"[green]HTML report saved to: {output}[/green]")
        return output

    def _html_sections(self, results: dict) -> list[str]:
        """Return the HTML fragments for every result section that is present.

        All interpolated values are HTML-escaped so that names or versions
        containing ``<``, ``>`` or ``&`` cannot break the page markup.
        """
        from html import escape

        def esc(value) -> str:
            return escape(str(value))

        sections: list[str] = []

        if "gpu_info" in results:
            gpu_info = results["gpu_info"] or {}
            gpus = gpu_info.get("gpus", []) or []
            rows = "".join(
                f"<tr><td>GPU {esc(g.get('index', '?'))}</td><td>{esc(g.get('name', 'N/A'))}</td>"
                f"<td>{esc(g.get('vram_total_mb', 'N/A'))} MB</td>"
                f"<td>{esc(g.get('temperature', 'N/A'))}°C</td>"
                f"<td>{esc(g.get('clock_sm', 'N/A'))} MHz</td></tr>"
                for g in gpus
            )
            sections.append(
                f'<div class="section"><h2>GPU Information</h2>'
                f'<p>Driver: {esc(gpu_info.get("driver_version", "N/A"))} | '
                f'CUDA: {esc(gpu_info.get("cuda_version", "N/A"))} | '
                f'Count: {len(gpus)}</p>'
                f'<table><tr><th>GPU</th><th>Model</th><th>VRAM</th><th>Temp</th><th>SM Clock</th></tr>'
                f'{rows}</table></div>'
            )

        if "health" in results:
            passed = (results["health"] or {}).get("passed", False)
            cls = "pass" if passed else "fail"
            txt = "ALL PASSED" if passed else "SOME CHECKS FAILED"
            sections.append(f'<div class="verdict {cls}">{txt}</div>')

        # Use the same extraction helpers as the Markdown report so that
        # single-test result files (memory_bench / compute_bench) are covered.
        mem = self._extract_memory_results(results)
        if mem:
            sections.append(
                f'<div class="section"><h2>Memory Bandwidth</h2>'
                f'<div class="metric"><div class="value">{esc(mem.get("d2d_bandwidth_gbps", "N/A"))} GB/s</div>'
                f'<div class="label">D2D (HBM)</div></div>'
                f'<div class="metric"><div class="value">{esc(mem.get("efficiency_pct", "N/A"))}%</div>'
                f'<div class="label">Efficiency vs Peak ({esc(mem.get("peak_bandwidth_gbps", "N/A"))} GB/s)</div></div>'
                f'</div>'
            )

        comp = self._extract_compute_results(results)
        if comp:
            per_dtype = comp.get("per_dtype_tflops", {}) or {}
            eff = comp.get("efficiency_pct", {}) or {}
            dtype_rows = []
            for dt, tflops in per_dtype.items():
                if not isinstance(tflops, (int, float)):
                    continue  # skipped / errored dtypes carry a message string
                ef = self._num(eff.get(dt, 0))
                cls = "pass" if ef >= 80 else ("warn" if ef >= 50 else "fail")
                dtype_rows.append(
                    f'<tr><td>{esc(str(dt).upper())}</td><td>{tflops:.1f} TFLOPS</td>'
                    f'<td class="{cls}">{ef:.1f}%</td></tr>'
                )
            if dtype_rows:
                sections.append(
                    f'<div class="section"><h2>Compute Throughput</h2>'
                    f'<table><tr><th>DType</th><th>Achieved</th><th>Efficiency</th></tr>'
                    f'{"".join(dtype_rows)}</table></div>'
                )

        if "training" in results:
            t = results["training"] or {}
            sections.append(
                f'<div class="section"><h2>Training Simulation</h2>'
                f'<div class="metric"><div class="value">{esc(t.get("throughput_tokens_per_sec", "N/A"))}</div>'
                f'<div class="label">Tokens/sec</div></div>'
                f'<div class="metric"><div class="value">{esc(t.get("avg_step_time_ms", "N/A"))} ms</div>'
                f'<div class="label">Avg Step Time</div></div>'
                f'<div class="metric"><div class="value">{esc(t.get("peak_memory_gb", "N/A"))} GB</div>'
                f'<div class="label">Peak Memory</div></div>'
                f'</div>'
            )
        return sections

    # ------------------------------------------------------------------
    # Markdown report
    # ------------------------------------------------------------------
    def _generate_markdown(self, results: dict, output: str) -> str:
        """Render the Markdown report into ``output`` and return the path."""
        import socket

        hostname = results.get("hostname") or socket.gethostname()
        timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        lines = self._markdown_lines(results, hostname, timestamp)
        # --- Footer ---
        lines.append("---")
        lines.append(f"*Generated by GPU Test Suite v{__version__}*")

        # The report contains non-ASCII punctuation; write it as UTF-8 explicitly.
        with open(output, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))
        self.console.print(f"[green]Markdown report saved to: {output}[/green]")
        return output

    def _markdown_lines(self, results: dict, hostname: str, timestamp: str) -> list[str]:
        """Return every Markdown line of the report body (footer excluded)."""
        lines: list[str] = []
        lines.extend(self._md_header(results, hostname, timestamp))
        lines.extend(self._md_summary(results))
        lines.extend(self._md_gpu_info(results))
        lines.extend(self._md_health(results))
        lines.extend(self._md_memory(results))
        lines.extend(self._md_compute(results))
        lines.extend(self._md_nvlink(results))
        lines.extend(self._md_dcgm(results))
        lines.extend(self._md_nccl(results))
        lines.extend(self._md_stress(results))
        lines.extend(self._md_rdma(results))
        lines.extend(self._md_training(results))
        return lines

    @staticmethod
    def _md_header(results: dict, hostname: str, timestamp: str) -> list[str]:
        """Title block: date, host and (when available) GPU/driver overview."""
        lines = [
            "# GPU Test Report\n",
            f"- **Date:** {timestamp}",
            f"- **Host:** {hostname}",
        ]
        gpu_info = results.get("gpu_info")
        if gpu_info and not gpu_info.get("error"):
            gpus = gpu_info.get("gpus", []) or []
            gpu_name = gpus[0].get("name", "Unknown") if gpus else "Unknown"
            lines.append(f"- **GPU:** {gpu_name} x{gpu_info.get('gpu_count', len(gpus))}")
            lines.append(
                f"- **Driver:** {gpu_info.get('driver_version', 'N/A')} | "
                f"**CUDA:** {gpu_info.get('cuda_version', 'N/A')}"
            )
        lines.append("")
        return lines

    def _md_summary(self, results: dict) -> list[str]:
        """Overall acceptance verdict followed by the per-test summary table."""
        summary_items = self._build_summary(results)
        if not summary_items:
            return []
        verdict, failures, missing = self._overall_acceptance_verdict(summary_items)
        lines = ["## Overall Acceptance Verdict\n", f"**Result: {verdict}**", ""]
        if failures:
            lines.append("Failed or unverified items:")
            lines.extend(f"- {name}: {status}" for name, status in failures)
            lines.append("")
        if missing:
            lines.append("Missing required evidence:")
            lines.extend(f"- {name}" for name in missing)
            lines.append("")
        lines.append("## Summary\n")
        lines.append("| Test | Result |")
        lines.append("|------|--------|")
        lines.extend(f"| {name} | {result} |" for name, result in summary_items)
        lines.append("")
        return lines

    def _md_gpu_info(self, results: dict) -> list[str]:
        """Per-GPU inventory table."""
        gpu_info = results.get("gpu_info")
        if not gpu_info or gpu_info.get("error"):
            return []
        lines = [
            "## GPU Information\n",
            "| GPU | Model | VRAM | Temp | Power | SM Clock |",
            "|-----|-------|------|------|-------|----------|",
        ]
        for g in gpu_info.get("gpus", []) or []:
            vram = f"{g.get('vram_total_mb', 0)} MB"
            temp = f"{g.get('temperature', 'N/A')}C"
            power = (
                f"{self._num(g.get('power_draw', 0)):.0f}/"
                f"{self._num(g.get('power_limit', 0)):.0f}W"
            )
            clock = f"{g.get('clock_sm', 0)} MHz"
            lines.append(
                f"| {g.get('index', '?')} | {g.get('name', 'Unknown')} | "
                f"{vram} | {temp} | {power} | {clock} |"
            )
        lines.append("")
        return lines

    def _md_health(self, results: dict) -> list[str]:
        """Health-check verdict and per-GPU check table."""
        health = results.get("health")
        if not health or health.get("error"):
            return []
        passed = health.get("passed", False)
        lines = ["## Health Check\n", f"**Overall: {'PASS' if passed else 'FAIL'}**\n"]
        gpu_health = health.get("gpu_health", []) or []
        if gpu_health:
            lines.append("| GPU | Temp | Power | ECC | PCIe | Throttle | Status |")
            lines.append("|-----|------|-------|-----|------|----------|--------|")
            for gh in gpu_health:
                checks = gh.get("checks", {}) or {}
                temp_c = checks.get("temperature", {}) or {}
                pwr = checks.get("power", {}) or {}
                ecc = checks.get("ecc_errors", {}) or {}
                pcie = checks.get("pcie_link", {}) or {}
                throttle = checks.get("throttling", {}) or {}
                temp_str = f"{temp_c.get('value', '?')}C {temp_c.get('status', '')}"
                pwr_str = f"{self._num(pwr.get('value', 0)):.0f}W {pwr.get('status', '')}"
                ecc_str = f"S:{ecc.get('single', 0)} D:{ecc.get('double', 0)}"
                pcie_str = f"Gen{pcie.get('gen', '?')}x{pcie.get('width', '?')}"
                throt_str = throttle.get("status", "?")
                status = gh.get("status", "?")
                lines.append(
                    f"| {gh.get('index', '?')} | {temp_str} | {pwr_str} | "
                    f"{ecc_str} | {pcie_str} | {throt_str} | **{status}** |"
                )
            lines.append("")
        return lines

    def _md_memory(self, results: dict) -> list[str]:
        """Memory-bandwidth table and verdict."""
        mem = self._extract_memory_results(results)
        if not mem or mem.get("error"):
            return []
        num = self._num
        lines = [
            "## Memory Bandwidth\n",
            f"Source: {mem.get('source', 'unknown')}\n",
            "| Metric | Value | Peak | Efficiency |",
            "|--------|-------|------|------------|",
        ]
        d2d = num(mem.get("d2d_bandwidth_gbps"))
        h2d = num(mem.get("h2d_bandwidth_gbps"))
        d2h = num(mem.get("d2h_bandwidth_gbps"))
        # New format with per-metric peaks
        h2d_peak = num(mem.get("h2d_peak_gbps"))
        d2h_peak = num(mem.get("d2h_peak_gbps"))
        d2d_peak = num(mem.get("d2d_peak_gbps"))
        h2d_eff = num(mem.get("h2d_efficiency_pct"))
        d2h_eff = num(mem.get("d2h_efficiency_pct"))
        d2d_eff = num(mem.get("d2d_efficiency_pct"))
        # Fallback for old format. Peak and efficiency fall back independently,
        # so a present d2d_efficiency_pct is never overwritten and the verdict
        # agrees with the one computed in _build_summary.
        if not d2d_peak:
            d2d_peak = num(mem.get("peak_bandwidth_gbps"))
        if not d2d_eff:
            d2d_eff = num(mem.get("efficiency_pct"))
        lines.append(f"| H2D (PCIe) | {h2d:.1f} GB/s | {h2d_peak:.0f} GB/s | {h2d_eff:.1f}% |")
        lines.append(f"| D2H (PCIe) | {d2h:.1f} GB/s | {d2h_peak:.0f} GB/s | {d2h_eff:.1f}% |")
        lines.append(f"| D2D (NVLink) | {d2d:.1f} GB/s | {d2d_peak:.0f} GB/s | {d2d_eff:.1f}% |")
        lines.append("")
        # PyTorch fallback can't accurately measure HBM peak (intra-GPU copy_()
        # only reaches ~20% of HBM bandwidth). When fallback is used, report
        # the number but mark as WARN with a note instead of evaluating as FAIL.
        if mem.get("source") == "pytorch":
            lines.append(
                f"**Verdict: WARN** (D2D {d2d:.1f} GB/s via PyTorch fallback; "
                "nvbandwidth unavailable — figure is indicative only, not a true HBM peak)\n"
            )
        else:
            # Tightened to match production acceptance: PASS >= 80%, WARN 60-80%, FAIL < 60%.
            verdict = "PASS" if d2d_eff >= 80 else ("WARN" if d2d_eff >= 60 else "FAIL")
            lines.append(f"**Verdict: {verdict}** (D2D efficiency {d2d_eff:.1f}%)\n")
        return lines

    def _md_compute(self, results: dict) -> list[str]:
        """Compute-throughput table, verdict, consistency and per-GPU tables."""
        comp = self._extract_compute_results(results)
        if not comp or comp.get("error"):
            return []
        num = self._num
        per_dtype = comp.get("per_dtype_tflops", {}) or {}
        peak_tflops = comp.get("peak_tflops", {}) or {}
        eff_pct = comp.get("efficiency_pct", {}) or {}
        # Absolute PASS thresholds (TFLOPS) from gpu_specs.compute_pass_thresholds_tflops.
        # When present, override the legacy 80%-of-peak rule on a per-dtype basis.
        pass_thresholds = comp.get("pass_thresholds_tflops", {}) or {}
        consistency = comp.get("consistency", {}) or {}
        use_abs = bool(pass_thresholds)

        lines = ["## Compute Throughput\n"]
        if use_abs:
            lines.append("| DType | Achieved (TFLOPS) | Peak | Threshold | Status |")
        else:
            lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |")
        lines.append("|-------|-------------------|------|------------|--------|")

        rank = {"PASS": 0, "WARN": 1, "FAIL": 2}
        overall_status = "PASS"
        valid_effs: list[float] = []
        evaluated = 0
        for dt, val in per_dtype.items():
            if not isinstance(val, (int, float)):
                # skipped or error: the value is a message, not a measurement
                lines.append(f"| {dt.upper()} | {val} | - | N/A | SKIP |")
                continue
            evaluated += 1
            pk = num(peak_tflops.get(dt, 0))
            ef = num(eff_pct.get(dt, 0))
            if ef > 0:
                valid_effs.append(ef)
            thr = pass_thresholds.get(dt)
            if use_abs and num(thr) > 0:
                status = "PASS" if val >= num(thr) else "FAIL"
                lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |")
            else:
                status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL")
                lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |")
            if rank[status] > rank[overall_status]:
                overall_status = status
        lines.append("")

        if use_abs:
            if any(not row.get("passed", False) for row in consistency.values()):
                overall_status = "FAIL"
            elif not evaluated:
                # Nothing was measured, so the section must not claim PASS.
                overall_status = "N/A"
            worst_txt = f"{min(valid_effs):.1f}%" if valid_effs else "N/A"
            lines.append(
                f"**Verdict: {overall_status}** "
                f"(absolute TFLOPS thresholds; worst efficiency {worst_txt})\n"
            )
        elif valid_effs:
            worst_eff = min(valid_effs)
            overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
            lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n")
        else:
            # Mirrors the "N/A" summary row produced by _build_summary.
            lines.append("**Verdict: N/A** (no efficiency data available)\n")

        if consistency:
            lines.append("### Compute Consistency\n")
            lines.append("| DType | Min | Mean | Max | Spread | Limit | Status |")
            lines.append("|-------|-----|------|-----|--------|-------|--------|")
            for dt, row in consistency.items():
                status = "PASS" if row.get("passed") else "FAIL"
                lines.append(
                    f"| {dt.upper()} | {num(row.get('min_tflops', 0)):.1f} | "
                    f"{num(row.get('mean_tflops', 0)):.1f} | {num(row.get('max_tflops', 0)):.1f} | "
                    f"{num(row.get('spread_pct', 0)):.2f}% | <= {row.get('max_allowed_pct', 3)}% | {status} |"
                )
            lines.append("")

        per_gpu = comp.get("per_gpu", []) or []
        dtype_order = [dt for dt, val in per_dtype.items() if not isinstance(val, str)]
        if per_gpu and dtype_order:
            lines.append("### Compute Per-GPU TFLOPS\n")
            headers = ["GPU", *[dt.upper() for dt in dtype_order]]
            lines.append("| " + " | ".join(headers) + " |")
            lines.append("|" + "|".join(["---"] * len(headers)) + "|")
            for row in per_gpu:
                cells = [str(row.get("index", ""))]
                for dt in dtype_order:
                    val = row.get(dt, "")
                    cells.append(f"{val:.1f}" if isinstance(val, (int, float)) else str(val))
                lines.append("| " + " | ".join(cells) + " |")
            lines.append("")
        return lines

    @staticmethod
    def _md_nvlink(results: dict) -> list[str]:
        """NVLink/NVSwitch verdict and per-GPU link table."""
        nvlink = results.get("nvlink")
        if not nvlink:
            return []
        if nvlink.get("error"):
            return ["## NVLink/NVSwitch\n", f"**Overall: FAIL** ({nvlink.get('error')})\n"]
        lines = [
            "## NVLink/NVSwitch\n",
            f"**Overall: {'PASS' if nvlink.get('passed') else 'FAIL'}**\n",
            "| GPU | Active Links | Issues |",
            "|-----|--------------|--------|",
        ]
        for g in nvlink.get("gpus", []) or []:
            issues = []
            if g.get("inactive_links"):
                # Link identifiers may be integers; str.join needs strings.
                issues.append("inactive=" + ",".join(str(link) for link in g["inactive_links"]))
            if g.get("speed_issues"):
                issues.append(f"speed issues={len(g['speed_issues'])}")
            if g.get("error_issues"):
                issues.append(f"errors={len(g['error_issues'])}")
            lines.append(
                f"| {g.get('gpu')} | {g.get('active_links')}/{g.get('expected_links')} | "
                f"{', '.join(issues) or 'OK'} |"
            )
        lines.append("")
        return lines

    @staticmethod
    def _md_dcgm(results: dict) -> list[str]:
        """DCGM diagnostic verdict and subtest table."""
        dcgm = results.get("dcgm")
        if not dcgm:
            return []
        if dcgm.get("error"):
            return ["## DCGM Diagnostic\n", f"**Overall: FAIL** ({dcgm.get('error')})\n"]
        lines = [
            "## DCGM Diagnostic\n",
            f"**Overall: {'PASS' if dcgm.get('passed') else 'FAIL'}**\n",
        ]
        subtests = dcgm.get("subtests")
        if subtests:
            lines.append("| Subtest | Status |")
            lines.append("|---------|--------|")
            lines.extend(f"| {s.get('name', '')} | {s.get('status', '')} |" for s in subtests)
            lines.append("")
        return lines

    def _md_nccl(self, results: dict) -> list[str]:
        """NCCL summary table, per-size detail tables and verdict."""
        nccl = results.get("nccl")
        if not nccl or nccl.get("error"):
            return []
        num = self._num
        lines = [
            "## NCCL Multi-GPU\n",
            f"Source: {nccl.get('source', 'unknown')} | GPUs: {nccl.get('gpu_count', '?')}\n",
        ]
        if nccl.get("source") == "torchrun_fallback":
            lines.append("> Functional NCCL smoke only: nccl-tests bus bandwidth was not measured, so this does not satisfy production acceptance.\n")
        tests = nccl.get("tests", {}) or {}
        if tests:
            lines.append("> Summary reports the best Bus BW observed for each operation. PASS/FAIL is evaluated across every tested message size and repeat run shown in the detail table below.\n")
            lines.append("| Operation | Best Bus BW (GB/s) | Failed Sizes | Threshold | Status |")
            lines.append("|-----------|--------------------|--------------|-----------|--------|")
            for op, data in tests.items():
                if not isinstance(data, dict):
                    continue
                if data.get("error"):
                    lines.append(f"| {op} | - | - | - | ERROR: {data['error']} |")
                    continue
                bw = num(data.get("best_busbw_gbps", 0))
                req = num(data.get("min_required_gbps", 0))
                status = data.get("status", "?")
                failed_sizes = [
                    str(row.get("size", "?"))
                    for row in data.get("by_size", []) or []
                    if row.get("status") != "PASS"
                ]
                failed_sizes_text = ", ".join(failed_sizes) if failed_sizes else "-"
                lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {req:.0f} | {status} |")
            lines.append("")
            for op, data in tests.items():
                by_size = (data.get("by_size") or []) if isinstance(data, dict) else []
                if not by_size:
                    continue
                lines.append(f"### NCCL {op} by size\n")
                lines.append("| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |")
                lines.append("|------|---------------------|-------|------|--------|-----------|--------|")
                for row in by_size:
                    runs = ", ".join(str(v) for v in row.get("runs_busbw_gbps", []) or [])
                    lines.append(
                        f"| {row.get('size', '')} | {runs} | "
                        f"{num(row.get('worst_busbw_gbps', 0)):.1f} | "
                        f"{num(row.get('mean_busbw_gbps', 0)):.1f} | "
                        f"{num(row.get('stddev_pct', 0)):.2f}% | "
                        f">= {num(data.get('min_required_gbps', 0)):.0f} | "
                        f"{row.get('status', '?')} |"
                    )
                lines.append("")
        passed = nccl.get("passed", False)
        lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")
        return lines

    def _md_stress(self, results: dict) -> list[str]:
        """Stress-test bullet list (duration, telemetry, failures, result)."""
        stress = results.get("stress")
        if not stress or stress.get("error"):
            return []
        passed = stress.get("passed", False)
        duration = stress.get("duration_sec") or 0
        elapsed = self._num(stress.get("elapsed_sec"))
        lines = [
            "## Stress Test\n",
            f"- **Source:** {stress.get('source', 'unknown')}",
            f"- **Duration:** {elapsed:.0f}s (requested {duration}s)",
        ]
        telemetry = stress.get("telemetry") or {}
        if telemetry:
            throttle_count = telemetry.get(
                "throttle_event_count", len(telemetry.get("throttle_events") or [])
            )
            xid_count = len(telemetry.get("xid_events") or [])
            lines.append(f"- **Telemetry samples:** {telemetry.get('samples', 0)}")
            lines.append(f"- **Max temp:** {telemetry.get('max_temp_c', {})}")
            lines.append(f"- **Avg power:** {telemetry.get('avg_power_w', {})}")
            lines.append(f"- **Temp delta:** {telemetry.get('temp_delta_c', 'N/A')} C")
            lines.append(f"- **TFLOPS jitter:** {telemetry.get('tflops_jitter_pct', 'N/A')}%")
            lines.append(f"- **Steady TFLOPS samples:** {telemetry.get('steady_tflops_samples', 0)}")
            lines.append(f"- **Throttle events:** {throttle_count}")
            lines.append(f"- **XID events:** {xid_count}")
        failures = telemetry.get("failures") or []
        if failures:
            lines.append("- **Failure reasons:**")
            lines.extend(f"  - {reason}" for reason in failures)
        lines.append(f"- **Result: {'PASS' if passed else 'FAIL'}**")
        lines.append("")
        return lines

    def _md_rdma(self, results: dict) -> list[str]:
        """RDMA/InfiniBand port checks, test table, fabric counters and verdict."""
        rdma = results.get("rdma")
        if not rdma:
            return []
        if rdma.get("skipped") or rdma.get("status") == "SKIP":
            return [
                "## RDMA/InfiniBand\n",
                f"**Overall: SKIP** [{rdma.get('reason', 'no IB hardware detected')}]\n",
            ]
        if rdma.get("error"):
            return []

        num = self._num
        lines = ["## RDMA/InfiniBand\n"]
        rdma_legacy_note = self._rdma_legacy_note(rdma)
        if rdma_legacy_note:
            lines.append(f"> {rdma_legacy_note}\n")

        port_checks = rdma.get("port_checks", []) or []
        if port_checks:
            lines.append("### RDMA Port Checks\n")
            lines.append("| Device | Port | State | Rate | Required | Status |")
            lines.append("|--------|------|-------|------|----------|--------|")
            for p in port_checks:
                lines.append(
                    f"| {p.get('device', '')} | {p.get('port', '')} | "
                    f"{p.get('state', '')} | {p.get('rate', '')} | "
                    f">= {num(p.get('min_rate_gbps', 400), 400):.0f}Gbps ACTIVE | {p.get('status', '?')} |"
                )
            lines.append("")

        bw_tests = rdma.get("bandwidth_tests", []) or []
        lat_tests = rdma.get("latency_tests", []) or []
        ibping_tests = rdma.get("ibping_tests", []) or []
        if bw_tests or lat_tests or ibping_tests:
            lines.append("| Test | Value | Threshold | Status |")
            lines.append("|------|-------|-----------|--------|")
            for bt in bw_tests:
                if bt.get("error"):
                    lines.append(f"| {bt.get('test', 'ib_bw')} | {bt.get('error')} | required runnable test | {bt.get('status', 'FAIL')} |")
                else:
                    threshold, status = self._rdma_bandwidth_verdict(bt)
                    lines.append(
                        f"| {bt.get('test', 'ib_bw')} | {num(bt.get('bandwidth_gbps', 0)):.1f} GB/s | "
                        f">= {threshold:g} GB/s | {status} |"
                    )
            for lt in lat_tests:
                if lt.get("error"):
                    lines.append(f"| {lt.get('test', 'ib_lat')} | {lt.get('error')} | required runnable test | {lt.get('status', 'FAIL')} |")
                else:
                    threshold, status = self._rdma_latency_verdict(lt)
                    lines.append(
                        f"| {lt.get('test', 'ib_lat')} | {num(lt.get('latency_us', 0)):.2f} us | "
                        f"<= {threshold:g} us | {status} |"
                    )
            for it in ibping_tests:
                direction = it.get("direction") or it.get("role", "N/A")
                if it.get("error"):
                    lines.append(f"| {it.get('test', 'ibping')} | {it.get('error')} | bidirectional peer evidence | {it.get('status', 'FAIL')} |")
                else:
                    lines.append(
                        f"| {it.get('test', 'ibping')} | {direction} target={it.get('target', 'N/A')} "
                        f"count={it.get('count', 'N/A')} | "
                        f"0% packet loss | {it.get('status', '?')} |"
                    )
            lines.append("")

        bullets: list[str] = []
        fabric = rdma.get("fabric_counters") or {}
        if fabric:
            counters = fabric.get("counters", {}) or {}
            bullets.append(f"- **PFC/ECN/CNP/congestion counters checked:** {len(counters)}")
            bullets.append(f"- **PFC/ECN/CNP/congestion non-zero:** {'yes' if fabric.get('failed') else 'no'}")
            if not counters:
                bullets.append("- **PFC/ECN/CNP/congestion evidence:** missing")
        failures = rdma.get("failures") or self._rdma_failure_reasons(rdma)
        if failures:
            bullets.append("- **Failure reasons:**")
            bullets.extend(f"  - {reason}" for reason in failures)
        if bullets:
            lines.extend(bullets)
            # Blank line so the verdict is its own paragraph instead of being
            # rendered as a continuation of the last bullet.
            lines.append("")

        passed = rdma.get("passed", False)
        lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")
        return lines

    def _md_training(self, results: dict) -> list[str]:
        """Training-simulation metric table and verdict."""
        training = results.get("training")
        if not training or training.get("error"):
            return []
        num = self._num
        training_status, training_detail, training_missing = self._training_verdict(training)
        lines = [
            "## Training Simulation\n",
            "| Metric | Value |",
            "|--------|-------|",
            f"| Model | {training.get('model', 'N/A')} |",
            f"| Params | {num(training.get('total_params_m', 0)):.1f}M |",
            f"| Throughput | {num(training.get('throughput_tokens_per_sec', 0)):.0f} tokens/sec |",
            f"| Avg Step Time | {num(training.get('avg_step_time_ms', 0)):.1f} ms |",
            f"| Warmup Steps | {training.get('warmup_steps', 'N/A')} |",
            f"| Peak Memory | {num(training.get('peak_memory_gb', 0)):.1f} GB |",
            f"| Final Loss | {training.get('final_loss', 'N/A')} |",
            f"| Step Jitter | {training.get('step_jitter_pct', 'N/A')}% |",
            f"| Distributed Mode | {training.get('distributed_mode', 'N/A')} |",
        ]
        if training_missing:
            lines.append(f"| Acceptance Gaps | missing {', '.join(training_missing)} |")
        lines.append(f"| Verdict | {training_status} ({training_detail}) |")
        lines.append("")
        return lines

    # ------------------------------------------------------------------
    # Result extraction
    # ------------------------------------------------------------------
    @staticmethod
    def _extract_bench_section(results: dict, single_key: str, section: str) -> dict:
        """Return one benchmark section from the single-test or full-suite layout."""
        if single_key in results:
            data = results[single_key]
            return (data.get(section, data) or {}) if isinstance(data, dict) else {}
        bench = results.get("benchmark")
        if isinstance(bench, dict) and section in bench:
            return bench[section] or {}
        return {}

    def _extract_memory_results(self, results: dict) -> dict:
        """Extract memory benchmark data from either full-suite or single-test format."""
        return self._extract_bench_section(results, "memory_bench", "memory")

    def _extract_compute_results(self, results: dict) -> dict:
        """Extract compute benchmark data from either full-suite or single-test format."""
        return self._extract_bench_section(results, "compute_bench", "compute")

    # ------------------------------------------------------------------
    # Verdict helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _training_verdict(training: dict) -> tuple[str, str, list[str]]:
        """Return report status for both current and legacy training result schemas.

        Returns ``(status, detail, missing_keys)``. Results that carry an
        explicit ``passed`` flag yield PASS/FAIL; older results without it are
        reported as UNVERIFIED together with the acceptance keys they lack.
        """
        tps = float(training.get("throughput_tokens_per_sec", 0) or 0)
        if "passed" in training:
            status = "PASS" if training.get("passed") else "FAIL"
            return status, f"{tps:.0f} tokens/sec", []
        required = ["passed", "step_jitter_pct", "distributed_mode", "loss_finite"]
        missing = [k for k in required if k not in training]
        return "UNVERIFIED", f"{tps:.0f} tokens/sec; legacy result lacks explicit acceptance verdict", missing

    def _rdma_cfg_value(self, key: str, default: float) -> float:
        """Read a numeric RDMA threshold from the config, or ``default`` if unusable."""
        try:
            return float((self.config.get("rdma", {}) or {}).get(key, default))
        except (TypeError, ValueError, AttributeError):
            return default

    def _rdma_bandwidth_verdict(self, row: dict) -> tuple[float, str]:
        """Return ``(threshold, status)`` for one RDMA bandwidth row."""
        threshold = self._rdma_cfg_value("min_bandwidth_gbps", 47.0)
        value = self._num(row.get("bandwidth_gbps", 0))
        return threshold, "PASS" if value >= threshold else "FAIL"

    def _rdma_latency_verdict(self, row: dict) -> tuple[float, str]:
        """Return ``(threshold, status)`` for one RDMA latency row.

        The limit depends on the test name; a value of zero or less is treated
        as a missing measurement and fails.
        """
        name = row.get("test", "")
        if name == "ib_write_lat":
            threshold = self._rdma_cfg_value("max_write_latency_us", 2.0)
        elif name == "ib_read_lat":
            threshold = self._rdma_cfg_value("max_read_latency_us", 3.5)
        else:
            threshold = self._rdma_cfg_value("max_latency_us", 3.5)
        value = self._num(row.get("latency_us", 0))
        return threshold, "PASS" if 0 < value <= threshold else "FAIL"

    def _rdma_legacy_note(self, rdma: dict) -> str:
        """Flag old RDMA result schemas whose embedded thresholds were looser."""
        note = (
            "Legacy RDMA result re-evaluated with current PDF acceptance thresholds; "
            "old WARN statuses and old 50GB/s/10us limits are not used for verdict."
        )
        min_bandwidth = self._rdma_cfg_value("min_bandwidth_gbps", 47.0)
        for row in rdma.get("bandwidth_tests", []) or []:
            if row.get("min_required_gbps") != min_bandwidth:
                return note
        for row in rdma.get("latency_tests", []) or []:
            threshold, _ = self._rdma_latency_verdict(row)
            if row.get("max_allowed_us") != threshold:
                return note
        return ""

    def _rdma_failure_reasons(self, rdma: dict) -> list[str]:
        """Derive human-readable failure reasons from the RDMA test rows.

        Rows that carry an ``error`` are reported with that error, and a
        non-positive latency is reported as a missing measurement instead of
        the misleading "0us > limit".
        """
        failures = []
        for row in rdma.get("bandwidth_tests", []) or []:
            if row.get("error"):
                failures.append(f"{row.get('test', 'ib_bw')} error: {row['error']}")
                continue
            threshold, status = self._rdma_bandwidth_verdict(row)
            if status != "PASS":
                failures.append(
                    f"{row.get('test')} bandwidth {row.get('bandwidth_gbps', 0)}GB/s < {threshold:g}GB/s"
                )
        for row in rdma.get("latency_tests", []) or []:
            if row.get("error"):
                failures.append(f"{row.get('test', 'ib_lat')} error: {row['error']}")
                continue
            threshold, status = self._rdma_latency_verdict(row)
            if status == "PASS":
                continue
            if self._num(row.get("latency_us", 0)) <= 0:
                failures.append(
                    f"{row.get('test')} latency {row.get('latency_us', 0)}us is missing or non-positive "
                    f"(required 0 < latency <= {threshold:g}us)"
                )
            else:
                failures.append(
                    f"{row.get('test')} latency {row.get('latency_us', 0)}us > {threshold:g}us"
                )
        for row in rdma.get("ibping_tests", []) or []:
            if row.get("status") != "PASS":
                failures.append(f"{row.get('test')} failed")
        return failures

    @staticmethod
    def _overall_acceptance_verdict(summary_items: list[tuple[str, str]]) -> tuple[str, list[tuple[str, str]], list[str]]:
        """PDF-style machine verdict: every required item must be present and PASS.

        Returns ``(verdict, failures, missing)`` where ``failures`` lists the
        required rows whose status does not start with ``PASS`` and ``missing``
        lists required rows absent from ``summary_items``.
        """
        required = [
            "GPU Info",
            "Health Check",
            "Memory Bandwidth",
            "Compute Throughput",
            "NVLink/NVSwitch",
            "NCCL",
            "Stress Test",
            "RDMA",
            "DCGM",
            "Training",
        ]
        present = {name for name, _status in summary_items}
        missing = [name for name in required if name not in present]
        failures = [
            (name, status)
            for name, status in summary_items
            if name in required and not str(status).startswith("PASS")
        ]
        verdict = "PASS" if not missing and not failures else "FAIL"
        return verdict, failures, missing

    def _compute_summary_verdict(self, comp: dict) -> str:
        """Return the summary text for the compute benchmark.

        With absolute thresholds, a failed consistency row always yields FAIL,
        and the result is ``N/A`` (never PASS) when no dtype could be compared
        against its threshold. Without thresholds the legacy efficiency rule
        applies (PASS >= 80%, WARN >= 50%).
        """
        per_dtype = comp.get("per_dtype_tflops", {}) or {}
        pass_thresholds = comp.get("pass_thresholds_tflops", {}) or {}

        if not pass_thresholds:
            eff_pct = comp.get("efficiency_pct", {}) or {}
            valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0]
            if not valid_effs:
                return "N/A"
            worst = min(valid_effs)
            verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL")
            return f"{verdict} (worst {worst:.1f}%)"

        consistency = comp.get("consistency", {}) or {}
        failed_consistency = [
            (dt, row) for dt, row in consistency.items() if not row.get("passed", False)
        ]
        if failed_consistency:
            fail_dt, fail_row = failed_consistency[0]
            return (
                f"FAIL ({fail_dt.upper()} spread "
                f"{self._num(fail_row.get('spread_pct', 0)):.2f}% > "
                f"{fail_row.get('max_allowed_pct', 3)}%)"
            )

        # Absolute TFLOPS judgment over the dtypes that have a usable threshold.
        worst_status = "PASS"
        worst_dt = None
        lowest_margin = None
        for dt, thr in pass_thresholds.items():
            val = per_dtype.get(dt)
            if not isinstance(val, (int, float)):
                continue
            if not isinstance(thr, (int, float)) or thr <= 0:
                continue  # unusable threshold: nothing to compare against
            if val < thr:
                worst_status = "FAIL"
            margin = val / thr
            if lowest_margin is None or margin < lowest_margin:
                lowest_margin = margin
                worst_dt = dt
        if worst_dt is None:
            return "N/A (no dtype evaluated against thresholds)"
        return (
            f"{worst_status} (worst {worst_dt.upper()} "
            f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
        )

    def _build_summary(self, results: dict) -> list[tuple[str, str]]:
        """Build summary verdict list from results.

        Returns ``(test name, verdict text)`` pairs, one per test section that
        is present in ``results``, in report order.
        """
        items: list[tuple[str, str]] = []

        # GPU Info
        if "gpu_info" in results:
            gi = results["gpu_info"] or {}
            if gi.get("error"):
                items.append(("GPU Info", f"ERROR: {gi['error']}"))
            else:
                items.append(("GPU Info", f"PASS ({gi.get('gpu_count', '?')} GPUs detected)"))

        # Health
        if "health" in results:
            items.append(self._pass_fail_item("Health Check", results["health"]))

        # Memory Bandwidth
        mem = self._extract_memory_results(results)
        if mem:
            if mem.get("error"):
                items.append(("Memory Bandwidth", f"ERROR: {mem['error']}"))
            elif mem.get("source") == "pytorch":
                # PyTorch fallback can't reach HBM peak — report as WARN, not FAIL.
                d2d = self._num(mem.get("d2d_bandwidth_gbps"))
                items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
            else:
                eff = self._num(mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct"))
                verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL")
                items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))

        # Compute
        comp = self._extract_compute_results(results)
        if comp:
            if comp.get("error"):
                items.append(("Compute Throughput", f"ERROR: {comp['error']}"))
            else:
                items.append(("Compute Throughput", self._compute_summary_verdict(comp)))

        # NVLink/NVSwitch
        if "nvlink" in results:
            items.append(self._pass_fail_item("NVLink/NVSwitch", results["nvlink"]))

        # DCGM
        if "dcgm" in results:
            items.append(self._pass_fail_item("DCGM", results["dcgm"]))

        # NCCL
        if "nccl" in results:
            n = results["nccl"] or {}
            if not n.get("error") and n.get("source") == "torchrun_fallback":
                items.append(("NCCL", "FAIL (no nccl-tests bus BW)"))
            else:
                items.append(self._pass_fail_item("NCCL", n))

        # Stress
        if "stress" in results:
            items.append(self._pass_fail_item("Stress Test", results["stress"]))

        # RDMA
        if "rdma" in results:
            r = results["rdma"] or {}
            if r.get("skipped") or r.get("status") == "SKIP":
                items.append(("RDMA", f"SKIP ({r.get('reason', 'no IB hardware')})"))
            else:
                items.append(self._pass_fail_item("RDMA", r))

        # Training
        if "training" in results:
            t = results["training"] or {}
            if t.get("error"):
                items.append(("Training", f"ERROR: {t['error']}"))
            else:
                status, detail, _missing = self._training_verdict(t)
                items.append(("Training", f"{status} ({detail})"))
        return items