# test_gpu_scripts/modules/report.py

"""Report generation module - export test results to JSON/HTML/Markdown."""

import json
import os
from datetime import datetime
from pathlib import Path
from typing import Optional

# Resolve the suite version from the installed distribution metadata; the
# Markdown report footer prints this value.
try:
    from importlib.metadata import version as _pkg_version
    __version__ = _pkg_version("gpu-server-test-suite")
except Exception:
    # Best-effort: any failure of the import or the lookup falls back to a
    # hard-coded version string instead of breaking module import.
    __version__ = "0.2.0"

from rich.console import Console
from rich.panel import Panel

# Page skeleton for the HTML report. It is filled in with str.format(), so the
# only live placeholders are {timestamp}, {hostname} and {content}; every
# literal CSS brace is doubled ({{ }}) so it survives formatting.
HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>GPU Test Report - {timestamp}</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, monospace;
               background: #0d1117; color: #c9d1d9; padding: 2rem; }}
        .header {{ background: linear-gradient(135deg, #1a1a2e, #16213e);
                   padding: 2rem; border-radius: 8px; margin-bottom: 2rem;
                   border: 1px solid #30363d; }}
        .header h1 {{ color: #58a6ff; font-size: 1.5rem; }}
        .header .meta {{ color: #8b949e; margin-top: 0.5rem; }}
        .section {{ background: #161b22; border: 1px solid #30363d;
                    border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; }}
        .section h2 {{ color: #58a6ff; margin-bottom: 1rem; font-size: 1.2rem;
                       border-bottom: 1px solid #30363d; padding-bottom: 0.5rem; }}
        table {{ width: 100%; border-collapse: collapse; margin: 0.5rem 0; }}
        th {{ background: #21262d; color: #8b949e; text-align: left;
             padding: 0.5rem; font-weight: 600; font-size: 0.85rem; }}
        td {{ padding: 0.5rem; border-bottom: 1px solid #21262d; font-size: 0.9rem; }}
        .pass {{ color: #3fb950; }} .warn {{ color: #d29922; }} .fail {{ color: #f85149; }}
        .metric {{ display: inline-block; background: #21262d; padding: 0.75rem 1.5rem;
                  border-radius: 6px; margin: 0.25rem; text-align: center; min-width: 120px; }}
        .metric .value {{ font-size: 1.3rem; font-weight: bold; color: #58a6ff; }}
        .metric .label {{ font-size: 0.75rem; color: #8b949e; margin-top: 0.25rem; }}
        .verdict {{ padding: 1rem; border-radius: 6px; text-align: center; font-size: 1.1rem;
                   font-weight: bold; margin: 1rem 0; }}
        .verdict.pass {{ background: #0d2818; color: #3fb950; border: 1px solid #238636; }}
        .verdict.fail {{ background: #2d0b0b; color: #f85149; border: 1px solid #da3633; }}
    </style>
</head>
<body>
    <div class="header">
        <h1>GPU Training Server Test Report</h1>
        <div class="meta">Generated: {timestamp} | Server: {hostname}</div>
    </div>
    {content}
</body>
</html>"""


class ReportGenerator:

    def __init__(self, config: dict):
        """Store the configuration and prepare a console for status messages.

        Args:
            config: Full suite configuration. Its optional ``"report"``
                mapping is kept separately as ``report_cfg`` (empty dict
                when the key is absent).
        """
        report_section = config.get("report", {})
        self.report_cfg = report_section
        self.console = Console()
        self.config = config

    def generate(self, results: dict, fmt: str = None, output: str = None) -> str:
        fmt = fmt or self.report_cfg.get("format", "json")
        output_dir = self.report_cfg.get("output_dir", "./reports")
        os.makedirs(output_dir, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        if not output:
            output = os.path.join(output_dir, f"gpu_report_{timestamp}.{fmt}")

        if fmt == "json":
            return self._generate_json(results, output)
        elif fmt == "html":
            return self._generate_html(results, output)
        elif fmt == "md":
            return self._generate_markdown(results, output)
        else:
            self.console.print(f"[red]Unsupported format: {fmt}[/red]")
            return ""

    def _generate_json(self, results: dict, output: str) -> str:
        with open(output, "w") as f:
            json.dump(results, f, indent=2, default=str)
        self.console.print(f"[green]JSON report saved to: {output}[/green]")
        return output

    def _generate_html(self, results: dict, output: str) -> str:
        import socket
        hostname = socket.gethostname()
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        sections = []

        if "gpu_info" in results:
            gpus = results["gpu_info"].get("gpus", [])
            rows = ""
            for g in gpus:
                rows += f"<tr><td>GPU {g['index']}</td><td>{g['name']}</td>"
                rows += f"<td>{g['vram_total_mb']} MB</td>"
                rows += f"<td>{g['temperature']}°C</td>"
                rows += f"<td>{g['clock_sm']} MHz</td></tr>"
            sections.append(
                f'<div class="section"><h2>GPU Information</h2>'
                f'<p>Driver: {results["gpu_info"].get("driver_version", "N/A")} | '
                f'CUDA: {results["gpu_info"].get("cuda_version", "N/A")} | '
                f'Count: {len(gpus)}</p>'
                f'<table><tr><th>GPU</th><th>Model</th><th>VRAM</th><th>Temp</th><th>SM Clock</th></tr>'
                f'{rows}</table></div>'
            )

        if "health" in results:
            h = results["health"]
            passed = h.get("passed", False)
            cls = "pass" if passed else "fail"
            txt = "ALL PASSED" if passed else "SOME CHECKS FAILED"
            sections.append(f'<div class="verdict {cls}">{txt}</div>')

        if "benchmark" in results and "memory" in results["benchmark"]:
            mem = results["benchmark"]["memory"]
            sections.append(
                f'<div class="section"><h2>Memory Bandwidth</h2>'
                f'<div class="metric"><div class="value">{mem.get("d2d_bandwidth_gbps", "N/A")} GB/s</div>'
                f'<div class="label">D2D (HBM)</div></div>'
                f'<div class="metric"><div class="value">{mem.get("efficiency_pct", "N/A")}%</div>'
                f'<div class="label">Efficiency vs Peak ({mem.get("peak_bandwidth_gbps", "N/A")} GB/s)</div></div>'
                f'</div>'
            )

        if "benchmark" in results and "compute" in results["benchmark"]:
            comp = results["benchmark"]["compute"]
            dtype_rows = ""
            per_dtype = comp.get("per_dtype_tflops", {})
            eff = comp.get("efficiency_pct", {})
            for dt, tflops in per_dtype.items():
                ef = eff.get(dt, 0)
                cls = "pass" if ef >= 80 else ("warn" if ef >= 50 else "fail")
                if isinstance(tflops, (int, float)):
                    dtype_rows += f'<tr><td>{dt.upper()}</td><td>{tflops:.1f} TFLOPS</td>'
                    dtype_rows += f'<td class="{cls}">{ef:.1f}%</td></tr>'
            if dtype_rows:
                sections.append(
                    f'<div class="section"><h2>Compute Throughput</h2>'
                    f'<table><tr><th>DType</th><th>Achieved</th><th>Efficiency</th></tr>'
                    f'{dtype_rows}</table></div>'
                )

        if "training" in results:
            t = results["training"]
            sections.append(
                f'<div class="section"><h2>Training Simulation</h2>'
                f'<div class="metric"><div class="value">{t.get("throughput_tokens_per_sec", "N/A")}</div>'
                f'<div class="label">Tokens/sec</div></div>'
                f'<div class="metric"><div class="value">{t.get("avg_step_time_ms", "N/A")} ms</div>'
                f'<div class="label">Avg Step Time</div></div>'
                f'<div class="metric"><div class="value">{t.get("peak_memory_gb", "N/A")} GB</div>'
                f'<div class="label">Peak Memory</div></div>'
                f'</div>'
            )

        content = "\n".join(sections)
        html = HTML_TEMPLATE.format(timestamp=timestamp, hostname=hostname, content=content)

        with open(output, "w") as f:
            f.write(html)
        self.console.print(f"[green]HTML report saved to: {output}[/green]")
        return output

    # ------------------------------------------------------------------
    # Markdown report
    # ------------------------------------------------------------------

    def _generate_markdown(self, results: dict, output: str) -> str:
        import socket
        hostname = socket.gethostname()
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        lines: list[str] = []

        # --- Header ---
        lines.append("# GPU Test Report\n")
        lines.append(f"- **Date:** {timestamp}")
        lines.append(f"- **Host:** {hostname}")

        # Extract GPU info for header
        gpu_info = results.get("gpu_info")
        if gpu_info and not gpu_info.get("error"):
            gpus = gpu_info.get("gpus", [])
            gpu_name = gpus[0]["name"] if gpus else "Unknown"
            lines.append(f"- **GPU:** {gpu_name} x{gpu_info.get('gpu_count', len(gpus))}")
            lines.append(f"- **Driver:** {gpu_info.get('driver_version', 'N/A')} | "
                         f"**CUDA:** {gpu_info.get('cuda_version', 'N/A')}")
        lines.append("")

        # --- Summary table ---
        summary_items = self._build_summary(results)
        if summary_items:
            lines.append("## Summary\n")
            lines.append("| Test | Result |")
            lines.append("|------|--------|")
            for name, verdict in summary_items:
                lines.append(f"| {name} | {verdict} |")
            lines.append("")

        # --- GPU Information ---
        if gpu_info and not gpu_info.get("error"):
            lines.append("## GPU Information\n")
            gpus = gpu_info.get("gpus", [])
            lines.append("| GPU | Model | VRAM | Temp | Power | SM Clock |")
            lines.append("|-----|-------|------|------|-------|----------|")
            for g in gpus:
                vram = f"{g.get('vram_total_mb', 0)} MB"
                temp = f"{g.get('temperature', 'N/A')}C"
                power = f"{g.get('power_draw', 0):.0f}/{g.get('power_limit', 0):.0f}W"
                clock = f"{g.get('clock_sm', 0)} MHz"
                lines.append(f"| {g['index']} | {g['name']} | {vram} | {temp} | {power} | {clock} |")
            lines.append("")

        # --- Health Check ---
        health = results.get("health")
        if health and not health.get("error"):
            lines.append("## Health Check\n")
            passed = health.get("passed", False)
            lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")
            gpu_health = health.get("gpu_health", [])
            if gpu_health:
                lines.append("| GPU | Temp | Power | ECC | PCIe | Throttle | Status |")
                lines.append("|-----|------|-------|-----|------|----------|--------|")
                for gh in gpu_health:
                    checks = gh.get("checks", {})
                    temp_c = checks.get("temperature", {})
                    pwr = checks.get("power", {})
                    ecc = checks.get("ecc_errors", {})
                    pcie = checks.get("pcie_link", {})
                    throttle = checks.get("throttling", {})
                    temp_str = f"{temp_c.get('value', '?')}C {temp_c.get('status', '')}"
                    pwr_str = f"{pwr.get('value', 0):.0f}W {pwr.get('status', '')}"
                    ecc_str = f"S:{ecc.get('single', 0)} D:{ecc.get('double', 0)}"
                    pcie_str = f"Gen{pcie.get('gen', '?')}x{pcie.get('width', '?')}"
                    throt_str = throttle.get("status", "?")
                    status = gh.get("status", "?")
                    lines.append(f"| {gh['index']} | {temp_str} | {pwr_str} | "
                                 f"{ecc_str} | {pcie_str} | {throt_str} | **{status}** |")
            lines.append("")

        # --- Memory Bandwidth ---
        mem_data = self._extract_memory_results(results)
        if mem_data and not mem_data.get("error"):
            lines.append("## Memory Bandwidth\n")
            lines.append(f"Source: {mem_data.get('source', 'unknown')}\n")
            lines.append("| Metric | Value | Peak | Efficiency |")
            lines.append("|--------|-------|------|------------|")
            d2d = mem_data.get("d2d_bandwidth_gbps") or 0
            h2d = mem_data.get("h2d_bandwidth_gbps") or 0
            d2h = mem_data.get("d2h_bandwidth_gbps") or 0
            # New format with per-metric peaks
            h2d_peak = mem_data.get("h2d_peak_gbps") or 0
            d2h_peak = mem_data.get("d2h_peak_gbps") or 0
            d2d_peak = mem_data.get("d2d_peak_gbps") or 0
            h2d_eff = mem_data.get("h2d_efficiency_pct") or 0
            d2h_eff = mem_data.get("d2h_efficiency_pct") or 0
            d2d_eff = mem_data.get("d2d_efficiency_pct") or 0
            # Fallback for old format
            if not d2d_peak:
                d2d_peak = mem_data.get("peak_bandwidth_gbps") or 0
                d2d_eff = mem_data.get("efficiency_pct") or 0
            lines.append(f"| H2D (PCIe) | {h2d:.1f} GB/s | {h2d_peak:.0f} GB/s | {h2d_eff:.1f}% |")
            lines.append(f"| D2H (PCIe) | {d2h:.1f} GB/s | {d2h_peak:.0f} GB/s | {d2h_eff:.1f}% |")
            lines.append(f"| D2D (NVLink) | {d2d:.1f} GB/s | {d2d_peak:.0f} GB/s | {d2d_eff:.1f}% |")
            lines.append("")
            # PyTorch fallback can't accurately measure HBM peak (intra-GPU copy_()
            # only reaches ~20% of HBM bandwidth). When fallback is used, report
            # the number but mark as WARN with a note instead of evaluating as FAIL.
            if mem_data.get("source") == "pytorch":
                lines.append(
                    f"**Verdict: WARN** (D2D {d2d:.1f} GB/s via PyTorch fallback; "
                    "nvbandwidth unavailable — figure is indicative only, not a true HBM peak)\n"
                )
            else:
                # Tightened to match production acceptance: PASS >= 80%, WARN 60–80%, FAIL < 60%.
                verdict = "PASS" if d2d_eff >= 80 else ("WARN" if d2d_eff >= 60 else "FAIL")
                lines.append(f"**Verdict: {verdict}** (D2D efficiency {d2d_eff:.1f}%)\n")

        # --- Compute Throughput ---
        comp_data = self._extract_compute_results(results)
        if comp_data and not comp_data.get("error"):
            lines.append("## Compute Throughput\n")
            per_dtype = comp_data.get("per_dtype_tflops", {})
            peak_tflops = comp_data.get("peak_tflops", {})
            eff_pct = comp_data.get("efficiency_pct", {})
            # Absolute PASS thresholds (TFLOPS) from gpu_specs.compute_pass_thresholds_tflops.
            # When present, override the legacy 80%-of-peak rule on a per-dtype basis.
            pass_thresholds = comp_data.get("pass_thresholds_tflops", {}) or {}
            use_abs = bool(pass_thresholds)
            if use_abs:
                lines.append("| DType | Achieved (TFLOPS) | Peak | Threshold | Status |")
            else:
                lines.append("| DType | Achieved (TFLOPS) | Peak | Efficiency | Status |")
            lines.append("|-------|-------------------|------|------------|--------|")
            worst_eff = 100.0
            overall_status = "PASS"
            rank = {"PASS": 0, "WARN": 1, "FAIL": 2, "SKIP": 0}
            for dt, val in per_dtype.items():
                if isinstance(val, str):
                    # skipped or error
                    lines.append(f"| {dt.upper()} | {val} | - | N/A | SKIP |")
                else:
                    pk = peak_tflops.get(dt, 0)
                    ef = eff_pct.get(dt, 0)
                    if isinstance(ef, (int, float)) and ef > 0:
                        worst_eff = min(worst_eff, ef)
                    thr = pass_thresholds.get(dt)
                    if use_abs and thr:
                        if val >= thr:
                            status = "PASS"
                        elif val >= thr * 0.9:
                            status = "WARN"
                        else:
                            status = "FAIL"
                        lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |")
                    else:
                        status = "PASS" if ef >= 80 else ("WARN" if ef >= 50 else "FAIL")
                        lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | {ef:.1f}% | {status} |")
                    if rank.get(status, 0) > rank.get(overall_status, 0):
                        overall_status = status
            lines.append("")
            if use_abs:
                lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n")
            else:
                overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
                lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n")

        # --- NCCL ---
        nccl = results.get("nccl")
        if nccl and not nccl.get("error"):
            lines.append("## NCCL Multi-GPU\n")
            lines.append(f"Source: {nccl.get('source', 'unknown')} | "
                         f"GPUs: {nccl.get('gpu_count', '?')}\n")
            tests = nccl.get("tests", {})
            if tests:
                lines.append("| Operation | Bus BW (GB/s) | Threshold | Status |")
                lines.append("|-----------|---------------|-----------|--------|")
                for op, data in tests.items():
                    if isinstance(data, dict) and not data.get("error"):
                        bw = data.get("best_busbw_gbps", 0)
                        req = data.get("min_required_gbps", 0)
                        status = data.get("status", "?")
                        lines.append(f"| {op} | {bw:.1f} | >= {req:.0f} | {status} |")
                    elif isinstance(data, dict) and data.get("error"):
                        lines.append(f"| {op} | - | - | ERROR: {data['error']} |")
                lines.append("")
            passed = nccl.get("passed", False)
            lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")

        # --- Stress Test ---
        stress = results.get("stress")
        if stress and not stress.get("error"):
            lines.append("## Stress Test\n")
            passed = stress.get("passed", False)
            duration = stress.get("duration_sec") or 0
            elapsed = stress.get("elapsed_sec") or 0
            source = stress.get("source", "unknown")
            lines.append(f"- **Source:** {source}")
            lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)")
            lines.append(f"- **Result: {'PASS' if passed else 'FAIL'}**")
            lines.append("")

        # --- RDMA ---
        rdma = results.get("rdma")
        if rdma and (rdma.get("skipped") or rdma.get("status") == "SKIP"):
            lines.append("## RDMA/InfiniBand\n")
            lines.append(f"**Overall: SKIP** [{rdma.get('reason', 'no IB hardware detected')}]\n")
        elif rdma and not rdma.get("error"):
            lines.append("## RDMA/InfiniBand\n")
            bw_tests = rdma.get("bandwidth_tests", [])
            lat_tests = rdma.get("latency_tests", [])
            if bw_tests or lat_tests:
                lines.append("| Test | Value | Threshold | Status |")
                lines.append("|------|-------|-----------|--------|")
                for bt in bw_tests:
                    if not bt.get("error"):
                        lines.append(f"| {bt['test']} | {bt.get('bandwidth_gbps', 0):.1f} GB/s | "
                                     f">= {bt.get('min_required_gbps', 0)} GB/s | {bt.get('status', '?')} |")
                for lt in lat_tests:
                    if not lt.get("error"):
                        lines.append(f"| {lt['test']} | {lt.get('latency_us', 0):.2f} us | "
                                     f"<= {lt.get('max_allowed_us', 0)} us | {lt.get('status', '?')} |")
                lines.append("")
            passed = rdma.get("passed", False)
            lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")

        # --- Training ---
        training = results.get("training")
        if training and not training.get("error"):
            lines.append("## Training Simulation\n")
            lines.append("| Metric | Value |")
            lines.append("|--------|-------|")
            lines.append(f"| Model | {training.get('model', 'N/A')} |")
            lines.append(f"| Params | {training.get('total_params_m', 0):.1f}M |")
            lines.append(f"| Throughput | {training.get('throughput_tokens_per_sec', 0):.0f} tokens/sec |")
            lines.append(f"| Avg Step Time | {training.get('avg_step_time_ms', 0):.1f} ms |")
            lines.append(f"| Peak Memory | {training.get('peak_memory_gb', 0):.1f} GB |")
            lines.append(f"| Final Loss | {training.get('final_loss', 'N/A')} |")
            lines.append("")

        # --- Footer ---
        lines.append("---")
        lines.append(f"*Generated by GPU Test Suite v{__version__}*")

        content = "\n".join(lines)
        with open(output, "w") as f:
            f.write(content)
        self.console.print(f"[green]Markdown report saved to: {output}[/green]")
        return output

    def _extract_memory_results(self, results: dict) -> dict:
        """Locate the memory-benchmark payload inside ``results``.

        A ``"memory_bench"`` entry takes precedence: if it is a dict, its
        ``"memory"`` item is returned when present, otherwise the dict
        itself; any other type yields ``{}``. Failing that, the ``"memory"``
        item of a dict-valued ``"benchmark"`` entry is returned. ``{}`` means
        nothing was found.
        """
        if "memory_bench" in results:
            standalone = results["memory_bench"]
            if not isinstance(standalone, dict):
                return {}
            return standalone.get("memory", standalone)

        if "benchmark" not in results:
            return {}

        suite = results["benchmark"]
        if isinstance(suite, dict) and "memory" in suite:
            return suite["memory"]
        return {}

    def _extract_compute_results(self, results: dict) -> dict:
        """Locate the compute-benchmark payload inside ``results``.

        A ``"compute_bench"`` entry takes precedence: if it is a dict, its
        ``"compute"`` item is returned when present, otherwise the dict
        itself; any other type yields ``{}``. Failing that, the ``"compute"``
        item of a dict-valued ``"benchmark"`` entry is returned. ``{}`` means
        nothing was found.
        """
        if "compute_bench" in results:
            standalone = results["compute_bench"]
            if not isinstance(standalone, dict):
                return {}
            return standalone.get("compute", standalone)

        if "benchmark" not in results:
            return {}

        suite = results["benchmark"]
        if isinstance(suite, dict) and "compute" in suite:
            return suite["compute"]
        return {}

    def _build_summary(self, results: dict) -> list[tuple[str, str]]:
        """Build the ``(test name, verdict text)`` rows of the summary table.

        One row is produced per test present in ``results``, in a fixed
        order: GPU info, health, memory bandwidth, compute throughput, NCCL,
        stress, RDMA, training. A test that recorded an ``error`` is reported
        as ``ERROR: <message>``.

        Args:
            results: Collected test results keyed by test name.

        Returns:
            The summary rows; empty when ``results`` holds no known test.
        """

        def num(value, default=0):
            # Missing or non-numeric measurements must not break the
            # comparisons and numeric format specs below.
            return value if isinstance(value, (int, float)) else default

        items = []

        # GPU Info
        if "gpu_info" in results:
            gi = results["gpu_info"]
            if gi.get("error"):
                items.append(("GPU Info", f"ERROR: {gi['error']}"))
            else:
                items.append(("GPU Info", f"PASS ({gi.get('gpu_count', '?')} GPUs detected)"))

        # Health
        if "health" in results:
            h = results["health"]
            if h.get("error"):
                items.append(("Health Check", f"ERROR: {h['error']}"))
            elif h.get("passed"):
                items.append(("Health Check", "PASS"))
            else:
                items.append(("Health Check", "FAIL"))

        # Memory Bandwidth
        mem = self._extract_memory_results(results)
        if mem:
            if mem.get("error"):
                items.append(("Memory Bandwidth", f"ERROR: {mem['error']}"))
            elif mem.get("source") == "pytorch":
                # PyTorch fallback can't reach HBM peak — report as WARN, not FAIL.
                d2d = num(mem.get("d2d_bandwidth_gbps"))
                items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
            else:
                # Same selection as the detailed Markdown section: use the
                # per-metric D2D efficiency when a D2D peak is reported,
                # otherwise the legacy single efficiency figure.
                if mem.get("d2d_peak_gbps"):
                    eff = num(mem.get("d2d_efficiency_pct"))
                else:
                    eff = num(mem.get("efficiency_pct"))
                verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL")
                items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))

        # Compute
        comp = self._extract_compute_results(results)
        if comp:
            if comp.get("error"):
                items.append(("Compute Throughput", f"ERROR: {comp['error']}"))
            else:
                per_dtype = comp.get("per_dtype_tflops", {})
                eff_pct = comp.get("efficiency_pct", {})
                pass_thresholds = comp.get("pass_thresholds_tflops", {}) or {}
                if pass_thresholds:
                    # Absolute TFLOPS judgment: PASS at the threshold,
                    # WARN within 10% below it, FAIL otherwise.
                    rank = {"PASS": 0, "WARN": 1, "FAIL": 2}
                    worst_status = "PASS"
                    worst_dt = None
                    for dt, thr in pass_thresholds.items():
                        val = per_dtype.get(dt)
                        if not isinstance(val, (int, float)):
                            continue
                        if not num(thr):
                            # No usable threshold for this dtype; it cannot
                            # worsen the verdict.
                            continue
                        if val >= thr:
                            st = "PASS"
                        elif val >= thr * 0.9:
                            st = "WARN"
                        else:
                            st = "FAIL"
                        if rank[st] > rank[worst_status]:
                            worst_status = st
                            worst_dt = dt
                    if worst_dt:
                        items.append((
                            "Compute Throughput",
                            f"{worst_status} (worst {worst_dt.upper()} "
                            f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
                        ))
                    else:
                        items.append(("Compute Throughput", f"{worst_status}"))
                else:
                    valid_effs = [v for v in eff_pct.values() if isinstance(v, (int, float)) and v > 0]
                    if valid_effs:
                        worst = min(valid_effs)
                        verdict = "PASS" if worst >= 80 else ("WARN" if worst >= 50 else "FAIL")
                        items.append(("Compute Throughput", f"{verdict} (worst {worst:.1f}%)"))
                    else:
                        items.append(("Compute Throughput", "N/A"))

        # NCCL
        if "nccl" in results:
            n = results["nccl"]
            if n.get("error"):
                items.append(("NCCL", f"ERROR: {n['error']}"))
            elif n.get("passed"):
                items.append(("NCCL", "PASS"))
            else:
                items.append(("NCCL", "FAIL"))

        # Stress
        if "stress" in results:
            s = results["stress"]
            if s.get("error"):
                items.append(("Stress Test", f"ERROR: {s['error']}"))
            elif s.get("passed"):
                items.append(("Stress Test", "PASS"))
            else:
                items.append(("Stress Test", "FAIL"))

        # RDMA
        if "rdma" in results:
            r = results["rdma"]
            if r.get("skipped") or r.get("status") == "SKIP":
                items.append(("RDMA", f"SKIP ({r.get('reason', 'no IB hardware')})"))
            elif r.get("error"):
                items.append(("RDMA", f"ERROR: {r['error']}"))
            elif r.get("passed"):
                items.append(("RDMA", "PASS"))
            else:
                items.append(("RDMA", "FAIL"))

        # Training
        if "training" in results:
            t = results["training"]
            if t.get("error"):
                items.append(("Training", f"ERROR: {t['error']}"))
            else:
                tps = num(t.get("throughput_tokens_per_sec"))
                items.append(("Training", f"PASS ({tps:.0f} tokens/sec)"))

        return items