add: training simulation and report generation modules

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-25 17:24:01 +08:00 · 2026-04-25 17:24:01 +08:00 · 82cd4d5180
commit 82cd4d5180
parent 1c6ba4809a
2 changed files with 431 additions and 0 deletions
--- a/modules/report.py
+++ b/modules/report.py
@ -0,0 +1,165 @@
+"""Report generation module - export test results to JSON/HTML."""
+
+import json
+import os
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+
+from rich.console import Console
+from rich.panel import Panel
+
+# Page skeleton for the HTML report. It is filled in with str.format(), so
+# every literal CSS brace is doubled ({{ }}); the only real placeholders are
+# {timestamp}, {hostname} and {content}.
+HTML_TEMPLATE = """<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>H200 Test Report - {timestamp}</title>
+    <style>
+        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
+        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, monospace;
+               background: #0d1117; color: #c9d1d9; padding: 2rem; }}
+        .header {{ background: linear-gradient(135deg, #1a1a2e, #16213e);
+                   padding: 2rem; border-radius: 8px; margin-bottom: 2rem;
+                   border: 1px solid #30363d; }}
+        .header h1 {{ color: #58a6ff; font-size: 1.5rem; }}
+        .header .meta {{ color: #8b949e; margin-top: 0.5rem; }}
+        .section {{ background: #161b22; border: 1px solid #30363d;
+                    border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; }}
+        .section h2 {{ color: #58a6ff; margin-bottom: 1rem; font-size: 1.2rem;
+                       border-bottom: 1px solid #30363d; padding-bottom: 0.5rem; }}
+        table {{ width: 100%; border-collapse: collapse; margin: 0.5rem 0; }}
+        th {{ background: #21262d; color: #8b949e; text-align: left;
+             padding: 0.5rem; font-weight: 600; font-size: 0.85rem; }}
+        td {{ padding: 0.5rem; border-bottom: 1px solid #21262d; font-size: 0.9rem; }}
+        .pass {{ color: #3fb950; }} .warn {{ color: #d29922; }} .fail {{ color: #f85149; }}
+        .metric {{ display: inline-block; background: #21262d; padding: 0.75rem 1.5rem;
+                  border-radius: 6px; margin: 0.25rem; text-align: center; min-width: 120px; }}
+        .metric .value {{ font-size: 1.3rem; font-weight: bold; color: #58a6ff; }}
+        .metric .label {{ font-size: 0.75rem; color: #8b949e; margin-top: 0.25rem; }}
+        .verdict {{ padding: 1rem; border-radius: 6px; text-align: center; font-size: 1.1rem;
+                   font-weight: bold; margin: 1rem 0; }}
+        .verdict.pass {{ background: #0d2818; color: #3fb950; border: 1px solid #238636; }}
+        .verdict.fail {{ background: #2d0b0b; color: #f85149; border: 1px solid #da3633; }}
+    </style>
+</head>
+<body>
+    <div class="header">
+        <h1>H200 Training Server Test Report</h1>
+        <div class="meta">Generated: {timestamp} | Server: {hostname}</div>
+    </div>
+    {content}
+</body>
+</html>"""
+
+
+class ReportGenerator:
+    """Write collected test results to disk as a JSON or HTML report.
+
+    Settings come from the optional ``report`` section of the config:
+    ``format`` (default ``"json"``) and ``output_dir`` (default
+    ``"./reports"``).
+    """
+
+    def __init__(self, config: dict):
+        self.config = config
+        self.console = Console()
+        self.report_cfg = config.get("report", {})
+
+    def generate(self, results: dict, fmt: Optional[str] = None,
+                 output: Optional[str] = None) -> str:
+        """Write *results* to a report file and return its path.
+
+        Args:
+            results: Mapping of section name to collected data.
+            fmt: ``"json"`` or ``"html"``; falls back to the configured
+                format, then to ``"json"``.
+            output: Destination path; defaults to a timestamped file inside
+                the configured output directory.
+
+        Returns:
+            The path written, or ``""`` when *fmt* is not supported.
+        """
+        fmt = fmt or self.report_cfg.get("format", "json")
+        output_dir = self.report_cfg.get("output_dir", "./reports")
+        os.makedirs(output_dir, exist_ok=True)
+
+        if not output:
+            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            output = os.path.join(output_dir, f"h200_report_{stamp}.{fmt}")
+
+        writers = {"json": self._generate_json, "html": self._generate_html}
+        writer = writers.get(fmt)
+        if writer is None:
+            self.console.print(f"[red]Unsupported format: {fmt}[/red]")
+            return ""
+
+        # An explicit output path may point outside output_dir; make sure its
+        # directory exists instead of failing inside open().
+        parent = os.path.dirname(output)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+        return writer(results, output)
+
+    def _generate_json(self, results: dict, output: str) -> str:
+        """Dump *results* as indented JSON to *output* and return the path."""
+        with open(output, "w", encoding="utf-8") as f:
+            # default=str stringifies any value json cannot serialize itself.
+            json.dump(results, f, indent=2, default=str)
+        self.console.print(f"[green]JSON report saved to: {output}[/green]")
+        return output
+
+    @staticmethod
+    def _esc(value) -> str:
+        """Return *value* converted to text and HTML-escaped."""
+        from html import escape
+        return escape(str(value))
+
+    def _gpu_section(self, gpu_info: dict) -> str:
+        """Build the GPU inventory table; missing fields render as N/A."""
+        esc = self._esc
+        gpus = gpu_info.get("gpus", [])
+        rows = "".join(
+            f"<tr><td>GPU {esc(g.get('index', 'N/A'))}</td>"
+            f"<td>{esc(g.get('name', 'N/A'))}</td>"
+            f"<td>{esc(g.get('vram_total_mb', 'N/A'))} MB</td>"
+            f"<td>{esc(g.get('temperature', 'N/A'))}°C</td>"
+            f"<td>{esc(g.get('clock_sm', 'N/A'))} MHz</td></tr>"
+            for g in gpus
+        )
+        return (
+            f'<div class="section"><h2>GPU Information</h2>'
+            f'<p>Driver: {esc(gpu_info.get("driver_version", "N/A"))} | '
+            f'CUDA: {esc(gpu_info.get("cuda_version", "N/A"))} | '
+            f'Count: {len(gpus)}</p>'
+            f'<table><tr><th>GPU</th><th>Model</th><th>VRAM</th><th>Temp</th><th>SM Clock</th></tr>'
+            f'{rows}</table></div>'
+        )
+
+    @staticmethod
+    def _health_section(health: dict) -> str:
+        """Build the pass/fail verdict banner from ``health["passed"]``."""
+        if health.get("passed", False):
+            cls, txt = "pass", "ALL PASSED"
+        else:
+            cls, txt = "fail", "SOME CHECKS FAILED"
+        return f'<div class="verdict {cls}">{txt}</div>'
+
+    def _memory_section(self, mem: dict) -> str:
+        """Build the memory-bandwidth metric tiles."""
+        esc = self._esc
+        return (
+            f'<div class="section"><h2>Memory Bandwidth</h2>'
+            f'<div class="metric"><div class="value">{esc(mem.get("d2d_bandwidth_gbps", "N/A"))} GB/s</div>'
+            f'<div class="label">D2D (HBM3e)</div></div>'
+            f'<div class="metric"><div class="value">{esc(mem.get("efficiency_pct", "N/A"))}%</div>'
+            f'<div class="label">Efficiency vs Peak ({esc(mem.get("peak_bandwidth_gbps", 989))} GB/s)</div></div>'
+            f'</div>'
+        )
+
+    def _compute_section(self, comp: dict) -> str:
+        """Build the per-dtype throughput table.
+
+        Returns ``""`` when no dtype has a numeric TFLOPS value. A missing or
+        non-numeric efficiency is shown as 0 rather than raising.
+        """
+        per_dtype = comp.get("per_dtype_tflops", {})
+        eff = comp.get("efficiency_pct", {})
+        if not isinstance(eff, dict):
+            eff = {}
+
+        rows = []
+        for dt, tflops in per_dtype.items():
+            if not isinstance(tflops, (int, float)):
+                continue
+            ef = eff.get(dt, 0)
+            if not isinstance(ef, (int, float)):
+                ef = 0
+            cls = "pass" if ef >= 80 else ("warn" if ef >= 50 else "fail")
+            rows.append(
+                f'<tr><td>{self._esc(str(dt).upper())}</td><td>{tflops:.1f} TFLOPS</td>'
+                f'<td class="{cls}">{ef:.1f}%</td></tr>'
+            )
+        if not rows:
+            return ""
+        return (
+            f'<div class="section"><h2>Compute Throughput</h2>'
+            f'<table><tr><th>DType</th><th>Achieved</th><th>Efficiency</th></tr>'
+            f'{"".join(rows)}</table></div>'
+        )
+
+    def _training_section(self, t: dict) -> str:
+        """Build the training-simulation metric tiles."""
+        esc = self._esc
+        return (
+            f'<div class="section"><h2>Training Simulation</h2>'
+            f'<div class="metric"><div class="value">{esc(t.get("throughput_tokens_per_sec", "N/A"))}</div>'
+            f'<div class="label">Tokens/sec</div></div>'
+            f'<div class="metric"><div class="value">{esc(t.get("avg_step_time_ms", "N/A"))} ms</div>'
+            f'<div class="label">Avg Step Time</div></div>'
+            f'<div class="metric"><div class="value">{esc(t.get("peak_memory_gb", "N/A"))} GB</div>'
+            f'<div class="label">Peak Memory</div></div>'
+            f'</div>'
+        )
+
+    def _generate_html(self, results: dict, output: str) -> str:
+        """Render the sections present in *results* into an HTML file.
+
+        Sections are emitted in a fixed order (GPU info, health verdict,
+        memory bandwidth, compute throughput, training) and only when the
+        corresponding key exists. Returns *output*.
+        """
+        import socket
+
+        sections = []
+        if "gpu_info" in results:
+            sections.append(self._gpu_section(results["gpu_info"]))
+        if "health" in results:
+            sections.append(self._health_section(results["health"]))
+
+        benchmark = results.get("benchmark") or {}
+        if "memory" in benchmark:
+            sections.append(self._memory_section(benchmark["memory"]))
+        if "compute" in benchmark:
+            compute_html = self._compute_section(benchmark["compute"])
+            if compute_html:
+                sections.append(compute_html)
+
+        if "training" in results:
+            sections.append(self._training_section(results["training"]))
+
+        document = HTML_TEMPLATE.format(
+            timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+            hostname=self._esc(socket.gethostname()),
+            content="\n".join(sections),
+        )
+
+        # The template declares charset UTF-8 and the body contains "°", so the
+        # file must not be written in the platform's default encoding.
+        with open(output, "w", encoding="utf-8") as f:
+            f.write(document)
+        self.console.print(f"[green]HTML report saved to: {output}[/green]")
+        return output
--- a/modules/training_sim.py
+++ b/modules/training_sim.py
@ -0,0 +1,266 @@
+"""Training simulation module - LLM training workload with PyTorch."""
+
+import time
+import subprocess
+import shutil
+from datetime import datetime
+from typing import Optional
+
+from rich.console import Console
+from rich.table import Table
+from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
+
+# True only when torch can be imported AND it reports a usable CUDA device.
+try:
+    import torch
+except ImportError:
+    TORCH_AVAILABLE = False
+else:
+    TORCH_AVAILABLE = bool(torch.cuda.is_available())
+
+
+class TrainingSim:
+    """Time a short LLM-style training loop on the local CUDA GPU(s).
+
+    Settings come from the optional ``training`` section of the config:
+    ``model`` (default ``"gpt2"``), ``batch_size`` (8), ``seq_length`` (2048),
+    ``num_steps`` (50) and ``dtype`` (``"fp32"``, ``"fp16"`` or ``"bf16"``,
+    default ``"bf16"``).
+    """
+
+    def __init__(self, config: dict):
+        self.config = config
+        self.console = Console()
+        self.train_cfg = config.get("training", {})
+
+    def run(self) -> dict:
+        """Run the simulation and return a metrics dict.
+
+        A Hugging Face causal LM is tried first. If ``transformers`` is not
+        installed, or loading/training the pretrained model fails, a synthetic
+        transformer is benchmarked instead.
+
+        Returns:
+            The metrics dict, or ``{"error": ...}`` when the run cannot start
+            (no torch/CUDA, or ``num_steps`` is not a positive integer).
+        """
+        if not TORCH_AVAILABLE:
+            # TORCH_AVAILABLE is also False when torch imports but CUDA is missing.
+            self.console.print(
+                "[yellow]PyTorch with CUDA not available - skipping training simulation[/yellow]"
+            )
+            return {"error": "pytorch_not_available"}
+
+        gpu_count = torch.cuda.device_count()
+        model_name = self.train_cfg.get("model", "gpt2")
+        batch_size = self.train_cfg.get("batch_size", 8)
+        seq_length = self.train_cfg.get("seq_length", 2048)
+        num_steps = self.train_cfg.get("num_steps", 50)
+        dtype_str = self.train_cfg.get("dtype", "bf16")
+
+        # Averages below divide by the number of steps, so zero steps is an error.
+        if not isinstance(num_steps, int) or num_steps < 1:
+            self.console.print(f"[red]num_steps must be a positive integer, got {num_steps!r}[/red]")
+            return {"error": "invalid_num_steps"}
+
+        dtype_map = {
+            "fp32": torch.float32,
+            "fp16": torch.float16,
+            "bf16": torch.bfloat16,
+        }
+        if dtype_str not in dtype_map:
+            # Keep the reported dtype in sync with the one actually used.
+            self.console.print(f"[yellow]Unknown dtype '{dtype_str}' - falling back to bf16[/yellow]")
+            dtype_str = "bf16"
+        dtype = dtype_map[dtype_str]
+
+        self.console.print("[cyan]Training Simulation[/cyan]")
+        self.console.print(f"  Model: {model_name} | Batch: {batch_size} | Seq: {seq_length} | "
+                           f"DType: {dtype_str} | Steps: {num_steps} | GPUs: {gpu_count}")
+
+        try:
+            from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: F401
+        except ImportError:
+            self.console.print("[yellow]transformers not installed - using synthetic model[/yellow]")
+            return self._run_synthetic(gpu_count, batch_size, seq_length, num_steps, dtype)
+
+        try:
+            return self._run_pretrained(
+                model_name, gpu_count, batch_size, seq_length, num_steps, dtype, dtype_str
+            )
+        except Exception as e:
+            # Deliberate best-effort: any failure (download, OOM, ...) degrades
+            # to the synthetic benchmark instead of aborting the test run.
+            self.console.print(
+                f"[yellow]Model loading or training failed: {e} - using synthetic model[/yellow]"
+            )
+
+        # Run the fallback outside the except block so the exception (and the
+        # frames that still reference the failed model) is dropped first.
+        torch.cuda.empty_cache()
+        return self._run_synthetic(gpu_count, batch_size, seq_length, num_steps, dtype)
+
+    def _timed_steps(self, step_fn, num_steps: int, description: str):
+        """Call *step_fn* ``num_steps`` times under a progress bar.
+
+        Each call is bracketed by ``torch.cuda.synchronize()`` so the wall
+        time covers the queued GPU work. Returns ``(step_times_s,
+        peak_mem_gib_per_step, last_loss)``; ``last_loss`` is ``None`` when no
+        step ran.
+        """
+        step_times = []
+        mem_usage = []
+        loss = None
+
+        with Progress(
+            SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
+            BarColumn(), TextColumn("{task.completed}/{task.total}"),
+            TimeElapsedColumn(), console=self.console,
+        ) as progress:
+            task = progress.add_task(description, total=num_steps)
+
+            for _ in range(num_steps):
+                torch.cuda.synchronize()
+                t0 = time.perf_counter()
+                loss = step_fn()
+                torch.cuda.synchronize()
+                step_times.append(time.perf_counter() - t0)
+
+                # Peak since the previous reset, for the current CUDA device.
+                mem_usage.append(torch.cuda.max_memory_allocated() / 1024**3)
+                torch.cuda.reset_peak_memory_stats()
+
+                progress.advance(task)
+
+        return step_times, mem_usage, loss
+
+    @staticmethod
+    def _timing_metrics(step_times, mem_usage, batch_size, seq_length, loss) -> dict:
+        """Derive the step-time, throughput, memory and loss entries."""
+        avg_step_time = sum(step_times) / len(step_times)
+        return {
+            "avg_step_time_ms": round(avg_step_time * 1000, 1),
+            "throughput_tokens_per_sec": round(batch_size * seq_length / avg_step_time, 0),
+            "throughput_samples_per_sec": round(batch_size / avg_step_time, 2),
+            "peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2),
+            "final_loss": round(loss.item(), 4) if hasattr(loss, "item") else None,
+        }
+
+    def _run_pretrained(self, model_name, gpu_count, batch_size, seq_length,
+                        num_steps, dtype, dtype_str) -> dict:
+        """Benchmark training steps of the Hugging Face model *model_name*.
+
+        Random token ids serve as both inputs and labels. Any exception
+        propagates to the caller, which decides on the fallback.
+        """
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        self.console.print(f"  Loading {model_name}...")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=dtype,
+            device_map="auto" if gpu_count > 1 else None,
+        )
+        if gpu_count <= 1:
+            # Without a device_map the weights are loaded on the CPU; move them
+            # explicitly so the benchmark actually runs on the GPU.
+            model = model.cuda()
+
+        total_params = sum(p.numel() for p in model.parameters())
+        self.console.print(f"  Parameters: {total_params / 1e6:.1f}M")
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        # Build the batch once and place it on the model's device up front so
+        # the host-to-device copy is not counted in the first timed step.
+        input_ids = torch.randint(0, tokenizer.vocab_size, (batch_size, seq_length)).to(model.device)
+        attention_mask = torch.ones_like(input_ids)
+
+        model.train()
+        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
+        use_autocast = dtype == torch.float16
+
+        def train_step():
+            if use_autocast:
+                with torch.cuda.amp.autocast(dtype=dtype):
+                    loss = model(input_ids=input_ids, attention_mask=attention_mask,
+                                 labels=input_ids).loss
+            else:
+                loss = model(input_ids=input_ids, attention_mask=attention_mask,
+                             labels=input_ids).loss
+            loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+            return loss
+
+        step_times, mem_usage, loss = self._timed_steps(train_step, num_steps, "Training steps...")
+
+        return {
+            "model": model_name,
+            "total_params_m": round(total_params / 1e6, 1),
+            "gpu_count": gpu_count,
+            "dtype": dtype_str,
+            "batch_size": batch_size,
+            "seq_length": seq_length,
+            "num_steps": num_steps,
+            **self._timing_metrics(step_times, mem_usage, batch_size, seq_length, loss),
+            "timestamp": datetime.now().isoformat(),
+        }
+
+    def _run_synthetic(self, gpu_count, batch_size, seq_length, num_steps, dtype) -> dict:
+        """Benchmark a fixed-size synthetic transformer on random token ids.
+
+        The model (6 encoder layers, hidden size 4096, 32 heads, vocab 32000)
+        is placed with ``.cuda()``, i.e. on the current CUDA device;
+        *gpu_count* is only echoed in the result.
+        """
+        self.console.print("  Running synthetic training benchmark...")
+
+        hidden_size = 4096
+        num_layers = 6
+        num_heads = 32
+        vocab_size = 32000
+
+        class SyntheticTransformer(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.embed = torch.nn.Embedding(vocab_size, hidden_size)
+                self.layers = torch.nn.ModuleList([
+                    torch.nn.TransformerEncoderLayer(
+                        d_model=hidden_size, nhead=num_heads,
+                        dim_feedforward=hidden_size * 4,
+                        batch_first=True,
+                        dtype=dtype,
+                    ) for _ in range(num_layers)
+                ])
+                self.head = torch.nn.Linear(hidden_size, vocab_size, dtype=dtype)
+
+            def forward(self, x):
+                # The embedding keeps its default dtype; cast to the layer dtype.
+                h = self.embed(x).to(dtype)
+                for layer in self.layers:
+                    h = layer(h)
+                return self.head(h)
+
+        model = SyntheticTransformer().cuda()
+        total_params = sum(p.numel() for p in model.parameters())
+
+        self.console.print(f"  Synthetic params: {total_params / 1e6:.1f}M")
+
+        model.train()
+        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
+
+        input_ids = torch.randint(0, vocab_size, (batch_size, seq_length)).cuda()
+
+        def train_step():
+            logits = model(input_ids)
+            loss = torch.nn.functional.cross_entropy(
+                logits.view(-1, vocab_size), input_ids.view(-1)
+            )
+            loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+            return loss
+
+        step_times, mem_usage, loss = self._timed_steps(train_step, num_steps, "Synthetic training...")
+
+        return {
+            "model": "synthetic_transformer",
+            "total_params_m": round(total_params / 1e6, 1),
+            "num_layers": num_layers,
+            "hidden_size": hidden_size,
+            "gpu_count": gpu_count,
+            "dtype": str(dtype).replace("torch.", ""),
+            "batch_size": batch_size,
+            "seq_length": seq_length,
+            "num_steps": num_steps,
+            **self._timing_metrics(step_times, mem_usage, batch_size, seq_length, loss),
+            "timestamp": datetime.now().isoformat(),
+        }
+
+    @staticmethod
+    def print_results(results: dict, console: Optional[Console] = None):
+        """Print *results* as a two-column table, or its error if present.
+
+        Args:
+            results: Dict as returned by :meth:`run`.
+            console: Console to print to; a new one is created when omitted.
+        """
+        c = console or Console()
+        if "error" in results:
+            c.print(f"[bold red]Error: {results['error']}[/bold red]")
+            return
+
+        c.print("\n[bold cyan]Training Simulation Results[/bold cyan]")
+
+        table = Table(box=None, padding=(0, 1))
+        table.add_column("Metric", style="bold")
+        table.add_column("Value")
+
+        metrics = [
+            ("Model", results.get("model", "N/A")),
+            ("Parameters", f"{results.get('total_params_m', 'N/A')}M"),
+            ("GPU Count", str(results.get("gpu_count", "N/A"))),
+            ("DType", results.get("dtype", "N/A")),
+            ("Batch Size", str(results.get("batch_size", "N/A"))),
+            ("Seq Length", str(results.get("seq_length", "N/A"))),
+            ("Steps", str(results.get("num_steps", "N/A"))),
+            ("Avg Step Time", f"{results.get('avg_step_time_ms', 'N/A')} ms"),
+            ("Throughput", f"{results.get('throughput_tokens_per_sec', 'N/A')} tokens/s"),
+            ("Samples/sec", f"{results.get('throughput_samples_per_sec', 'N/A')}"),
+            ("Peak Memory", f"{results.get('peak_memory_gb', 'N/A')} GB"),
+            ("Final Loss", str(results.get("final_loss", "N/A"))),
+        ]
+        for label, val in metrics:
+            table.add_row(label, str(val))
+
+        c.print(table)