add: training simulation and report generation modules
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
1c6ba4809a
commit
82cd4d5180
165
modules/report.py
Normal file
165
modules/report.py
Normal file
@ -0,0 +1,165 @@
|
||||
"""Report generation module - export test results to JSON/HTML."""
|
||||
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
|
||||
# Page skeleton for the HTML report.
#
# The string is filled in with str.format(), so every literal CSS brace is
# doubled ("{{" / "}}"). The three placeholders are {timestamp}, {hostname}
# and {content}; {content} receives the already-rendered section markup.
HTML_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>H200 Test Report - {timestamp}</title>
<style>
  * {{ margin: 0; padding: 0; box-sizing: border-box; }}
  body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, monospace;
         background: #0d1117; color: #c9d1d9; padding: 2rem; }}
  .header {{ background: linear-gradient(135deg, #1a1a2e, #16213e);
            padding: 2rem; border-radius: 8px; margin-bottom: 2rem;
            border: 1px solid #30363d; }}
  .header h1 {{ color: #58a6ff; font-size: 1.5rem; }}
  .header .meta {{ color: #8b949e; margin-top: 0.5rem; }}
  .section {{ background: #161b22; border: 1px solid #30363d;
             border-radius: 8px; padding: 1.5rem; margin-bottom: 1.5rem; }}
  .section h2 {{ color: #58a6ff; margin-bottom: 1rem; font-size: 1.2rem;
                border-bottom: 1px solid #30363d; padding-bottom: 0.5rem; }}
  table {{ width: 100%; border-collapse: collapse; margin: 0.5rem 0; }}
  th {{ background: #21262d; color: #8b949e; text-align: left;
       padding: 0.5rem; font-weight: 600; font-size: 0.85rem; }}
  td {{ padding: 0.5rem; border-bottom: 1px solid #21262d; font-size: 0.9rem; }}
  .pass {{ color: #3fb950; }} .warn {{ color: #d29922; }} .fail {{ color: #f85149; }}
  .metric {{ display: inline-block; background: #21262d; padding: 0.75rem 1.5rem;
            border-radius: 6px; margin: 0.25rem; text-align: center; min-width: 120px; }}
  .metric .value {{ font-size: 1.3rem; font-weight: bold; color: #58a6ff; }}
  .metric .label {{ font-size: 0.75rem; color: #8b949e; margin-top: 0.25rem; }}
  .verdict {{ padding: 1rem; border-radius: 6px; text-align: center; font-size: 1.1rem;
             font-weight: bold; margin: 1rem 0; }}
  .verdict.pass {{ background: #0d2818; color: #3fb950; border: 1px solid #238636; }}
  .verdict.fail {{ background: #2d0b0b; color: #f85149; border: 1px solid #da3633; }}
</style>
</head>
<body>
<div class="header">
  <h1>H200 Training Server Test Report</h1>
  <div class="meta">Generated: {timestamp} | Server: {hostname}</div>
</div>
{content}
</body>
</html>"""
|
||||
|
||||
|
||||
class ReportGenerator:
    """Write collected test results to disk as a JSON or HTML report.

    The optional ``"report"`` section of the config supplies the default
    ``format`` (``"json"``) and ``output_dir`` (``"./reports"``).
    """

    def __init__(self, config: dict):
        """Keep the config and read its optional ``"report"`` section."""
        self.config = config
        self.console = Console()
        self.report_cfg = config.get("report", {})

    def generate(self, results: dict, fmt: Optional[str] = None,
                 output: Optional[str] = None) -> str:
        """Write *results* in the requested format and return the file path.

        Args:
            results: Result dictionary; the HTML writer reads the keys
                ``gpu_info``, ``health``, ``benchmark`` and ``training``.
            fmt: ``"json"`` or ``"html"``. Falls back to the configured
                format, then to ``"json"``.
            output: Target file path. When omitted, a timestamped file name
                inside the configured output directory is used.

        Returns:
            The path that was written, or ``""`` for an unsupported format.
        """
        fmt = fmt or self.report_cfg.get("format", "json")
        output_dir = self.report_cfg.get("output_dir", "./reports")
        os.makedirs(output_dir, exist_ok=True)

        if not output:
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output = os.path.join(output_dir, f"h200_report_{stamp}.{fmt}")

        writers = {"json": self._generate_json, "html": self._generate_html}
        writer = writers.get(fmt)
        if writer is None:
            self.console.print(f"[red]Unsupported format: {fmt}[/red]")
            return ""

        # An explicit output path may point outside output_dir; make sure its
        # directory exists instead of failing inside open().
        parent = os.path.dirname(output)
        if parent:
            os.makedirs(parent, exist_ok=True)
        return writer(results, output)

    def _generate_json(self, results: dict, output: str) -> str:
        """Dump *results* as indented JSON to *output* and return the path."""
        with open(output, "w", encoding="utf-8") as f:
            # default=str stringifies values json cannot encode natively.
            json.dump(results, f, indent=2, default=str)
        self.console.print(f"[green]JSON report saved to: {output}[/green]")
        return output

    def _generate_html(self, results: dict, output: str) -> str:
        """Render *results* into HTML_TEMPLATE, write it and return the path."""
        import socket

        hostname = socket.gethostname()
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        content = "\n".join(self._build_sections(results))
        page = HTML_TEMPLATE.format(
            timestamp=timestamp,
            hostname=self._esc(hostname),
            content=content,
        )

        # The page declares charset UTF-8 and contains "°", so the file must
        # be written as UTF-8 regardless of the platform's default encoding.
        with open(output, "w", encoding="utf-8") as f:
            f.write(page)
        self.console.print(f"[green]HTML report saved to: {output}[/green]")
        return output

    @staticmethod
    def _esc(value) -> str:
        """Return *value* as text that is safe to embed in HTML."""
        import html  # function-scope import, same style as socket above

        return html.escape(str(value))

    @classmethod
    def _metric(cls, value, label: str, unit: str = "") -> str:
        """Render one metric tile; *label* must already be HTML-safe."""
        return (
            f'<div class="metric"><div class="value">{cls._esc(value)}{unit}</div>'
            f'<div class="label">{label}</div></div>'
        )

    @classmethod
    def _build_sections(cls, results: dict) -> list:
        """Return the HTML fragments for every result group that is present."""
        sections = []

        if "gpu_info" in results:
            sections.append(cls._gpu_section(results["gpu_info"]))

        if "health" in results:
            sections.append(cls._health_section(results["health"]))

        benchmark = results.get("benchmark", {})
        if "memory" in benchmark:
            sections.append(cls._memory_section(benchmark["memory"]))
        if "compute" in benchmark:
            compute_html = cls._compute_section(benchmark["compute"])
            if compute_html:  # skipped when no numeric TFLOPS rows exist
                sections.append(compute_html)

        if "training" in results:
            sections.append(cls._training_section(results["training"]))

        return sections

    @classmethod
    def _gpu_section(cls, gpu_info: dict) -> str:
        """Render the driver/CUDA summary line and the per-GPU table."""
        esc = cls._esc
        gpus = gpu_info.get("gpus", [])
        # Missing per-GPU fields show as "N/A" rather than raising KeyError,
        # matching how the other sections treat absent values.
        rows = "".join(
            f"<tr><td>GPU {esc(g.get('index', 'N/A'))}</td>"
            f"<td>{esc(g.get('name', 'N/A'))}</td>"
            f"<td>{esc(g.get('vram_total_mb', 'N/A'))} MB</td>"
            f"<td>{esc(g.get('temperature', 'N/A'))}°C</td>"
            f"<td>{esc(g.get('clock_sm', 'N/A'))} MHz</td></tr>"
            for g in gpus
        )
        return (
            f'<div class="section"><h2>GPU Information</h2>'
            f'<p>Driver: {esc(gpu_info.get("driver_version", "N/A"))} | '
            f'CUDA: {esc(gpu_info.get("cuda_version", "N/A"))} | '
            f'Count: {len(gpus)}</p>'
            f'<table><tr><th>GPU</th><th>Model</th><th>VRAM</th><th>Temp</th><th>SM Clock</th></tr>'
            f'{rows}</table></div>'
        )

    @staticmethod
    def _health_section(health: dict) -> str:
        """Render the pass/fail verdict banner from ``health["passed"]``."""
        if health.get("passed", False):
            css_class, text = "pass", "ALL PASSED"
        else:
            css_class, text = "fail", "SOME CHECKS FAILED"
        return f'<div class="verdict {css_class}">{text}</div>'

    @classmethod
    def _memory_section(cls, mem: dict) -> str:
        """Render the memory-bandwidth metric tiles."""
        # NOTE(review): 989 is only the fallback shown when the benchmark does
        # not report its own peak; confirm it matches the intended hardware.
        peak = cls._esc(mem.get("peak_bandwidth_gbps", 989))
        return (
            '<div class="section"><h2>Memory Bandwidth</h2>'
            + cls._metric(mem.get("d2d_bandwidth_gbps", "N/A"), "D2D (HBM3e)", " GB/s")
            + cls._metric(mem.get("efficiency_pct", "N/A"),
                          f"Efficiency vs Peak ({peak} GB/s)", "%")
            + '</div>'
        )

    @classmethod
    def _compute_section(cls, comp: dict) -> str:
        """Render the per-dtype throughput table, or ``""`` if it has no rows.

        Only numeric TFLOPS entries produce a row. A missing or non-numeric
        efficiency is treated as 0 so one bad value cannot abort the report.
        """
        per_dtype = comp.get("per_dtype_tflops", {})
        efficiency = comp.get("efficiency_pct", {})
        if not isinstance(efficiency, dict):
            efficiency = {}

        rows = []
        for dt, tflops in per_dtype.items():
            if not isinstance(tflops, (int, float)):
                continue
            ef = efficiency.get(dt, 0)
            if not isinstance(ef, (int, float)):
                ef = 0
            css_class = "pass" if ef >= 80 else ("warn" if ef >= 50 else "fail")
            rows.append(
                f'<tr><td>{cls._esc(dt).upper()}</td><td>{tflops:.1f} TFLOPS</td>'
                f'<td class="{css_class}">{ef:.1f}%</td></tr>'
            )

        if not rows:
            return ""
        return (
            '<div class="section"><h2>Compute Throughput</h2>'
            '<table><tr><th>DType</th><th>Achieved</th><th>Efficiency</th></tr>'
            f'{"".join(rows)}</table></div>'
        )

    @classmethod
    def _training_section(cls, training: dict) -> str:
        """Render the training-simulation metric tiles."""
        return (
            '<div class="section"><h2>Training Simulation</h2>'
            + cls._metric(training.get("throughput_tokens_per_sec", "N/A"), "Tokens/sec")
            + cls._metric(training.get("avg_step_time_ms", "N/A"), "Avg Step Time", " ms")
            + cls._metric(training.get("peak_memory_gb", "N/A"), "Peak Memory", " GB")
            + '</div>'
        )
|
||||
266
modules/training_sim.py
Normal file
266
modules/training_sim.py
Normal file
@ -0,0 +1,266 @@
|
||||
"""Training simulation module - LLM training workload with PyTorch."""
|
||||
|
||||
import time
|
||||
import subprocess
|
||||
import shutil
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
|
||||
|
||||
# True only when torch imports successfully AND reports a usable CUDA device;
# TrainingSim.run() skips the simulation when this is False.
TORCH_AVAILABLE = False
try:
    import torch
except ImportError:
    # torch is an optional dependency: leave the flag False and carry on.
    pass
else:
    TORCH_AVAILABLE = torch.cuda.is_available()
|
||||
|
||||
|
||||
class TrainingSim:
    """Time a short training loop on the local CUDA device(s).

    The optional ``"training"`` section of the config supplies ``model``,
    ``batch_size``, ``seq_length``, ``num_steps`` and ``dtype``. A pretrained
    causal LM is used when ``transformers`` can load it; otherwise a
    synthetic transformer built from ``torch.nn`` layers is timed instead.
    """

    def __init__(self, config: dict):
        """Keep the config and read its optional ``"training"`` section."""
        self.config = config
        self.console = Console()
        self.train_cfg = config.get("training", {})

    def run(self) -> dict:
        """Run the simulation and return its metrics.

        Returns:
            A metrics dictionary (see ``_summarize`` for the timing keys), or
            ``{"error": ...}`` when torch/CUDA is unavailable or the
            configured sizes are not positive integers.
        """
        if not TORCH_AVAILABLE:
            self.console.print("[yellow]PyTorch not available - skipping training simulation[/yellow]")
            return {"error": "pytorch_not_available"}

        gpu_count = torch.cuda.device_count()
        model_name = self.train_cfg.get("model", "gpt2")
        batch_size = self.train_cfg.get("batch_size", 8)
        seq_length = self.train_cfg.get("seq_length", 2048)
        num_steps = self.train_cfg.get("num_steps", 50)
        dtype_str = self.train_cfg.get("dtype", "bf16")

        # Zero or negative sizes would otherwise surface later as a
        # ZeroDivisionError when the step times are averaged.
        if not all(self._is_positive_int(v) for v in (batch_size, seq_length, num_steps)):
            self.console.print("[red]batch_size, seq_length and num_steps must be positive integers[/red]")
            return {"error": "invalid_training_config"}

        dtype_map = {
            "fp32": torch.float32,
            "fp16": torch.float16,
            "bf16": torch.bfloat16,
        }
        if dtype_str not in dtype_map:
            # Keep the reported dtype in sync with the one actually used.
            self.console.print(f"[yellow]Unknown dtype '{dtype_str}' - using bf16[/yellow]")
            dtype_str = "bf16"
        dtype = dtype_map[dtype_str]

        self.console.print("[cyan]Training Simulation[/cyan]")
        self.console.print(f" Model: {model_name} | Batch: {batch_size} | Seq: {seq_length} | "
                           f"DType: {dtype_str} | Steps: {num_steps} | GPUs: {gpu_count}")

        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError:
            self.console.print("[yellow]transformers not installed - using synthetic model[/yellow]")
            return self._run_synthetic(gpu_count, batch_size, seq_length, num_steps, dtype)

        # The pretrained run lives in its own method so that, on failure, its
        # model and optimizer locals go out of scope before the fallback
        # allocates a second model.
        result = self._run_pretrained(
            AutoModelForCausalLM, AutoTokenizer, model_name, gpu_count,
            batch_size, seq_length, num_steps, dtype, dtype_str,
        )
        if result is not None:
            return result
        return self._run_synthetic(gpu_count, batch_size, seq_length, num_steps, dtype)

    def _run_pretrained(self, model_cls, tokenizer_cls, model_name, gpu_count,
                        batch_size, seq_length, num_steps, dtype,
                        dtype_str) -> Optional[dict]:
        """Load *model_name* and time it; return ``None`` if anything fails."""
        try:
            self.console.print(f" Loading {model_name}...")
            model = model_cls.from_pretrained(
                model_name,
                torch_dtype=dtype,
                device_map="auto" if gpu_count > 1 else None,
            )
            if gpu_count <= 1:
                # Without a device_map the weights are loaded on the CPU;
                # move them so the benchmark actually exercises the GPU.
                model = model.cuda()

            total_params = sum(p.numel() for p in model.parameters())
            self.console.print(f" Parameters: {total_params / 1e6:.1f}M")

            # The tokenizer is only consulted for the id range of the random batch.
            tokenizer = tokenizer_cls.from_pretrained(model_name)

            # The random batch is fixed for the whole run, so it is placed on
            # the model's device once rather than inside the timed loop.
            input_ids = torch.randint(
                0, tokenizer.vocab_size, (batch_size, seq_length), device=model.device
            )
            attention_mask = torch.ones_like(input_ids)

            model.train()
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

            def compute_loss():
                # Only fp16 runs under autocast; bf16 and fp32 use the
                # model's own parameter dtype directly.
                if dtype == torch.float16:
                    with torch.autocast("cuda", dtype=dtype):
                        return model(input_ids=input_ids, attention_mask=attention_mask,
                                     labels=input_ids).loss
                return model(input_ids=input_ids, attention_mask=attention_mask,
                             labels=input_ids).loss

            step_times, mem_usage, final_loss = self._timed_loop(
                "Training steps...", num_steps, optimizer, compute_loss
            )
        except Exception as e:
            # Deliberately broad: any load or training failure (download
            # error, out-of-memory, unsupported dtype, ...) falls back to the
            # synthetic benchmark instead of aborting the whole test run.
            self.console.print(f"[yellow]Model loading or training failed: {e}[/yellow]")
            return None

        return {
            "model": model_name,
            "total_params_m": round(total_params / 1e6, 1),
            "gpu_count": gpu_count,
            "dtype": dtype_str,
            "batch_size": batch_size,
            "seq_length": seq_length,
            "num_steps": num_steps,
            **self._summarize(step_times, mem_usage, batch_size, seq_length, final_loss),
        }

    def _run_synthetic(self, gpu_count, batch_size, seq_length, num_steps, dtype) -> dict:
        """Time a transformer built from ``torch.nn`` layers on random tokens."""
        self.console.print(" Running synthetic training benchmark...")

        hidden_size = 4096
        num_layers = 6
        num_heads = 32
        vocab_size = 32000

        class SyntheticTransformer(torch.nn.Module):
            """Embedding -> stacked TransformerEncoderLayer -> vocab projection."""

            def __init__(self):
                super().__init__()
                self.embed = torch.nn.Embedding(vocab_size, hidden_size)
                self.layers = torch.nn.ModuleList([
                    torch.nn.TransformerEncoderLayer(
                        d_model=hidden_size, nhead=num_heads,
                        dim_feedforward=hidden_size * 4,
                        batch_first=True,
                        dtype=dtype,
                    ) for _ in range(num_layers)
                ])
                self.head = torch.nn.Linear(hidden_size, vocab_size, dtype=dtype)

            def forward(self, x):
                # The embedding keeps the default dtype; cast its output to
                # match the layers that follow.
                h = self.embed(x).to(dtype)
                for layer in self.layers:
                    h = layer(h)
                return self.head(h)

        model = SyntheticTransformer().cuda()
        total_params = sum(p.numel() for p in model.parameters())
        self.console.print(f" Synthetic params: {total_params / 1e6:.1f}M")

        model.train()
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
        input_ids = torch.randint(0, vocab_size, (batch_size, seq_length)).cuda()

        def compute_loss():
            logits = model(input_ids)
            return torch.nn.functional.cross_entropy(
                logits.view(-1, vocab_size), input_ids.view(-1)
            )

        step_times, mem_usage, final_loss = self._timed_loop(
            "Synthetic training...", num_steps, optimizer, compute_loss
        )

        return {
            "model": "synthetic_transformer",
            "total_params_m": round(total_params / 1e6, 1),
            "num_layers": num_layers,
            "hidden_size": hidden_size,
            "gpu_count": gpu_count,
            "dtype": str(dtype).replace("torch.", ""),
            "batch_size": batch_size,
            "seq_length": seq_length,
            "num_steps": num_steps,
            **self._summarize(step_times, mem_usage, batch_size, seq_length, final_loss),
        }

    def _timed_loop(self, description, num_steps, optimizer, compute_loss):
        """Run *num_steps* optimizer steps and time each one.

        Args:
            description: Label shown next to the progress bar.
            num_steps: Number of forward/backward/step iterations.
            optimizer: Optimizer whose ``step``/``zero_grad`` are called.
            compute_loss: Zero-argument callable returning the loss tensor.

        Returns:
            ``(step_times, mem_usage, final_loss)``: per-step seconds,
            per-step peak allocated GiB, and the last loss as a float
            (``None`` if no step ran).
        """
        step_times = []
        mem_usage = []
        loss = None

        with Progress(
            SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
            BarColumn(), TextColumn("{task.completed}/{task.total}"),
            TimeElapsedColumn(), console=self.console,
        ) as progress:
            task = progress.add_task(description, total=num_steps)

            for _ in range(num_steps):
                # Synchronize on both sides so the wall-clock interval covers
                # the queued GPU work of exactly this step.
                torch.cuda.synchronize()
                t0 = time.perf_counter()

                loss = compute_loss()
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                torch.cuda.synchronize()
                step_times.append(time.perf_counter() - t0)

                # Called without a device argument, as in the original loop;
                # the counter is reset so each entry is that step's peak.
                mem_usage.append(torch.cuda.max_memory_allocated() / 1024**3)
                torch.cuda.reset_peak_memory_stats()

                progress.advance(task)

        final_loss = loss.item() if loss is not None else None
        return step_times, mem_usage, final_loss

    @staticmethod
    def _is_positive_int(value) -> bool:
        """Return True for an ``int`` greater than zero (``bool`` excluded)."""
        return isinstance(value, int) and not isinstance(value, bool) and value > 0

    @staticmethod
    def _summarize(step_times, mem_usage, batch_size, seq_length, final_loss) -> dict:
        """Turn raw per-step measurements into the reported timing metrics.

        Raises:
            ValueError: If *step_times* is empty.
        """
        if not step_times:
            raise ValueError("no training steps were timed")

        avg_step_time = sum(step_times) / len(step_times)
        return {
            "avg_step_time_ms": round(avg_step_time * 1000, 1),
            "throughput_tokens_per_sec": round(batch_size * seq_length / avg_step_time, 0),
            "throughput_samples_per_sec": round(batch_size / avg_step_time, 2),
            "peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2),
            "final_loss": round(final_loss, 4) if final_loss is not None else None,
            "timestamp": datetime.now().isoformat(),
        }

    @staticmethod
    def print_results(results: dict, console: "Optional[Console]" = None):
        """Print *results* as a two-column table, or its ``error`` entry.

        Args:
            results: Dictionary returned by ``run``.
            console: Console to print to; a new one is created when omitted.
        """
        c = console or Console()
        if "error" in results:
            c.print(f"[bold red]Error: {results['error']}[/bold red]")
            return

        c.print("\n[bold cyan]Training Simulation Results[/bold cyan]")

        table = Table(box=None, padding=(0, 1))
        table.add_column("Metric", style="bold")
        table.add_column("Value")

        metrics = [
            ("Model", results.get("model", "N/A")),
            ("Parameters", f"{results.get('total_params_m', 'N/A')}M"),
            ("GPU Count", str(results.get("gpu_count", "N/A"))),
            ("DType", results.get("dtype", "N/A")),
            ("Batch Size", str(results.get("batch_size", "N/A"))),
            ("Seq Length", str(results.get("seq_length", "N/A"))),
            ("Steps", str(results.get("num_steps", "N/A"))),
            ("Avg Step Time", f"{results.get('avg_step_time_ms', 'N/A')} ms"),
            ("Throughput", f"{results.get('throughput_tokens_per_sec', 'N/A')} tokens/s"),
            ("Samples/sec", f"{results.get('throughput_samples_per_sec', 'N/A')}"),
            ("Peak Memory", f"{results.get('peak_memory_gb', 'N/A')} GB"),
            ("Final Loss", str(results.get("final_loss", "N/A"))),
        ]
        for label, val in metrics:
            table.add_row(label, str(val))

        c.print(table)
|
||||
Loading…
x
Reference in New Issue
Block a user