test_gpu_scripts/h200_tester.py
qinyusen d4f46b6394 add: CLI entry point with interactive menu and argument parsing
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-25 17:23:35 +08:00

371 lines
13 KiB
Python

#!/usr/bin/env python3
"""H200 Training Server Test Suite - Main CLI Entry Point."""
import argparse
import json
import os
import signal
import sys
import time
from datetime import datetime
from pathlib import Path
import yaml
from rich.console import Console
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
from rich.table import Table
from rich.text import Text
from rich import box
from modules.gpu_info import GPUInfo
from modules.health_check import HealthCheck
from modules.benchmark import Benchmark
from modules.nccl_test import NCCLTest
from modules.training_sim import TrainingSim
from modules.stress_test import StressTest
from modules.rdma_test import RDMATest
from modules.report import ReportGenerator
# Built-in fallback settings. load_config() hands these out when
# configs/default.yaml is missing or parses to an empty document.
# NOTE(review): the top-level section names look like they map one-to-one onto
# the classes imported from modules/ — confirm against each module's reader.
DEFAULT_CONFIG = {
    "benchmark": {
        "memory": {
            "size_mb": 4096,
            "iterations": 10,
            "nvbandwidth_buffer_mb": 512,
            "nvbandwidth_samples": 3,
        },
        "compute": {
            "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8"],
            "matrix_size": 4096,
            "warmup": 10,
            "iterations": 100,
        },
    },
    "health": {
        "temp_warning": 80,
        "temp_critical": 90,
        "power_limit": 700,
    },
    "nccl": {
        "min_bandwidth_gbps": 400,
        # Collectives enabled by default.
        "test_allreduce": True,
        "test_alltoall": True,
        "test_broadcast": True,
        # Collectives disabled by default.
        "test_reduce_scatter": False,
        "test_allgather": False,
        "test_sendrecv": False,
    },
    "stress": {
        "duration_sec": 60,
        "use_doubles": False,
        "use_tensor_cores": True,
        "memory_pct": 90,
        "gpus": "all",
    },
    "rdma": {
        "min_bandwidth_gbps": 50,
        "max_latency_us": 10,
        "ib_iterations": 1000,
        "msg_size": 65536,
        # No device pinned by default.
        "ib_device": None,
        "ib_port": 1,
    },
    "training": {
        "model": "gpt2",
        "batch_size": 8,
        "seq_length": 2048,
        "num_steps": 50,
        "dtype": "bf16",
    },
    "report": {
        "output_dir": "./reports",
        "format": "json",
    },
    "tools": {
        "install_dir": "/opt/h200-test-tools",
    },
}
# Startup banner printed by interactive_menu(). The [bold cyan] ... [/bold cyan]
# pair is Rich console markup; the r-prefix keeps the text literal.
# NOTE(review): the interior padding of the box looks collapsed relative to the
# 50-column border — confirm the spacing against the checked-in file.
BANNER = r"""
[bold cyan]
╔══════════════════════════════════════════════════╗
║ ║
║ H200 Training Server Test Suite ║
║ GPU Diagnostics & Benchmarking Tool ║
║ ║
╚══════════════════════════════════════════════════╝
[/bold cyan]
"""
def load_config() -> dict:
    """Load settings from ``configs/default.yaml`` next to this script.

    Returns:
        The parsed YAML document when the file exists and parses to a truthy
        value. Otherwise an independent deep copy of ``DEFAULT_CONFIG``, so a
        caller that mutates a nested section never alters the module-level
        defaults.
    """
    # Function-scope import, matching the local-import style used by
    # check_prerequisites() in this file.
    import copy

    config_path = Path(__file__).parent / "configs" / "default.yaml"
    if config_path.exists():
        with open(config_path, encoding="utf-8") as f:
            loaded = yaml.safe_load(f)
        # An empty YAML document parses to None; fall through to the defaults.
        if loaded:
            return loaded
    # deepcopy, not dict.copy(): the shallow copy would still share every
    # nested section dict with DEFAULT_CONFIG.
    return copy.deepcopy(DEFAULT_CONFIG)
def check_prerequisites(console: Console) -> bool:
    """Verify that the external tools this suite depends on are installed.

    Currently the only requirement is an ``nvidia-smi`` executable on PATH.

    Args:
        console: Console used to report a missing tool.

    Returns:
        True when every required tool was found, False otherwise.
    """
    from shutil import which

    # Guard clause: nothing to report when the driver tool is present.
    if which("nvidia-smi"):
        return True

    console.print("[bold red]ERROR: nvidia-smi not found![/bold red]")
    console.print(" Please install NVIDIA drivers first.")
    return False
def interactive_menu(config: dict):
    """Run the interactive menu loop until the user quits.

    Args:
        config: Configuration mapping passed unchanged to every test.

    Returns:
        None when the prerequisite check fails. Otherwise a dict holding
        ``"timestamp"`` (ISO string taken when the menu started) and
        ``"tests"``, which maps each chosen action key to the most recent
        non-empty result that action produced.
    """
    console = Console()
    console.print(BANNER)
    if not check_prerequisites(console):
        return

    results_store: dict = {"timestamp": datetime.now().isoformat(), "tests": {}}

    # (menu key, display name, action name handed to _run_test)
    menu_items = [
        ("1", "GPU Information", "gpu_info"),
        ("2", "Health Check", "health"),
        ("3", "Memory Benchmark (nvbandwidth)", "memory_bench"),
        ("4", "Compute Benchmark", "compute_bench"),
        ("5", "NCCL Multi-GPU Test", "nccl"),
        ("6", "GPU Stress Test (gpu-burn)", "stress"),
        ("7", "RDMA/IB Test", "rdma"),
        ("8", "Training Simulation", "training"),
        ("9", "Full Test Suite (All Tests)", "all"),
        ("0", "Generate Report", "report"),
    ]
    descriptions = {
        "gpu_info": "Detect GPUs, show specs & NVLink topology",
        "health": "Temperature, power, ECC errors, PCIe, DCGM",
        "memory_bench": "HBM3e bandwidth via nvbandwidth",
        "compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8",
        "nccl": "AllReduce, AllToAll, Broadcast via nccl-tests",
        "stress": "Long-running GPU stress via gpu-burn",
        "rdma": "InfiniBand bandwidth & latency (ib_write_bw)",
        "training": "Simulate LLM training with PyTorch",
        "all": "Run all tests sequentially",
        "report": "Export results to JSON/HTML",
    }
    action_map = {key: action for key, _name, action in menu_items}

    # The menu never changes between iterations, so build the table once and
    # re-print the same renderable on every pass.
    table = Table(
        title="[bold cyan]Select a Test[/bold cyan]",
        box=box.ROUNDED,
        border_style="cyan",
        show_header=False,
        padding=(0, 2),
    )
    table.add_column("Key", style="bold yellow", width=5)
    table.add_column("Test Name")
    table.add_column("Description", style="dim")
    for key, name, action in menu_items:
        # Text cells bypass Rich markup parsing, so the brackets stay literal.
        table.add_row(Text(f"[{key}]"), name, descriptions.get(action, ""))
    # A plain "[q]" string would be consumed as a markup tag and render as an
    # empty cell; wrapping it in Text keeps the key visible.
    table.add_row(Text("[q]"), Text("Quit", style="bold red"), "Exit the program")

    while True:
        console.print()
        console.print(table)
        try:
            choice = console.input("\n[bold green]Enter choice > [/bold green]").strip().lower()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt: leave the loop cleanly and
            # still return what was collected, instead of dying on a traceback.
            console.print("\n[dim]Goodbye![/dim]")
            break
        if choice == "q":
            console.print("[dim]Goodbye![/dim]")
            break

        action = action_map.get(choice)
        if action is None:
            console.print(f"[yellow]Invalid choice: {choice}[/yellow]")
            continue

        # NOTE(review): the "report" action is forwarded to _run_test(), which
        # only prints a "run tests first" hint and never sees results_store —
        # wiring in ReportGenerator needs its API, which is not visible here.
        result = _run_test(action, config, console)
        if result:
            results_store["tests"][action] = result
    return results_store
def _run_test(test_name: str, config: dict, console: Console) -> dict:
    """Execute a single test by name and print its results.

    Args:
        test_name: One of ``gpu_info``, ``health``, ``memory_bench``,
            ``compute_bench``, ``nccl``, ``stress``, ``rdma``, ``training``,
            ``all`` (delegates to ``_run_full_suite``) or ``report``.
        config: Configuration mapping handed to the test module constructor.
        console: Console used for status and error messages.

    Returns:
        The result produced by the test. ``{}`` for ``report`` and for an
        unrecognised name. ``{"error": "interrupted"}`` when the user presses
        Ctrl-C, or ``{"error": <message>}`` when the test raises.
    """
    # test name -> (module class, name of the method that produces the result)
    runners = {
        "gpu_info": (GPUInfo, "run"),
        "health": (HealthCheck, "run"),
        "memory_bench": (Benchmark, "run_memory_benchmark"),
        "compute_bench": (Benchmark, "run_compute_benchmark"),
        "nccl": (NCCLTest, "run"),
        "stress": (StressTest, "run"),
        "rdma": (RDMATest, "run"),
        "training": (TrainingSim, "run"),
    }
    try:
        if test_name == "all":
            return _run_full_suite(config, console)
        if test_name == "report":
            console.print("[yellow]No test results to export. Run tests first.[/yellow]")
            return {}
        if test_name not in runners:
            # Previously fell off the end and returned None despite the
            # declared dict return type.
            console.print(f"[yellow]Unknown test: {test_name}[/yellow]")
            return {}

        module_cls, method_name = runners[test_name]
        module = module_cls(config)
        result = getattr(module, method_name)()
        # Always print through the instance, as _run_full_suite does; this
        # works whether print_results is a static or a bound method.
        module.print_results(result)
        return result
    except KeyboardInterrupt:
        console.print("\n[yellow]Test interrupted by user.[/yellow]")
        return {"error": "interrupted"}
    except Exception as e:
        # Top-level boundary for a single test: report and keep the CLI alive.
        console.print(f"[bold red]Test failed: {e}[/bold red]")
        return {"error": str(e)}
def _run_full_suite(config: dict, console: Console) -> dict:
    """Run every test in sequence and print a pass/fail summary.

    A test that raises is recorded as ``{"error": <message>}`` and the suite
    continues with the next one.

    Args:
        config: Configuration mapping handed to each test module constructor.
        console: Console used for progress and summary output.

    Returns:
        A dict with ``"timestamp"`` (ISO string taken at suite start) plus one
        entry per test key holding that test's result or error record.
    """
    console.print(Panel("[bold cyan]Running Full Test Suite[/bold cyan]", box=box.DOUBLE))
    all_results: dict = {"timestamp": datetime.now().isoformat()}

    # (result key, display name, module class, method that produces the result)
    tests = [
        ("gpu_info", "GPU Information", GPUInfo, "run"),
        ("health", "Health Check", HealthCheck, "run"),
        ("memory_bench", "Memory Benchmark", Benchmark, "run_memory_benchmark"),
        ("compute_bench", "Compute Benchmark", Benchmark, "run_compute_benchmark"),
        ("nccl", "NCCL Test", NCCLTest, "run"),
        ("stress", "GPU Stress Test", StressTest, "run"),
        ("rdma", "RDMA/IB Test", RDMATest, "run"),
        ("training", "Training Simulation", TrainingSim, "run"),
    ]
    total = len(tests)

    for i, (key, name, mod_cls, method_name) in enumerate(tests, 1):
        console.print(f"\n[bold cyan][{i}/{len(tests)}] {name}[/bold cyan]")
        try:
            mod = mod_cls(config)
            result = getattr(mod, method_name)()
            mod.print_results(result)
            all_results[key] = result
        except Exception as e:
            # One failing test must not abort the remaining ones.
            console.print(f"[bold red]{name} FAILED: {e}[/bold red]")
            all_results[key] = {"error": str(e)}

    # Summary
    console.print("\n" + "=" * 60)
    # Count test entries only. Iterating over every value would also count the
    # "timestamp" string as a pass, giving 9/8 on a clean run and 1/8 (yellow
    # rather than red) when every test failed.
    passed = 0
    for key, _name, _cls, _method in tests:
        outcome = all_results.get(key)
        if not (isinstance(outcome, dict) and "error" in outcome):
            passed += 1
    color = "green" if passed == total else ("yellow" if passed > 0 else "red")
    console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]")
    return all_results
def main():
    """Parse command-line arguments and dispatch to the requested mode.

    With no ``--test`` (or with ``--interactive``) the interactive menu is
    shown. Otherwise the selected test runs once, after the prerequisite
    check; the process exits with status 1 if that check fails.
    """
    parser = argparse.ArgumentParser(
        description="H200 Training Server Test Suite",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python h200_tester.py # Interactive menu
python h200_tester.py --test gpu-info # GPU info
python h200_tester.py --test health # Health check
python h200_tester.py --test benchmark --type memory
python h200_tester.py --test benchmark --type compute --dtype fp16
python h200_tester.py --test nccl # NCCL test
python h200_tester.py --test training # Training sim
python h200_tester.py --test all # Full suite
python h200_tester.py --report --format json --output report.json
""",
    )
    parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"],
                        help="Run a specific test")
    parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)")
    parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8"],
                        help="Compute benchmark dtype (with --test benchmark --type compute)")
    parser.add_argument("--interactive", action="store_true", help="Force interactive mode")
    parser.add_argument("--report", action="store_true", help="Generate report from last results")
    parser.add_argument("--format", choices=["json", "html"], default="json", help="Report format")
    parser.add_argument("--output", default=None, help="Report output file path")
    parser.add_argument("--config", default=None, help="Path to config YAML file")
    args = parser.parse_args()

    config = load_config()
    # Override config with CLI args
    if args.config:
        with open(args.config, encoding="utf-8") as f:
            # An empty YAML file parses to None; keep the already-loaded
            # config rather than handing None to every test module.
            config = yaml.safe_load(f) or config

    console = Console()

    # Handle --report standalone
    # NOTE(review): --report combined with --test is silently ignored, and
    # --format / --output are parsed but never read in this function.
    if args.report and not args.test:
        console.print("[yellow]Run tests first to generate a report.[/yellow]")
        return

    # Interactive mode
    if args.interactive or not args.test:
        interactive_menu(config)
        return

    # CLI mode
    if not check_prerequisites(console):
        sys.exit(1)

    # CLI spelling -> internal test name understood by _run_test().
    # "benchmark" and "all" are dispatched explicitly below.
    test_map = {
        "gpu-info": "gpu_info",
        "health": "health",
        "nccl": "nccl",
        "stress": "stress",
        "rdma": "rdma",
        "training": "training",
    }

    if args.test == "benchmark":
        bench = Benchmark(config)
        if args.type == "memory":
            result = bench.run_memory_benchmark()
        elif args.type == "compute":
            result = bench.run_compute_benchmark(dtypes=[args.dtype] if args.dtype else None)
        else:
            # Run both
            result = bench.run()
        # Print through the instance, as _run_full_suite does; this works
        # whether print_results is a static or a bound method.
        bench.print_results(result)
    elif args.test == "all":
        _run_full_suite(config, console)
    else:
        _run_test(test_map[args.test], config, console)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()