#!/usr/bin/env python3
"""H200 Training Server Test Suite - Main CLI Entry Point."""

# Provenance: reconstructed from git patch
# d4f46b63947c6b55450a852cbdfab1dafb6a0350 (author: qinyusen,
# Sat, 25 Apr 2026 17:23:35 +0800, co-authored by Sisyphus), subject
# "add: CLI entry point with interactive menu and argument parsing",
# which created h200_tester.py.

import argparse
import copy
import json
import os
import signal
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Optional

import yaml
from rich.console import Console
from rich.markup import escape
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
from rich.table import Table
from rich.text import Text
from rich import box

from modules.gpu_info import GPUInfo
from modules.health_check import HealthCheck
from modules.benchmark import Benchmark
from modules.nccl_test import NCCLTest
from modules.training_sim import TrainingSim
from modules.stress_test import StressTest
from modules.rdma_test import RDMATest
from modules.report import ReportGenerator

# Built-in settings used when no YAML configuration is available.
DEFAULT_CONFIG = {
    "benchmark": {
        "memory": {
            "size_mb": 4096,
            "iterations": 10,
            "nvbandwidth_buffer_mb": 512,
            "nvbandwidth_samples": 3,
        },
        "compute": {
            "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8"],
            "matrix_size": 4096,
            "warmup": 10,
            "iterations": 100,
        },
    },
    "health": {"temp_warning": 80, "temp_critical": 90, "power_limit": 700},
    "nccl": {
        "min_bandwidth_gbps": 400,
        "test_allreduce": True,
        "test_alltoall": True,
        "test_broadcast": True,
        "test_reduce_scatter": False,
        "test_allgather": False,
        "test_sendrecv": False,
    },
    "stress": {
        "duration_sec": 60,
        "use_doubles": False,
        "use_tensor_cores": True,
        "memory_pct": 90,
        "gpus": "all",
    },
    "rdma": {
        "min_bandwidth_gbps": 50,
        "max_latency_us": 10,
        "ib_iterations": 1000,
        "msg_size": 65536,
        "ib_device": None,
        "ib_port": 1,
    },
    "training": {
        "model": "gpt2",
        "batch_size": 8,
        "seq_length": 2048,
        "num_steps": 50,
        "dtype": "bf16",
    },
    "report": {"output_dir": "./reports", "format": "json"},
    "tools": {"install_dir": "/opt/h200-test-tools"},
}

# The banner is assembled programmatically so the box always lines up.
# NOTE(review): the interior padding of the original literal was not
# recoverable, so the text is centred in a 50-column box - confirm the width.
_BANNER_WIDTH = 50
_BANNER_TEXT = (
    "",
    "H200 Training Server Test Suite",
    "GPU Diagnostics & Benchmarking Tool",
    "",
)
BANNER = "\n".join(
    [
        "",
        "[bold cyan]",
        "╔" + "═" * _BANNER_WIDTH + "╗",
        *("║" + line.center(_BANNER_WIDTH) + "║" for line in _BANNER_TEXT),
        "╚" + "═" * _BANNER_WIDTH + "╝",
        "[/bold cyan]",
        "",
    ]
)

# Tests that share the plain ``Module(config).run()`` / ``print_results`` flow.
_STANDARD_TESTS = {
    "gpu_info": GPUInfo,
    "health": HealthCheck,
    "nccl": NCCLTest,
    "stress": StressTest,
    "rdma": RDMATest,
    "training": TrainingSim,
}

# Benchmark tests map to a dedicated runner method on ``Benchmark``.
_BENCHMARK_METHODS = {
    "memory_bench": "run_memory_benchmark",
    "compute_bench": "run_compute_benchmark",
}

# Execution order and display names for the full suite.
_SUITE_TESTS = (
    ("gpu_info", "GPU Information"),
    ("health", "Health Check"),
    ("memory_bench", "Memory Benchmark"),
    ("compute_bench", "Compute Benchmark"),
    ("nccl", "NCCL Test"),
    ("stress", "GPU Stress Test"),
    ("rdma", "RDMA/IB Test"),
    ("training", "Training Simulation"),
)

# Interactive menu entries: (key to press, label, action name).
_MENU_ITEMS = (
    ("1", "GPU Information", "gpu_info"),
    ("2", "Health Check", "health"),
    ("3", "Memory Benchmark (nvbandwidth)", "memory_bench"),
    ("4", "Compute Benchmark", "compute_bench"),
    ("5", "NCCL Multi-GPU Test", "nccl"),
    ("6", "GPU Stress Test (gpu-burn)", "stress"),
    ("7", "RDMA/IB Test", "rdma"),
    ("8", "Training Simulation", "training"),
    ("9", "Full Test Suite (All Tests)", "all"),
    ("0", "Generate Report", "report"),
)

# One-line description shown next to each menu entry, keyed by action name.
_MENU_DESCRIPTIONS = {
    "gpu_info": "Detect GPUs, show specs & NVLink topology",
    "health": "Temperature, power, ECC errors, PCIe, DCGM",
    "memory_bench": "HBM3e bandwidth via nvbandwidth",
    "compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8",
    "nccl": "AllReduce, AllToAll, Broadcast via nccl-tests",
    "stress": "Long-running GPU stress via gpu-burn",
    "rdma": "InfiniBand bandwidth & latency (ib_write_bw)",
    "training": "Simulate LLM training with PyTorch",
    "all": "Run all tests sequentially",
    "report": "Export results to JSON/HTML",
}


def load_config(config_path: Optional[str] = None) -> dict:
    """Load the configuration from a YAML file, falling back to defaults.

    Args:
        config_path: Explicit YAML file to read. When omitted,
            ``configs/default.yaml`` next to this script is used if it exists.

    Returns:
        The parsed mapping, or an independent deep copy of ``DEFAULT_CONFIG``
        when no file is available or the file is empty.

    Raises:
        OSError: If an explicitly requested file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the YAML document is not a mapping.
    """
    if config_path is None:
        path = Path(__file__).parent / "configs" / "default.yaml"
        if not path.exists():
            return copy.deepcopy(DEFAULT_CONFIG)
    else:
        path = Path(config_path)

    with open(path, encoding="utf-8") as f:
        loaded = yaml.safe_load(f)

    # An empty document yields a falsy value; never hand out the shared
    # DEFAULT_CONFIG object itself, so callers cannot mutate the defaults.
    if not loaded:
        return copy.deepcopy(DEFAULT_CONFIG)
    if not isinstance(loaded, dict):
        raise ValueError(
            f"{path}: expected a YAML mapping at the top level, "
            f"got {type(loaded).__name__}"
        )
    return loaded


def check_prerequisites(console: Console) -> bool:
    """Check that required external tools are available.

    Args:
        console: Console used to report a missing tool.

    Returns:
        True when ``nvidia-smi`` is found on PATH, False otherwise.
    """
    import shutil

    if shutil.which("nvidia-smi"):
        return True
    console.print("[bold red]ERROR: nvidia-smi not found![/bold red]")
    console.print("  Please install NVIDIA drivers first.")
    return False


def _build_menu_table() -> Table:
    """Build the table listing every interactive menu entry."""
    table = Table(
        title="[bold cyan]Select a Test[/bold cyan]",
        box=box.ROUNDED,
        border_style="cyan",
        show_header=False,
        padding=(0, 2),
    )
    table.add_column("Key", style="bold yellow", width=5)
    table.add_column("Test Name")
    table.add_column("Description", style="dim")
    for key, name, action in _MENU_ITEMS:
        # Text objects bypass markup parsing, so "[q]"-style keys are shown
        # literally instead of being consumed as a style tag.
        table.add_row(Text(f"[{key}]"), name, _MENU_DESCRIPTIONS.get(action, ""))
    table.add_row(Text("[q]"), Text("Quit", style="bold red"), "Exit the program")
    return table


def interactive_menu(config: dict) -> Optional[dict]:
    """Run the interactive menu loop until the user quits.

    Args:
        config: Configuration mapping passed to every test module.

    Returns:
        A dict with a ``timestamp`` and the non-empty results collected under
        ``tests`` (keyed by action name), or None when the prerequisite check
        fails.
    """
    console = Console()

    console.print(BANNER)
    if not check_prerequisites(console):
        return None

    results_store: dict = {"timestamp": datetime.now().isoformat(), "tests": {}}
    action_map = {key: action for key, _, action in _MENU_ITEMS}

    while True:
        console.print()
        console.print(_build_menu_table())
        try:
            choice = console.input("\n[bold green]Enter choice > [/bold green]").strip().lower()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt is treated as a request to quit.
            console.print("\n[dim]Goodbye![/dim]")
            break

        if choice == "q":
            console.print("[dim]Goodbye![/dim]")
            break

        action = action_map.get(choice)
        if action is None:
            # Escape user input so bracketed text cannot be read as markup.
            console.print(f"[yellow]Invalid choice: {escape(choice)}[/yellow]")
            continue

        # NOTE(review): the "report" action is not wired to ReportGenerator
        # yet, so it reports "no results" even when results_store has data.
        result = _run_test(action, config, console)
        if result:
            results_store["tests"][action] = result

    return results_store


def _run_test(test_name: str, config: dict, console: Console) -> dict:
    """Execute a single test by name and print its results.

    Args:
        test_name: Action name such as ``"gpu_info"``, ``"memory_bench"``,
            ``"all"`` or ``"report"``.
        config: Configuration mapping passed to the test module.
        console: Console used for status and error messages.

    Returns:
        The result produced by the test; ``{}`` for ``"report"``; or a dict
        with an ``"error"`` key when the test is unknown, interrupted, or
        raises.
    """
    try:
        if test_name in _STANDARD_TESTS:
            module = _STANDARD_TESTS[test_name](config)
            result = module.run()
            module.print_results(result)
            return result

        if test_name in _BENCHMARK_METHODS:
            bench = Benchmark(config)
            result = getattr(bench, _BENCHMARK_METHODS[test_name])()
            Benchmark.print_results(result)
            return result

        if test_name == "all":
            return _run_full_suite(config, console)

        if test_name == "report":
            console.print("[yellow]No test results to export. Run tests first.[/yellow]")
            return {}

        # Always return a dict instead of silently falling through with None.
        console.print(f"[yellow]Unknown test: {escape(test_name)}[/yellow]")
        return {"error": f"unknown test: {test_name}"}

    except KeyboardInterrupt:
        console.print("\n[yellow]Test interrupted by user.[/yellow]")
        return {"error": "interrupted"}
    except Exception as e:
        # Escape the message: exception text may contain markup-like brackets.
        console.print(f"[bold red]Test failed: {escape(str(e))}[/bold red]")
        return {"error": str(e)}


def _run_full_suite(config: dict, console: Console) -> dict:
    """Run all tests sequentially and print a pass/fail summary.

    A test counts as passed when it ran and its result is not a dict
    containing an ``"error"`` key. Tests skipped after an interruption count
    as not passed.

    Args:
        config: Configuration mapping passed to every test module.
        console: Console used for progress and summary output.

    Returns:
        A dict with a ``timestamp`` plus one entry per executed test.
    """
    console.print(Panel("[bold cyan]Running Full Test Suite[/bold cyan]", box=box.DOUBLE))
    all_results: dict = {"timestamp": datetime.now().isoformat()}
    total = len(_SUITE_TESTS)

    for index, (key, name) in enumerate(_SUITE_TESTS, 1):
        console.print(f"\n[bold cyan][{index}/{total}] {name}[/bold cyan]")
        try:
            if key in _BENCHMARK_METHODS:
                mod = Benchmark(config)
                result = getattr(mod, _BENCHMARK_METHODS[key])()
            else:
                mod = _STANDARD_TESTS[key](config)
                result = mod.run()
            mod.print_results(result)
            all_results[key] = result
        except KeyboardInterrupt:
            # Keep what was collected so far and still print the summary.
            console.print(f"\n[yellow]{name} interrupted by user; skipping remaining tests.[/yellow]")
            all_results[key] = {"error": "interrupted"}
            break
        except Exception as e:
            console.print(f"[bold red]{name} FAILED: {escape(str(e))}[/bold red]")
            all_results[key] = {"error": str(e)}

    # Summary: count only real test entries. Iterating over every value would
    # also count the "timestamp" string as a passed test.
    console.print("\n" + "=" * 60)
    passed = 0
    for key, _ in _SUITE_TESTS:
        if key not in all_results:
            continue
        outcome = all_results[key]
        if not (isinstance(outcome, dict) and "error" in outcome):
            passed += 1
    color = "green" if passed == total else ("yellow" if passed > 0 else "red")
    console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]")
    return all_results


def main():
    """Parse command-line arguments and run the requested mode.

    With no ``--test`` (or with ``--interactive``) the interactive menu is
    shown; otherwise the selected test runs once. Exits with status 1 when
    the prerequisite check fails in CLI mode, and with a usage error when the
    configuration cannot be loaded.
    """
    parser = argparse.ArgumentParser(
        description="H200 Training Server Test Suite",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python h200_tester.py                          # Interactive menu
  python h200_tester.py --test gpu-info          # GPU info
  python h200_tester.py --test health            # Health check
  python h200_tester.py --test benchmark --type memory
  python h200_tester.py --test benchmark --type compute --dtype fp16
  python h200_tester.py --test nccl              # NCCL test
  python h200_tester.py --test training          # Training sim
  python h200_tester.py --test all               # Full suite
  python h200_tester.py --report --format json --output report.json
        """,
    )
    parser.add_argument(
        "--test",
        choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"],
        help="Run a specific test",
    )
    parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)")
    parser.add_argument(
        "--dtype",
        choices=["fp32", "tf32", "fp16", "bf16", "fp8"],
        help="Compute benchmark dtype (with --test benchmark --type compute)",
    )
    parser.add_argument("--interactive", action="store_true", help="Force interactive mode")
    parser.add_argument("--report", action="store_true", help="Generate report from last results")
    parser.add_argument("--format", choices=["json", "html"], default="json", help="Report format")
    parser.add_argument("--output", default=None, help="Report output file path")
    parser.add_argument("--config", default=None, help="Path to config YAML file")

    args = parser.parse_args()

    # --config replaces the default configuration. An empty file falls back
    # to the built-in defaults instead of leaving the config as None.
    try:
        config = load_config(args.config)
    except (OSError, ValueError, yaml.YAMLError) as e:
        parser.error(f"could not load configuration: {e}")

    console = Console()

    # Handle --report standalone
    if args.report and not args.test:
        console.print("[yellow]Run tests first to generate a report.[/yellow]")
        return

    # Interactive mode
    if args.interactive or not args.test:
        interactive_menu(config)
        return

    # CLI mode
    if not check_prerequisites(console):
        sys.exit(1)

    if args.test == "benchmark":
        bench = Benchmark(config)
        if args.type == "memory":
            result = bench.run_memory_benchmark()
        elif args.type == "compute":
            result = bench.run_compute_benchmark(dtypes=[args.dtype] if args.dtype else None)
        else:
            # Run both
            result = bench.run()
        Benchmark.print_results(result)
    elif args.test == "all":
        _run_full_suite(config, console)
    else:
        # CLI names use hyphens ("gpu-info"); internal names use underscores.
        _run_test(args.test.replace("-", "_"), config, console)


if __name__ == "__main__":
    main()