#!/usr/bin/env python3
"""GPU Training Server Test Suite - Main CLI Entry Point."""

import argparse
import copy
import json
import os
import shutil
import signal
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Optional

import yaml
from rich.console import Console
from rich.markup import escape
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
from rich.table import Table
from rich.text import Text
from rich import box

from modules.gpu_info import GPUInfo
from modules.health_check import HealthCheck
from modules.benchmark import Benchmark
from modules.nccl_test import NCCLTest
from modules.training_sim import TrainingSim
from modules.stress_test import StressTest
from modules.rdma_test import RDMATest
from modules.report import ReportGenerator
from modules.gpu_specs import (
    detect_gpu_type,
    get_gpu_specs,
    get_gpu_label,
    get_supported_gpus,
    validate_driver_compatibility,
)

# Built-in configuration used when no YAML config is available (or it is empty).
# Treat as read-only: load_config() always hands out a deep copy.
DEFAULT_CONFIG = {
    "benchmark": {
        "memory": {
            "size_mb": 4096,
            "iterations": 10,
            "nvbandwidth_buffer_mb": 512,
            "nvbandwidth_samples": 3,
        },
        "compute": {
            "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8"],
            "matrix_size": 4096,
            "warmup": 10,
            "iterations": 100,
        },
    },
    "health": {"temp_warning": 80, "temp_critical": 90, "power_limit": None},
    "nccl": {
        "min_bandwidth_gbps": None,
        "test_allreduce": True,
        "test_alltoall": True,
        "test_broadcast": True,
        "test_reduce_scatter": False,
        "test_allgather": False,
        "test_sendrecv": False,
    },
    "stress": {
        "duration_sec": 60,
        "use_doubles": False,
        "use_tensor_cores": True,
        "memory_pct": 90,
        "gpus": "all",
    },
    "rdma": {
        "min_bandwidth_gbps": 50,
        "max_latency_us": 10,
        "ib_iterations": 1000,
        "msg_size": 65536,
        "ib_device": None,
        "ib_port": 1,
    },
    "training": {
        "model": "gpt2",
        "batch_size": 8,
        "seq_length": 2048,
        "num_steps": 50,
        "dtype": "bf16",
    },
    "report": {"output_dir": "./reports", "format": "json"},
    "tools": {"install_dir": "/opt/gpu-test-tools"},
}

# Minimum number of character cells between the banner's vertical borders.
_BANNER_INNER_WIDTH = 58
# Left margin (in spaces) of the text rows inside the banner box.
_BANNER_MARGIN = 3

# (result key, display name, module class, name of the method that runs the test).
# Shared by _run_test() and _run_full_suite() so both stay in sync.
_TEST_SPECS = [
    ("gpu_info", "GPU Information", GPUInfo, "run"),
    ("health", "Health Check", HealthCheck, "run"),
    ("memory_bench", "Memory Benchmark", Benchmark, "run_memory_benchmark"),
    ("compute_bench", "Compute Benchmark", Benchmark, "run_compute_benchmark"),
    ("nccl", "NCCL Test", NCCLTest, "run"),
    ("stress", "GPU Stress Test", StressTest, "run"),
    ("rdma", "RDMA/IB Test", RDMATest, "run"),
    ("training", "Training Simulation", TrainingSim, "run"),
]

# (menu key, label shown in the menu, action name passed to _run_test()).
_MENU_ITEMS = [
    ("1", "GPU Information", "gpu_info"),
    ("2", "Health Check", "health"),
    ("3", "Memory Benchmark (nvbandwidth)", "memory_bench"),
    ("4", "Compute Benchmark", "compute_bench"),
    ("5", "NCCL Multi-GPU Test", "nccl"),
    ("6", "GPU Stress Test (gpu-burn)", "stress"),
    ("7", "RDMA/IB Test", "rdma"),
    ("8", "Training Simulation", "training"),
    ("9", "Full Test Suite (All Tests)", "all"),
    ("0", "Generate Report", "report"),
]

# Action name -> one-line description shown in the menu's third column.
_MENU_DESCRIPTIONS = {
    "gpu_info": "Detect GPUs, show specs & NVLink topology",
    "health": "Temperature, power, ECC errors, PCIe, DCGM",
    "memory_bench": "HBM bandwidth via nvbandwidth",
    "compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8",
    "nccl": "AllReduce, AllToAll, Broadcast via nccl-tests",
    "stress": "Long-running GPU stress via gpu-burn",
    "rdma": "InfiniBand bandwidth & latency (ib_write_bw)",
    "training": "Simulate LLM training with PyTorch",
    "all": "Run all tests sequentially",
    "report": "Export results to JSON/HTML",
}


def _build_banner() -> str:
    """Return the startup banner as a Rich-markup string.

    The box is generated programmatically so that every row is padded to the
    same width as the top/bottom border, and it widens automatically when the
    list of supported GPUs is too long for the default width.
    """
    gpu_list = " / ".join(g.upper() for g in get_supported_gpus())
    rows = [
        "",
        "GPU Training Server Test Suite",
        "Diagnostics & Benchmarking Tool",
        f"Supports: {gpu_list}",
        "",
    ]
    indent = " " * _BANNER_MARGIN
    longest = max(len(row) for row in rows)
    width = max(_BANNER_INNER_WIDTH, longest + 2 * _BANNER_MARGIN)
    body = "".join(f"║{(indent + row).ljust(width)}║\n" for row in rows)
    border = "═" * width
    return f"[bold cyan]\n╔{border}╗\n{body}╚{border}╝\n[/bold cyan]"


def load_config(config_path: Optional[Path] = None) -> dict:
    """Load the configuration from a YAML file, falling back to defaults.

    Args:
        config_path: Explicit path to a YAML file. When omitted, the file
            ``configs/default.yaml`` next to this script is used if it exists.

    Returns:
        The parsed YAML mapping, or a deep copy of ``DEFAULT_CONFIG`` when no
        file is available or the file is empty. A deep copy is returned so
        callers can mutate the result without altering the module defaults.

    Raises:
        OSError: An explicitly requested ``config_path`` cannot be opened.
        yaml.YAMLError: The file is not valid YAML.
        ValueError: The file's top-level YAML value is not a mapping.
    """
    if config_path is None:
        path = Path(__file__).parent / "configs" / "default.yaml"
        if not path.exists():
            return copy.deepcopy(DEFAULT_CONFIG)
    else:
        # An explicitly requested file must exist; open() raises otherwise.
        path = Path(config_path)

    with open(path, encoding="utf-8") as f:
        loaded = yaml.safe_load(f)

    if loaded is None:
        # Empty file: safe_load() yields None.
        return copy.deepcopy(DEFAULT_CONFIG)
    if not isinstance(loaded, dict):
        raise ValueError(
            f"{path}: top-level YAML value must be a mapping, "
            f"got {type(loaded).__name__}"
        )
    return loaded


def check_prerequisites(console: Console) -> bool:
    """Check that required external tools are available.

    Currently only ``nvidia-smi`` is looked up on ``PATH``; an error is
    printed to ``console`` when it is missing.

    Returns:
        True when all required tools were found, False otherwise.
    """
    if shutil.which("nvidia-smi"):
        return True
    console.print("[bold red]ERROR: nvidia-smi not found![/bold red]")
    console.print("  Please install NVIDIA drivers first.")
    return False


def _print_compat_warnings(console: Console, gpu_type: str) -> None:
    """Print each driver/CUDA compatibility warning for ``gpu_type``."""
    for w in validate_driver_compatibility(gpu_type):
        # Escape so bracketed text inside a warning is not read as Rich markup.
        console.print(f"[bold yellow]\u26a0 {escape(str(w))}[/bold yellow]")


def _build_menu_table() -> Table:
    """Build the Rich table listing every menu entry plus the quit row."""
    table = Table(
        title="[bold cyan]Select a Test[/bold cyan]",
        box=box.ROUNDED,
        border_style="cyan",
        show_header=False,
        padding=(0, 2),
    )
    table.add_column("Key", style="bold yellow", width=5)
    table.add_column("Test Name")
    table.add_column("Description", style="dim")
    for key, name, action in _MENU_ITEMS:
        table.add_row(escape(f"[{key}]"), name, _MENU_DESCRIPTIONS.get(action, ""))
    # "[q]" must be escaped: Rich would otherwise parse it as a markup tag.
    table.add_row(escape("[q]"), Text("Quit", style="bold red"), "Exit the program")
    return table


def interactive_menu(config: dict) -> Optional[dict]:
    """Run the interactive menu loop until the user quits.

    Args:
        config: Configuration dict passed on to every test module. When it
            contains a truthy ``"gpu_type"`` that value is used; otherwise the
            GPU type is auto-detected.

    Returns:
        A dict with ``"timestamp"`` and ``"tests"`` (action name -> result)
        collected during the session, or None when the prerequisite check
        fails before the menu is shown.
    """
    console = Console()
    console.print(_build_banner())

    # Honour an explicit override (e.g. --gpu-type) instead of re-detecting.
    gpu_type = config.get("gpu_type") or detect_gpu_type()
    gpu_label = get_gpu_label(gpu_type)
    if gpu_type != "unknown":
        console.print(
            f"[bold green]Detected GPU: {gpu_label} ({gpu_type.upper()})[/bold green]\n"
        )
    else:
        console.print(
            "[yellow]GPU type could not be auto-detected. "
            "Using default thresholds.[/yellow]\n"
        )

    # Driver / CUDA compatibility check
    _print_compat_warnings(console, gpu_type)

    if not check_prerequisites(console):
        return None

    results_store: dict = {"timestamp": datetime.now().isoformat(), "tests": {}}
    action_map = {key: action for key, _name, action in _MENU_ITEMS}
    table = _build_menu_table()

    while True:
        console.print()
        console.print(table)

        try:
            choice = (
                console.input("\n[bold green]Enter choice > [/bold green]")
                .strip()
                .lower()
            )
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt is treated as a quit request.
            console.print()
            choice = "q"

        if choice == "q":
            if results_store.get("tests"):
                _save_results_prompt(results_store, config, console)
            console.print("[dim]Goodbye![/dim]")
            break

        action = action_map.get(choice)
        if action is None:
            # Escape user input: stray brackets would break Rich markup parsing.
            console.print(f"[yellow]Invalid choice: {escape(choice)}[/yellow]")
            continue

        result = _run_test(action, config, console)
        if not result:
            continue

        if result.get("__report__"):
            if results_store.get("tests"):
                ReportGenerator(config).generate(results_store)
            else:
                console.print(
                    "[yellow]No test results to export. Run tests first.[/yellow]"
                )
        else:
            results_store["tests"][action] = result

    return results_store


def _save_results_prompt(results_store: dict, config: dict, console: Console):
    """Ask whether to save collected results and generate a report on "y".

    Does nothing when ``results_store`` holds no test results. A closed stdin
    (EOF) is treated as declining.
    """
    if not results_store.get("tests"):
        return
    try:
        # "\\[" escapes the bracket so Rich prints "[y/N]" literally.
        save = (
            console.input(
                "[bold green]Save results before quitting? \\[y/N]: [/bold green]"
            )
            .strip()
            .lower()
        )
    except EOFError:
        return
    if save == "y":
        ReportGenerator(config).generate(results_store)


def _execute_test(module_cls, method_name: str, config: dict):
    """Instantiate ``module_cls``, call ``method_name``, print and return the result."""
    module = module_cls(config)
    result = getattr(module, method_name)()
    module.print_results(result)
    return result


def _run_test(test_name: str, config: dict, console: Console) -> Optional[dict]:
    """Execute a single test (or pseudo-action) by name.

    Args:
        test_name: One of the keys in ``_TEST_SPECS``, ``"all"`` for the full
            suite, or ``"report"``.
        config: Configuration dict handed to the test module.
        console: Console used for error output.

    Returns:
        The test's result; ``{"__report__": True}`` for ``"report"``;
        ``{"error": ...}`` when the test raised or was interrupted; None for
        an unrecognised ``test_name``.
    """
    if test_name == "report":
        return {"__report__": True}

    try:
        if test_name == "all":
            return _run_full_suite(config, console)
        for key, _name, module_cls, method_name in _TEST_SPECS:
            if key == test_name:
                return _execute_test(module_cls, method_name, config)
        return None
    except KeyboardInterrupt:
        console.print("\n[yellow]Test interrupted by user.[/yellow]")
        return {"error": "interrupted"}
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing the menu.
        console.print(f"[bold red]Test failed: {escape(str(e))}[/bold red]")
        return {"error": str(e)}


def _run_full_suite(config: dict, console: Console) -> dict:
    """Run every test in ``_TEST_SPECS`` sequentially and print a summary.

    A failing test is recorded as ``{"error": <message>}`` and the suite
    continues. Ctrl-C records the current test as interrupted and stops the
    suite, keeping the results gathered so far.

    Returns:
        Dict with a ``"timestamp"`` entry plus one entry per executed test.
    """
    console.print(
        Panel("[bold cyan]Running Full Test Suite[/bold cyan]", box=box.DOUBLE)
    )
    all_results: dict = {"timestamp": datetime.now().isoformat()}
    planned = len(_TEST_SPECS)

    for i, (key, name, module_cls, method_name) in enumerate(_TEST_SPECS, 1):
        console.print(f"\n[bold cyan][{i}/{planned}] {name}[/bold cyan]")
        try:
            all_results[key] = _execute_test(module_cls, method_name, config)
        except KeyboardInterrupt:
            console.print("\n[yellow]Test interrupted by user.[/yellow]")
            all_results[key] = {"error": "interrupted"}
            break
        except Exception as e:
            console.print(f"[bold red]{name} FAILED: {escape(str(e))}[/bold red]")
            all_results[key] = {"error": str(e)}

    # Summary
    console.print("\n" + "=" * 60)
    # Only count test results, exclude metadata like timestamp
    test_results = {k: v for k, v in all_results.items() if k != "timestamp"}
    passed = sum(
        1 for v in test_results.values() if not isinstance(v, dict) or "error" not in v
    )
    total = len(test_results)
    color = "green" if passed == total else ("yellow" if passed > 0 else "red")
    console.print(
        f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]"
    )
    return all_results


def _has_error(result) -> bool:
    """Return True when ``result`` is a dict carrying an ``"error"`` key."""
    return isinstance(result, dict) and "error" in result


def main():
    """Parse command-line arguments and run the requested mode.

    Without ``--test`` (or with ``--interactive``) the interactive menu is
    started. Otherwise the selected test runs once, an optional report is
    generated, and the process exits with status 1 when a result carries an
    error.
    """
    gpu_list_str = " / ".join(g.upper() for g in get_supported_gpus())
    parser = argparse.ArgumentParser(
        description=f"GPU Training Server Test Suite ({gpu_list_str})",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python gpu_tester.py                                  # Interactive menu
  python gpu_tester.py --gpu-type a800                  # Override GPU type
  python gpu_tester.py --test gpu-info                  # GPU info
  python gpu_tester.py --test health                    # Health check
  python gpu_tester.py --test benchmark --type memory
  python gpu_tester.py --test benchmark --type compute --dtype fp16
  python gpu_tester.py --test nccl                      # NCCL test
  python gpu_tester.py --test training                  # Training sim
  python gpu_tester.py --test all                       # Full suite
  python gpu_tester.py --report --format json --output report.json
""",
    )
    parser.add_argument(
        "--test",
        choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"],
        help="Run a specific test",
    )
    parser.add_argument(
        "--type",
        choices=["memory", "compute"],
        help="Benchmark type (with --test benchmark)",
    )
    parser.add_argument(
        "--dtype",
        choices=["fp32", "tf32", "fp16", "bf16", "fp8"],
        help="Compute benchmark dtype (with --test benchmark --type compute)",
    )
    parser.add_argument("--interactive", action="store_true", help="Force interactive mode")
    parser.add_argument("--report", action="store_true", help="Generate report from last results")
    parser.add_argument(
        "--format", choices=["json", "html", "md"], default="json", help="Report format"
    )
    parser.add_argument("--output", default=None, help="Report output file path")
    parser.add_argument("--config", default=None, help="Path to config YAML file")
    parser.add_argument(
        "--gpu-type",
        choices=["auto"] + get_supported_gpus(),
        default="auto",
        help="Override GPU type detection",
    )

    args = parser.parse_args()

    # Load the config first so the GPU type can be written into the final dict.
    try:
        config = load_config(args.config or None)
    except (OSError, yaml.YAMLError, ValueError) as exc:
        parser.error(f"could not load configuration: {exc}")

    # Set GPU type after config is finalized
    if args.gpu_type and args.gpu_type != "auto":
        config["gpu_type"] = args.gpu_type
    else:
        config["gpu_type"] = detect_gpu_type()

    console = Console()

    # Handle --report standalone
    if args.report and not args.test:
        console.print("[yellow]Run tests first to generate a report.[/yellow]")
        return

    # Interactive mode (prints its own compatibility warnings after the banner)
    if args.interactive or not args.test:
        interactive_menu(config)
        return

    # CLI mode
    # Driver / CUDA compatibility check
    _print_compat_warnings(console, config["gpu_type"])

    if not check_prerequisites(console):
        sys.exit(1)

    # CLI test names use dashes; internal action names use underscores.
    test_map = {
        "gpu-info": "gpu_info",
        "health": "health",
        "nccl": "nccl",
        "stress": "stress",
        "rdma": "rdma",
        "training": "training",
    }

    if args.test == "all":
        results = _run_full_suite(config, console)
        if args.report:
            ReportGenerator(config).generate(results, fmt=args.format, output=args.output)
        has_errors = any(_has_error(v) for v in results.values())
        sys.exit(1 if has_errors else 0)

    if args.test == "benchmark":
        bench = Benchmark(config)
        if args.type == "memory":
            result = bench.run_memory_benchmark()
        elif args.type == "compute":
            result = bench.run_compute_benchmark(
                dtypes=[args.dtype] if args.dtype else None
            )
        else:
            result = bench.run()
        Benchmark.print_results(result)
        if args.report:
            ReportGenerator(config).generate(
                {"benchmark": result, "timestamp": datetime.now().isoformat()},
                fmt=args.format,
                output=args.output,
            )
    else:
        result = _run_test(test_map[args.test], config, console)
        if args.report and result:
            ReportGenerator(config).generate(
                {args.test: result, "timestamp": datetime.now().isoformat()},
                fmt=args.format,
                output=args.output,
            )

    # Match the "--test all" behaviour: a failed test yields a non-zero status.
    if _has_error(result):
        sys.exit(1)


if __name__ == "__main__":
    main()