# test_gpu_scripts/h200_tester.py

#!/usr/bin/env python3
"""H200 Training Server Test Suite - Main CLI Entry Point."""

import argparse
import json
import os
import signal
import sys
import time
from datetime import datetime
from pathlib import Path

import yaml
from rich.console import Console
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
from rich.table import Table
from rich.text import Text
from rich import box

from modules.gpu_info import GPUInfo
from modules.health_check import HealthCheck
from modules.benchmark import Benchmark
from modules.nccl_test import NCCLTest
from modules.training_sim import TrainingSim
from modules.stress_test import StressTest
from modules.rdma_test import RDMATest
from modules.report import ReportGenerator

# Built-in configuration that load_config() falls back to when
# configs/default.yaml is missing or empty. The whole mapping is passed to
# every test module; which keys each module actually reads is decided inside
# the modules themselves.
DEFAULT_CONFIG = {
    # Memory-bandwidth and GEMM compute benchmark settings.
    "benchmark": {
        "memory": {
            "size_mb": 4096,
            "iterations": 10,
            "nvbandwidth_buffer_mb": 512,
            "nvbandwidth_samples": 3,
        },
        "compute": {
            "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8"],
            "matrix_size": 4096,
            "warmup": 10,
            "iterations": 100,
        },
    },
    # Health-check thresholds (presumably degrees Celsius and watts --
    # confirm against modules.health_check).
    "health": {
        "temp_warning": 80,
        "temp_critical": 90,
        "power_limit": 700,
    },
    # NCCL collectives to exercise and the minimum acceptable bandwidth.
    "nccl": {
        "min_bandwidth_gbps": 400,
        "test_allreduce": True,
        "test_alltoall": True,
        "test_broadcast": True,
        "test_reduce_scatter": False,
        "test_allgather": False,
        "test_sendrecv": False,
    },
    # GPU stress-test settings.
    "stress": {
        "duration_sec": 60,
        "use_doubles": False,
        "use_tensor_cores": True,
        "memory_pct": 90,
        "gpus": "all",
    },
    # RDMA / InfiniBand test settings; no IB device is preselected.
    "rdma": {
        "min_bandwidth_gbps": 50,
        "max_latency_us": 10,
        "ib_iterations": 1000,
        "msg_size": 65536,
        "ib_device": None,
        "ib_port": 1,
    },
    # Training-simulation workload.
    "training": {
        "model": "gpt2",
        "batch_size": 8,
        "seq_length": 2048,
        "num_steps": 50,
        "dtype": "bf16",
    },
    # Report output location/format and external tool install directory.
    "report": {
        "output_dir": "./reports",
        "format": "json",
    },
    "tools": {
        "install_dir": "/opt/h200-test-tools",
    },
}

# Start-up banner printed by interactive_menu(). The [bold cyan] ... [/bold cyan]
# pair is rich console markup; the string is raw so its contents are taken
# literally by Python.
BANNER = r"""
[bold cyan]
╔══════════════════════════════════════════════════╗
║                                                  ║
║       H200 Training Server Test Suite            ║
║       GPU Diagnostics & Benchmarking Tool        ║
║                                                  ║
╚══════════════════════════════════════════════════╝
[/bold cyan]
"""


def load_config() -> dict:
    """Load the configuration from ``configs/default.yaml`` next to this file.

    Returns:
        The parsed YAML document when the file exists and is non-empty;
        otherwise an independent deep copy of ``DEFAULT_CONFIG``, so callers
        may mutate the result without altering the module-level defaults.
    """
    import copy

    config_path = Path(__file__).parent / "configs" / "default.yaml"
    if config_path.exists():
        with open(config_path, encoding="utf-8") as f:
            loaded = yaml.safe_load(f)
        # safe_load() yields None for an empty document; fall through to the
        # defaults in that case instead of handing out the shared global.
        if loaded:
            return loaded
    # A shallow .copy() would still share the nested section dicts.
    return copy.deepcopy(DEFAULT_CONFIG)


def check_prerequisites(console: Console) -> bool:
    """Report whether the ``nvidia-smi`` executable can be found on PATH.

    Args:
        console: Console used to print the error when the tool is missing.

    Returns:
        True when ``nvidia-smi`` is found, False otherwise.
    """
    from shutil import which

    # Guard clause: nothing to report when the driver tool is present.
    if which("nvidia-smi"):
        return True

    console.print("[bold red]ERROR: nvidia-smi not found![/bold red]")
    console.print("  Please install NVIDIA drivers first.")
    return False


def interactive_menu(config: dict):
    """Run the interactive menu loop until the user quits.

    Args:
        config: Configuration mapping passed unchanged to every test.

    Returns:
        A dict with a ``"timestamp"`` and a ``"tests"`` mapping of action name
        to the (truthy) result of each test run during the session, or None
        when the prerequisite check fails.
    """
    from rich.markup import escape

    console = Console()

    console.print(BANNER)
    if not check_prerequisites(console):
        return

    results_store: dict = {"timestamp": datetime.now().isoformat(), "tests": {}}

    menu_items = [
        ("1", "GPU Information", "gpu_info"),
        ("2", "Health Check", "health"),
        ("3", "Memory Benchmark (nvbandwidth)", "memory_bench"),
        ("4", "Compute Benchmark", "compute_bench"),
        ("5", "NCCL Multi-GPU Test", "nccl"),
        ("6", "GPU Stress Test (gpu-burn)", "stress"),
        ("7", "RDMA/IB Test", "rdma"),
        ("8", "Training Simulation", "training"),
        ("9", "Full Test Suite (All Tests)", "all"),
        ("0", "Generate Report", "report"),
    ]
    descriptions = {
        "gpu_info": "Detect GPUs, show specs & NVLink topology",
        "health": "Temperature, power, ECC errors, PCIe, DCGM",
        "memory_bench": "HBM3e bandwidth via nvbandwidth",
        "compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8",
        "nccl": "AllReduce, AllToAll, Broadcast via nccl-tests",
        "stress": "Long-running GPU stress via gpu-burn",
        "rdma": "InfiniBand bandwidth & latency (ib_write_bw)",
        "training": "Simulate LLM training with PyTorch",
        "all": "Run all tests sequentially",
        "report": "Export results to JSON/HTML",
    }
    action_map = {key: action for key, _name, action in menu_items}

    # The menu never changes, so build the table once and reprint it.
    table = Table(
        title="[bold cyan]Select a Test[/bold cyan]",
        box=box.ROUNDED,
        border_style="cyan",
        show_header=False,
        padding=(0, 2),
    )
    table.add_column("Key", style="bold yellow", width=5)
    table.add_column("Test Name")
    table.add_column("Description", style="dim")
    for key, name, action in menu_items:
        # escape() keeps the literal brackets: a bracketed lowercase word such
        # as "[q]" would otherwise be parsed as a markup tag and vanish.
        table.add_row(escape(f"[{key}]"), name, descriptions.get(action, ""))
    table.add_row(escape("[q]"), Text("Quit", style="bold red"), "Exit the program")

    while True:
        console.print()
        console.print(table)
        try:
            choice = console.input("\n[bold green]Enter choice > [/bold green]").strip().lower()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt: leave cleanly and keep the
            # results gathered so far instead of dying with a traceback.
            console.print()
            choice = "q"

        if choice == "q":
            console.print("[dim]Goodbye![/dim]")
            break

        action = action_map.get(choice)
        if action is None:
            # The input is user-controlled; escape it so stray brackets cannot
            # be interpreted as (possibly malformed) markup.
            console.print(f"[yellow]Invalid choice: {escape(choice)}[/yellow]")
            continue

        # NOTE(review): the "report" action is dispatched like a test and
        # results_store is not handed to it, so nothing is exported here.
        result = _run_test(action, config, console)
        if result:
            results_store["tests"][action] = result

    return results_store


def _run_test(test_name: str, config: dict, console: Console) -> dict:
    """Execute a single test by name and print its results.

    Args:
        test_name: One of "gpu_info", "health", "memory_bench",
            "compute_bench", "nccl", "stress", "rdma", "training", "all" or
            "report".
        config: Configuration mapping passed to the test module.
        console: Console used for status and error messages.

    Returns:
        The result produced by the test; ``{"error": ...}`` when the test
        raised or was interrupted; an empty dict for "report" and for
        unrecognised names.
    """
    from rich.markup import escape

    # Modules that share the construct / run() / print_results() protocol.
    module_classes = {
        "gpu_info": GPUInfo,
        "health": HealthCheck,
        "nccl": NCCLTest,
        "stress": StressTest,
        "rdma": RDMATest,
        "training": TrainingSim,
    }

    try:
        if test_name == "all":
            return _run_full_suite(config, console)

        if test_name == "report":
            # NOTE(review): no collected results reach this function, so the
            # branch can only tell the user there is nothing to export.
            console.print("[yellow]No test results to export. Run tests first.[/yellow]")
            return {}

        if test_name in ("memory_bench", "compute_bench"):
            bench = Benchmark(config)
            if test_name == "memory_bench":
                result = bench.run_memory_benchmark()
            else:
                result = bench.run_compute_benchmark()
            Benchmark.print_results(result)
            return result

        module_cls = module_classes.get(test_name)
        if module_cls is None:
            # Unknown name: keep the documented dict return type.
            return {}

        module = module_cls(config)
        result = module.run()
        module.print_results(result)
        return result

    except KeyboardInterrupt:
        console.print("\n[yellow]Test interrupted by user.[/yellow]")
        return {"error": "interrupted"}
    except Exception as e:
        # Exception text is arbitrary; escape it so bracketed fragments
        # (e.g. "[/dev/nvidia0]") cannot be parsed as markup and make the
        # error report itself raise.
        console.print(f"[bold red]Test failed: {escape(str(e))}[/bold red]")
        return {"error": str(e)}


def _run_full_suite(config: dict, console: Console) -> dict:
    """Run every test in sequence and print a pass/fail summary.

    A test that raises is recorded as ``{"error": str(exc)}`` and the suite
    moves on to the next test.

    Args:
        config: Configuration mapping passed to every test module.
        console: Console used for progress and summary output.

    Returns:
        A dict holding a ``"timestamp"`` plus one entry per test key.
    """
    from rich.markup import escape

    console.print(Panel("[bold cyan]Running Full Test Suite[/bold cyan]", box=box.DOUBLE))
    all_results: dict = {"timestamp": datetime.now().isoformat()}
    # (result key, display name, module class, name of the run method)
    tests = [
        ("gpu_info", "GPU Information", GPUInfo, "run"),
        ("health", "Health Check", HealthCheck, "run"),
        ("memory_bench", "Memory Benchmark", Benchmark, "run_memory_benchmark"),
        ("compute_bench", "Compute Benchmark", Benchmark, "run_compute_benchmark"),
        ("nccl", "NCCL Test", NCCLTest, "run"),
        ("stress", "GPU Stress Test", StressTest, "run"),
        ("rdma", "RDMA/IB Test", RDMATest, "run"),
        ("training", "Training Simulation", TrainingSim, "run"),
    ]
    total = len(tests)

    for i, (key, name, mod_cls, run_method) in enumerate(tests, 1):
        console.print(f"\n[bold cyan][{i}/{total}] {name}[/bold cyan]")
        try:
            mod = mod_cls(config)
            result = getattr(mod, run_method)()
            mod.print_results(result)
            all_results[key] = result
        except Exception as e:
            # Escape the message so brackets in it are not parsed as markup.
            console.print(f"[bold red]{name} FAILED: {escape(str(e))}[/bold red]")
            all_results[key] = {"error": str(e)}

    # Summary. Count only the test entries: iterating over every value in
    # all_results would also count the "timestamp" string as a passed test.
    console.print("\n" + "=" * 60)
    passed = 0
    for key, _name, _cls, _method in tests:
        outcome = all_results[key]
        if not (isinstance(outcome, dict) and "error" in outcome):
            passed += 1
    color = "green" if passed == total else ("yellow" if passed > 0 else "red")
    console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]")
    return all_results


def main():
    """Parse the command line and run interactive or single-test mode.

    With no ``--test`` (or with ``--interactive``) the interactive menu is
    started. Otherwise the selected test runs once; the process exits with
    status 1 when ``nvidia-smi`` is missing and with status 2 (argparse
    error) when the ``--config`` file cannot be read or is not a mapping.
    """
    parser = argparse.ArgumentParser(
        description="H200 Training Server Test Suite",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python h200_tester.py                        # Interactive menu
  python h200_tester.py --test gpu-info        # GPU info
  python h200_tester.py --test health           # Health check
  python h200_tester.py --test benchmark --type memory
  python h200_tester.py --test benchmark --type compute --dtype fp16
  python h200_tester.py --test nccl             # NCCL test
  python h200_tester.py --test training         # Training sim
  python h200_tester.py --test all              # Full suite
  python h200_tester.py --report --format json --output report.json
        """,
    )
    parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"],
                        help="Run a specific test")
    parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)")
    parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8"],
                        help="Compute benchmark dtype (with --test benchmark --type compute)")
    parser.add_argument("--interactive", action="store_true", help="Force interactive mode")
    parser.add_argument("--report", action="store_true", help="Generate report from last results")
    parser.add_argument("--format", choices=["json", "html"], default="json", help="Report format")
    parser.add_argument("--output", default=None, help="Report output file path")
    parser.add_argument("--config", default=None, help="Path to config YAML file")

    args = parser.parse_args()
    config = load_config()

    # An explicit --config file replaces the default configuration.
    if args.config:
        try:
            with open(args.config, encoding="utf-8") as f:
                user_config = yaml.safe_load(f)
        except (OSError, yaml.YAMLError) as exc:
            parser.error(f"cannot load config file {args.config!r}: {exc}")
        if isinstance(user_config, dict):
            config = user_config
        elif user_config is not None:
            parser.error(f"config file {args.config!r} must contain a YAML mapping at the top level")
        # An empty file parses to None: keep the configuration loaded above
        # rather than passing None to the test modules.

    console = Console()

    # Handle --report standalone
    # NOTE(review): --report/--format/--output are parsed, but nothing in this
    # function generates a report; only this hint is printed.
    if args.report and not args.test:
        console.print("[yellow]Run tests first to generate a report.[/yellow]")
        return

    # Interactive mode
    if args.interactive or not args.test:
        interactive_menu(config)
        return

    # CLI mode
    if not check_prerequisites(console):
        sys.exit(1)

    # CLI spelling -> internal test name for the tests run via _run_test().
    test_map = {
        "gpu-info": "gpu_info",
        "health": "health",
        "nccl": "nccl",
        "stress": "stress",
        "rdma": "rdma",
        "training": "training",
    }

    if args.test == "benchmark":
        bench = Benchmark(config)
        if args.type == "memory":
            result = bench.run_memory_benchmark()
        elif args.type == "compute":
            result = bench.run_compute_benchmark(dtypes=[args.dtype] if args.dtype else None)
        else:
            # No --type given: run both benchmarks.
            result = bench.run()
        Benchmark.print_results(result)
    elif args.test == "all":
        _run_full_suite(config, console)
    else:
        _run_test(test_map[args.test], config, console)


if __name__ == "__main__":
    main()