Key changes: - stress_test: use torch.cuda.mem_get_info() for free memory instead of total, allocate 40% to avoid OOM when other processes occupy GPU memory - benchmark: fix D2D efficiency by comparing to NVLink per-direction bandwidth (not HBM), add H2D/D2H efficiency against PCIe peak - nccl_test: implement direct binary → mpirun → torchrun fallback chain, fix min_bw None bug when YAML value is empty - report: update memory section to use per-metric peak fields - install_deps.sh: add NCCL compatibility detection, enhance CUDA version detection with CUDA_HOME/standard paths, improve _map_cuda_tag logging - gpu_info: parse CUDA version from nvidia-smi header (query field removed in newer drivers) - health_check: parse throttle_reasons bitmask properly, ignore gpu_idle bit - gpu_tester: fix suite summary to exclude metadata keys from pass count 🤖 Generated with [Qoder](https://qoder.com)
432 lines
16 KiB
Python
432 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""GPU Training Server Test Suite (A100/A800/H100/H200/B200/B300) - Main CLI Entry Point."""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import signal
|
|
import sys
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
from rich.console import Console
|
|
from rich.panel import Panel
|
|
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
|
|
from rich.table import Table
|
|
from rich.text import Text
|
|
from rich import box
|
|
|
|
from modules.gpu_info import GPUInfo
|
|
from modules.health_check import HealthCheck
|
|
from modules.benchmark import Benchmark
|
|
from modules.nccl_test import NCCLTest
|
|
from modules.training_sim import TrainingSim
|
|
from modules.stress_test import StressTest
|
|
from modules.rdma_test import RDMATest
|
|
from modules.report import ReportGenerator
|
|
from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility
|
|
|
|
# Built-in fallback configuration, used by load_config() when
# configs/default.yaml is missing or empty. Settings are grouped into one
# section per test area, plus report output and tool install location.
DEFAULT_CONFIG = {
    "benchmark": {
        "memory": {
            "size_mb": 4096,
            "iterations": 10,
            "nvbandwidth_buffer_mb": 512,
            "nvbandwidth_samples": 3,
        },
        "compute": {
            "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8"],
            "matrix_size": 4096,
            "warmup": 10,
            "iterations": 100,
        },
    },
    "health": {
        "temp_warning": 80,
        "temp_critical": 90,
        "power_limit": None,
    },
    "nccl": {
        "min_bandwidth_gbps": None,
        # Collectives enabled by default.
        "test_allreduce": True,
        "test_alltoall": True,
        "test_broadcast": True,
        # Collectives disabled by default.
        "test_reduce_scatter": False,
        "test_allgather": False,
        "test_sendrecv": False,
    },
    "stress": {
        "duration_sec": 60,
        "use_doubles": False,
        "use_tensor_cores": True,
        "memory_pct": 90,
        "gpus": "all",
    },
    "rdma": {
        "min_bandwidth_gbps": 50,
        "max_latency_us": 10,
        "ib_iterations": 1000,
        "msg_size": 65536,
        "ib_device": None,
        "ib_port": 1,
    },
    "training": {
        "model": "gpt2",
        "batch_size": 8,
        "seq_length": 2048,
        "num_steps": 50,
        "dtype": "bf16",
    },
    "report": {
        "output_dir": "./reports",
        "format": "json",
    },
    "tools": {
        "install_dir": "/opt/gpu-test-tools",
    },
}
|
|
|
|
def _build_banner(rows, inner_width=58):
    """Render *rows* as a double-line box wrapped in rich ``bold cyan`` markup.

    Every box line is generated from the same ``inner_width``, so the top
    border, the text rows and the bottom border always line up. (The previous
    hand-written literal had a top border four characters shorter than the
    bottom one.)

    Args:
        rows: Text lines to show inside the box; each is centred.
        inner_width: Number of character cells between the two side borders.

    Returns:
        The banner string, starting and ending with a newline.
    """
    top = "╔" + "═" * inner_width + "╗"
    bottom = "╚" + "═" * inner_width + "╝"
    body = ["║" + row.center(inner_width) + "║" for row in rows]
    return "\n".join(["", "[bold cyan]", top, *body, bottom, "[/bold cyan]", ""])


# Start-up banner printed by the interactive menu.
BANNER = _build_banner(
    [
        "",
        "GPU Training Server Test Suite",
        "Diagnostics & Benchmarking Tool",
        "Supports: A100 / A800 / H100 / H200 / B200 / B300",
        "",
    ]
)
|
|
|
|
|
|
def load_config() -> dict:
    """Load the suite configuration from ``configs/default.yaml``.

    The file is looked up next to this script. The built-in defaults are
    returned as an independent deep copy, so callers that later mutate the
    configuration (for example by setting ``config["gpu_type"]``) never
    modify the module-level ``DEFAULT_CONFIG``.

    Returns:
        The mapping parsed from the YAML file, or a deep copy of
        ``DEFAULT_CONFIG`` when the file is missing, empty, or does not
        contain a mapping at its top level.
    """
    import copy  # local import keeps this fix independent of the file header

    config_path = Path(__file__).parent / "configs" / "default.yaml"
    if config_path.exists():
        with open(config_path, encoding="utf-8") as f:
            loaded = yaml.safe_load(f)
        # Only a non-empty mapping is usable as a configuration.
        if isinstance(loaded, dict) and loaded:
            return loaded
    return copy.deepcopy(DEFAULT_CONFIG)
|
|
|
|
|
|
def check_prerequisites(console: Console) -> bool:
    """Verify that the external tools the suite relies on are installed.

    Args:
        console: Console used to report a missing tool.

    Returns:
        True when ``nvidia-smi`` is found on PATH; otherwise an error is
        printed and False is returned.
    """
    from shutil import which

    if which("nvidia-smi"):
        return True

    console.print("[bold red]ERROR: nvidia-smi not found![/bold red]")
    console.print(" Please install NVIDIA drivers first.")
    return False
|
|
|
|
|
|
def interactive_menu(config: dict):
    """Run the interactive menu loop until the user quits.

    Prints the banner, reports the GPU type and any driver/CUDA compatibility
    warnings, then repeatedly shows the test menu, runs the selected test and
    stores its result under the test's action key.

    Args:
        config: Suite configuration. A ``"gpu_type"`` entry other than
            ``"auto"`` (for example one set from ``--gpu-type``) is used
            as-is; otherwise the GPU type is auto-detected.

    Returns:
        The results store ``{"timestamp": ..., "tests": {action: result}}``
        once the user quits or stdin is closed, or ``None`` when the
        prerequisite check fails.
    """
    console = Console()
    console.print(BANNER)

    # Respect a GPU type the caller already resolved instead of silently
    # re-detecting and discarding an explicit override.
    configured_type = config.get("gpu_type")
    if configured_type and configured_type != "auto":
        gpu_type = configured_type
    else:
        gpu_type = detect_gpu_type()
    gpu_label = get_gpu_label(gpu_type)
    if gpu_type != "unknown":
        console.print(f"[bold green]Detected GPU: {gpu_label} ({gpu_type.upper()})[/bold green]\n")
    else:
        console.print("[yellow]GPU type could not be auto-detected. Using default thresholds.[/yellow]\n")

    # Driver / CUDA compatibility check
    for w in validate_driver_compatibility(gpu_type):
        console.print(f"[bold yellow]\u26a0 {w}[/bold yellow]")

    if not check_prerequisites(console):
        return

    results_store: dict = {"timestamp": datetime.now().isoformat(), "tests": {}}

    # (menu key, display name, action passed to _run_test)
    menu_items = [
        ("1", "GPU Information", "gpu_info"),
        ("2", "Health Check", "health"),
        ("3", "Memory Benchmark (nvbandwidth)", "memory_bench"),
        ("4", "Compute Benchmark", "compute_bench"),
        ("5", "NCCL Multi-GPU Test", "nccl"),
        ("6", "GPU Stress Test (gpu-burn)", "stress"),
        ("7", "RDMA/IB Test", "rdma"),
        ("8", "Training Simulation", "training"),
        ("9", "Full Test Suite (All Tests)", "all"),
        ("0", "Generate Report", "report"),
    ]
    descriptions = {
        "gpu_info": "Detect GPUs, show specs & NVLink topology",
        "health": "Temperature, power, ECC errors, PCIe, DCGM",
        "memory_bench": "HBM bandwidth via nvbandwidth",
        "compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8",
        "nccl": "AllReduce, AllToAll, Broadcast via nccl-tests",
        "stress": "Long-running GPU stress via gpu-burn",
        "rdma": "InfiniBand bandwidth & latency (ib_write_bw)",
        "training": "Simulate LLM training with PyTorch",
        "all": "Run all tests sequentially",
        "report": "Export results to JSON/HTML",
    }
    action_map = {key: action for key, _name, action in menu_items}

    # The menu never changes between iterations, so build the table once.
    table = Table(
        title="[bold cyan]Select a Test[/bold cyan]",
        box=box.ROUNDED,
        border_style="cyan",
        show_header=False,
        padding=(0, 2),
    )
    table.add_column("Key", style="bold yellow", width=5)
    table.add_column("Test Name")
    table.add_column("Description", style="dim")
    for key, name, action in menu_items:
        table.add_row(f"[{key}]", name, descriptions.get(action, ""))
    table.add_row("[q]", Text("Quit", style="bold red"), "Exit the program")

    while True:
        console.print()
        console.print(table)
        try:
            choice = console.input("\n[bold green]Enter choice > [/bold green]").strip().lower()
        except EOFError:
            # stdin was closed (piped or non-interactive run): no further
            # question can be asked, so leave the loop without prompting.
            console.print("[dim]Goodbye![/dim]")
            break

        if choice == "q":
            if results_store.get("tests"):
                _save_results_prompt(results_store, config, console)
            console.print("[dim]Goodbye![/dim]")
            break

        action = action_map.get(choice)
        if action is None:
            console.print(f"[yellow]Invalid choice: {choice}[/yellow]")
            continue

        result = _run_test(action, config, console)
        if not result:
            continue

        if result.get("__report__"):
            # "Generate Report" is a pseudo-test: export what has been collected so far.
            if results_store.get("tests"):
                rg = ReportGenerator(config)
                rg.generate(results_store)
            else:
                console.print("[yellow]No test results to export. Run tests first.[/yellow]")
        else:
            results_store["tests"][action] = result

    return results_store
|
|
|
|
|
|
def _save_results_prompt(results_store: dict, config: dict, console: Console):
|
|
if not results_store.get("tests"):
|
|
return
|
|
save = console.input("[bold green]Save results before quitting? [y/N]: [/bold green]").strip().lower()
|
|
if save == "y":
|
|
rg = ReportGenerator(config)
|
|
rg.generate(results_store)
|
|
|
|
|
|
def _run_test(test_name: str, config: dict, console: Console) -> dict:
|
|
"""Execute a single test by name."""
|
|
try:
|
|
if test_name == "gpu_info":
|
|
m = GPUInfo(config)
|
|
result = m.run()
|
|
m.print_results(result)
|
|
return result
|
|
|
|
elif test_name == "health":
|
|
m = HealthCheck(config)
|
|
result = m.run()
|
|
m.print_results(result)
|
|
return result
|
|
|
|
elif test_name == "memory_bench":
|
|
m = Benchmark(config)
|
|
result = m.run_memory_benchmark()
|
|
Benchmark.print_results(result)
|
|
return result
|
|
|
|
elif test_name == "compute_bench":
|
|
m = Benchmark(config)
|
|
result = m.run_compute_benchmark()
|
|
Benchmark.print_results(result)
|
|
return result
|
|
|
|
elif test_name == "nccl":
|
|
m = NCCLTest(config)
|
|
result = m.run()
|
|
m.print_results(result)
|
|
return result
|
|
|
|
elif test_name == "stress":
|
|
m = StressTest(config)
|
|
result = m.run()
|
|
m.print_results(result)
|
|
return result
|
|
|
|
elif test_name == "rdma":
|
|
m = RDMATest(config)
|
|
result = m.run()
|
|
m.print_results(result)
|
|
return result
|
|
|
|
elif test_name == "training":
|
|
m = TrainingSim(config)
|
|
result = m.run()
|
|
m.print_results(result)
|
|
return result
|
|
|
|
elif test_name == "all":
|
|
return _run_full_suite(config, console)
|
|
|
|
elif test_name == "report":
|
|
return {"__report__": True}
|
|
|
|
except KeyboardInterrupt:
|
|
console.print("\n[yellow]Test interrupted by user.[/yellow]")
|
|
return {"error": "interrupted"}
|
|
except Exception as e:
|
|
console.print(f"[bold red]Test failed: {e}[/bold red]")
|
|
return {"error": str(e)}
|
|
|
|
|
|
def _run_full_suite(config: dict, console: Console) -> dict:
    """Run every test in a fixed order and print a pass/fail summary.

    A test that raises is recorded as ``{"error": <message>}`` and the suite
    continues with the next test.

    Args:
        config: Suite configuration passed to every test module.
        console: Console used for progress and summary output.

    Returns:
        Dict holding a ``"timestamp"`` entry plus one entry per test key.
    """
    console.print(Panel("[bold cyan]Running Full Test Suite[/bold cyan]", box=box.DOUBLE))
    all_results: dict = {"timestamp": datetime.now().isoformat()}

    # (result key, display name, module class, name of the method that runs it)
    plan = [
        ("gpu_info", "GPU Information", GPUInfo, "run"),
        ("health", "Health Check", HealthCheck, "run"),
        ("memory_bench", "Memory Benchmark", Benchmark, "run_memory_benchmark"),
        ("compute_bench", "Compute Benchmark", Benchmark, "run_compute_benchmark"),
        ("nccl", "NCCL Test", NCCLTest, "run"),
        ("stress", "GPU Stress Test", StressTest, "run"),
        ("rdma", "RDMA/IB Test", RDMATest, "run"),
        ("training", "Training Simulation", TrainingSim, "run"),
    ]
    step_count = len(plan)

    for step, (key, title, factory, runner_name) in enumerate(plan, start=1):
        console.print(f"\n[bold cyan][{step}/{step_count}] {title}[/bold cyan]")
        try:
            module = factory(config)
            outcome = getattr(module, runner_name)()
            module.print_results(outcome)
            all_results[key] = outcome
        except Exception as exc:
            console.print(f"[bold red]{title} FAILED: {exc}[/bold red]")
            all_results[key] = {"error": str(exc)}

    # Summary
    console.print("\n" + "=" * 60)
    # Count test results only; metadata such as the timestamp is excluded.
    passed = 0
    total = 0
    for key, value in all_results.items():
        if key == "timestamp":
            continue
        total += 1
        failed = isinstance(value, dict) and "error" in value
        if not failed:
            passed += 1

    if passed == total:
        color = "green"
    elif passed > 0:
        color = "yellow"
    else:
        color = "red"
    console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]")
    return all_results
|
|
|
|
|
|
def main():
    """Parse the command line and run the interactive menu or the requested test.

    Behaviour:
        * Without ``--test`` (or with ``--interactive``) the interactive menu runs.
        * ``--report`` without ``--test`` only prints a hint, since there are
          no results to export.
        * With ``--test`` the selected test runs once; ``--report`` then
          exports its result in ``--format`` to ``--output``.

    Exit status:
        Non-zero when prerequisites are missing, when the ``--config`` file
        cannot be used, or when any executed test reports an ``"error"``.
    """
    parser = argparse.ArgumentParser(
        description="GPU Training Server Test Suite (A100/A800/H100/H200/B200/B300)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python gpu_tester.py                                  # Interactive menu
  python gpu_tester.py --gpu-type a800                  # Override GPU type
  python gpu_tester.py --test gpu-info                  # GPU info
  python gpu_tester.py --test health                    # Health check
  python gpu_tester.py --test benchmark --type memory
  python gpu_tester.py --test benchmark --type compute --dtype fp16
  python gpu_tester.py --test nccl                      # NCCL test
  python gpu_tester.py --test training                  # Training sim
  python gpu_tester.py --test all                       # Full suite
  python gpu_tester.py --report --format json --output report.json
""",
    )
    parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"],
                        help="Run a specific test")
    parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)")
    parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8"],
                        help="Compute benchmark dtype (with --test benchmark --type compute)")
    parser.add_argument("--interactive", action="store_true", help="Force interactive mode")
    parser.add_argument("--report", action="store_true", help="Generate report from last results")
    parser.add_argument("--format", choices=["json", "html", "md"], default="json", help="Report format")
    parser.add_argument("--output", default=None, help="Report output file path")
    parser.add_argument("--config", default=None, help="Path to config YAML file")
    parser.add_argument(
        "--gpu-type",
        choices=["auto", "a100", "a800", "h100", "h200", "b200", "b300"],
        default="auto",
        help="Override GPU type detection",
    )

    args = parser.parse_args()
    config = load_config()

    # Override config with CLI args (load before gpu_type so custom configs work)
    if args.config:
        try:
            with open(args.config, encoding="utf-8") as f:
                custom = yaml.safe_load(f)
        except (OSError, yaml.YAMLError) as exc:
            # parser.error() prints the usage line and exits with status 2.
            parser.error(f"cannot load config file {args.config!r}: {exc}")
        if custom is not None:
            if not isinstance(custom, dict):
                parser.error(f"config file {args.config!r} must contain a YAML mapping at the top level")
            config = custom
        # An empty file parses to None; the configuration loaded above is kept
        # in that case rather than crashing on the assignment below.

    # Set GPU type after config is finalized
    if args.gpu_type and args.gpu_type != "auto":
        config["gpu_type"] = args.gpu_type
    else:
        config["gpu_type"] = detect_gpu_type()

    console = Console()

    # Driver / CUDA compatibility check
    for w in validate_driver_compatibility(config["gpu_type"]):
        console.print(f"[bold yellow]\u26a0 {w}[/bold yellow]")

    # Handle --report standalone
    if args.report and not args.test:
        console.print("[yellow]Run tests first to generate a report.[/yellow]")
        return

    # Interactive mode
    if args.interactive or not args.test:
        interactive_menu(config)
        return

    # CLI mode
    if not check_prerequisites(console):
        sys.exit(1)

    if args.test == "all":
        results = _run_full_suite(config, console)
        if args.report:
            ReportGenerator(config).generate(results, fmt=args.format, output=args.output)
        has_errors = any("error" in v for v in results.values() if isinstance(v, dict))
        sys.exit(1 if has_errors else 0)

    if args.test == "benchmark":
        bench = Benchmark(config)
        if args.type == "memory":
            result = bench.run_memory_benchmark()
        elif args.type == "compute":
            result = bench.run_compute_benchmark(dtypes=[args.dtype] if args.dtype else None)
        else:
            result = bench.run()
        Benchmark.print_results(result)
        report_key = "benchmark"
        write_report = args.report
    else:
        # CLI test names use dashes; _run_test expects the internal names.
        test_map = {
            "gpu-info": "gpu_info",
            "health": "health",
            "nccl": "nccl",
            "stress": "stress",
            "rdma": "rdma",
            "training": "training",
        }
        result = _run_test(test_map[args.test], config, console)
        report_key = args.test
        write_report = args.report and result

    if write_report:
        ReportGenerator(config).generate({report_key: result, "timestamp": datetime.now().isoformat()},
                                         fmt=args.format, output=args.output)

    # Match "--test all": a run that reported an error must not exit with status 0.
    if isinstance(result, dict) and "error" in result:
        sys.exit(1)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|