fix: generic branding, wire up report generation, fix --config flag
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
52fe96f2f5
commit
2cb776d7d5
@ -1,3 +1,6 @@
|
||||
# GPU type: auto-detect or override to h100/h200/b200/b300
|
||||
gpu_type: auto
|
||||
|
||||
benchmark:
|
||||
memory:
|
||||
size_mb: 4096
|
||||
@ -18,10 +21,10 @@ benchmark:
|
||||
health:
|
||||
temp_warning: 80
|
||||
temp_critical: 90
|
||||
power_limit: 700
|
||||
power_limit: null # null = auto-detect from GPU TDP (H100/H200: 700W, B200: 1000W, B300: 1200W)
|
||||
|
||||
nccl:
|
||||
min_bandwidth_gbps: 400
|
||||
min_bandwidth_gbps: null # null = auto-detect (40% of GPU NVLink BW)
|
||||
test_allreduce: true
|
||||
test_alltoall: true
|
||||
test_broadcast: true
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
"""H200 Training Server Test Suite - Main CLI Entry Point."""
|
||||
"""GPU Training Server Test Suite (H100/H200/B200/B300) - Main CLI Entry Point."""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
@ -26,6 +26,7 @@ from modules.training_sim import TrainingSim
|
||||
from modules.stress_test import StressTest
|
||||
from modules.rdma_test import RDMATest
|
||||
from modules.report import ReportGenerator
|
||||
from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus
|
||||
|
||||
DEFAULT_CONFIG = {
|
||||
"benchmark": {
|
||||
@ -37,9 +38,9 @@ DEFAULT_CONFIG = {
|
||||
"iterations": 100,
|
||||
},
|
||||
},
|
||||
"health": {"temp_warning": 80, "temp_critical": 90, "power_limit": 700},
|
||||
"health": {"temp_warning": 80, "temp_critical": 90, "power_limit": None},
|
||||
"nccl": {
|
||||
"min_bandwidth_gbps": 400,
|
||||
"min_bandwidth_gbps": None,
|
||||
"test_allreduce": True,
|
||||
"test_alltoall": True,
|
||||
"test_broadcast": True,
|
||||
@ -77,8 +78,9 @@ BANNER = r"""
|
||||
[bold cyan]
|
||||
╔══════════════════════════════════════════════════╗
|
||||
║ ║
|
||||
║ H200 Training Server Test Suite ║
|
||||
║ GPU Diagnostics & Benchmarking Tool ║
|
||||
║ GPU Training Server Test Suite ║
|
||||
║ Diagnostics & Benchmarking Tool ║
|
||||
║ Supports: H100 / H200 / B200 / B300 ║
|
||||
║ ║
|
||||
╚══════════════════════════════════════════════════╝
|
||||
[/bold cyan]
|
||||
@ -111,6 +113,14 @@ def interactive_menu(config: dict):
|
||||
console = Console()
|
||||
|
||||
console.print(BANNER)
|
||||
|
||||
gpu_type = detect_gpu_type()
|
||||
gpu_label = get_gpu_label(gpu_type)
|
||||
if gpu_type != "unknown":
|
||||
console.print(f"[bold green]Detected GPU: {gpu_label} ({gpu_type.upper()})[/bold green]\n")
|
||||
else:
|
||||
console.print("[yellow]GPU type could not be auto-detected. Using default thresholds.[/yellow]\n")
|
||||
|
||||
if not check_prerequisites(console):
|
||||
return
|
||||
|
||||
@ -144,7 +154,7 @@ def interactive_menu(config: dict):
|
||||
descriptions = {
|
||||
"gpu_info": "Detect GPUs, show specs & NVLink topology",
|
||||
"health": "Temperature, power, ECC errors, PCIe, DCGM",
|
||||
"memory_bench": "HBM3e bandwidth via nvbandwidth",
|
||||
"memory_bench": "HBM bandwidth via nvbandwidth",
|
||||
"compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8",
|
||||
"nccl": "AllReduce, AllToAll, Broadcast via nccl-tests",
|
||||
"stress": "Long-running GPU stress via gpu-burn",
|
||||
@ -161,6 +171,8 @@ def interactive_menu(config: dict):
|
||||
choice = console.input("\n[bold green]Enter choice > [/bold green]").strip().lower()
|
||||
|
||||
if choice == "q":
|
||||
if results_store.get("tests"):
|
||||
_save_results_prompt(results_store, config, console)
|
||||
console.print("[dim]Goodbye![/dim]")
|
||||
break
|
||||
|
||||
@ -172,11 +184,27 @@ def interactive_menu(config: dict):
|
||||
|
||||
result = _run_test(action, config, console)
|
||||
if result:
|
||||
results_store["tests"][action] = result
|
||||
if result.get("__report__"):
|
||||
if results_store.get("tests"):
|
||||
rg = ReportGenerator(config)
|
||||
rg.generate(results_store)
|
||||
else:
|
||||
console.print("[yellow]No test results to export. Run tests first.[/yellow]")
|
||||
else:
|
||||
results_store["tests"][action] = result
|
||||
|
||||
return results_store
|
||||
|
||||
|
||||
def _save_results_prompt(results_store: dict, config: dict, console: Console):
    """Offer to write a report for the collected results before quitting.

    Nothing happens when ``results_store`` has no (or an empty) ``"tests"``
    entry. Otherwise the user is prompted on ``console``; only the answer
    ``y`` (case-insensitive, surrounding whitespace ignored) triggers
    ``ReportGenerator(config).generate(results_store)``. Any other answer,
    including an empty one, skips saving.

    Args:
        results_store: Accumulated results; its ``"tests"`` entry is checked.
        config: Configuration passed to ``ReportGenerator``.
        console: Console used to display the prompt and read the answer.
    """
    if results_store.get("tests"):
        answer = console.input("[bold green]Save results before quitting? [y/N]: [/bold green]")
        if answer.strip().lower() == "y":
            ReportGenerator(config).generate(results_store)
|
||||
|
||||
|
||||
def _run_test(test_name: str, config: dict, console: Console) -> dict:
|
||||
"""Execute a single test by name."""
|
||||
try:
|
||||
@ -232,8 +260,7 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
|
||||
return _run_full_suite(config, console)
|
||||
|
||||
elif test_name == "report":
|
||||
console.print("[yellow]No test results to export. Run tests first.[/yellow]")
|
||||
return {}
|
||||
return {"__report__": True}
|
||||
|
||||
except KeyboardInterrupt:
|
||||
console.print("\n[yellow]Test interrupted by user.[/yellow]")
|
||||
@ -287,19 +314,20 @@ def _run_full_suite(config: dict, console: Console) -> dict:
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="H200 Training Server Test Suite",
|
||||
description="GPU Training Server Test Suite (H100/H200/B200/B300)",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
python h200_tester.py # Interactive menu
|
||||
python h200_tester.py --test gpu-info # GPU info
|
||||
python h200_tester.py --test health # Health check
|
||||
python h200_tester.py --test benchmark --type memory
|
||||
python h200_tester.py --test benchmark --type compute --dtype fp16
|
||||
python h200_tester.py --test nccl # NCCL test
|
||||
python h200_tester.py --test training # Training sim
|
||||
python h200_tester.py --test all # Full suite
|
||||
python h200_tester.py --report --format json --output report.json
|
||||
python h200_tester.py # Interactive menu
|
||||
python h200_tester.py --gpu-type h200 # Override GPU type
|
||||
python h200_tester.py --test gpu-info # GPU info
|
||||
python h200_tester.py --test health # Health check
|
||||
python h200_tester.py --test benchmark --type memory
|
||||
python h200_tester.py --test benchmark --type compute --dtype fp16
|
||||
python h200_tester.py --test nccl # NCCL test
|
||||
python h200_tester.py --test training # Training sim
|
||||
python h200_tester.py --test all # Full suite
|
||||
python h200_tester.py --report --format json --output report.json
|
||||
""",
|
||||
)
|
||||
parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"],
|
||||
@ -312,15 +340,23 @@ Examples:
|
||||
parser.add_argument("--format", choices=["json", "html"], default="json", help="Report format")
|
||||
parser.add_argument("--output", default=None, help="Report output file path")
|
||||
parser.add_argument("--config", default=None, help="Path to config YAML file")
|
||||
parser.add_argument("--gpu-type", choices=["auto", "h100", "h200", "b200", "b300"],
|
||||
default="auto", help="Override GPU type detection")
|
||||
|
||||
args = parser.parse_args()
|
||||
config = load_config()
|
||||
|
||||
# Override config with CLI args
|
||||
# Override config with CLI args (load before gpu_type so custom configs work)
|
||||
if args.config:
|
||||
with open(args.config) as f:
|
||||
config = yaml.safe_load(f)
|
||||
|
||||
# Set GPU type after config is finalized
|
||||
if args.gpu_type and args.gpu_type != "auto":
|
||||
config["gpu_type"] = args.gpu_type
|
||||
else:
|
||||
config["gpu_type"] = detect_gpu_type()
|
||||
|
||||
console = Console()
|
||||
|
||||
# Handle --report standalone
|
||||
@ -357,13 +393,20 @@ Examples:
|
||||
result = bench.run_compute_benchmark(dtypes=[args.dtype] if args.dtype else None)
|
||||
Benchmark.print_results(result)
|
||||
else:
|
||||
# Run both
|
||||
result = bench.run()
|
||||
Benchmark.print_results(result)
|
||||
if args.report:
|
||||
ReportGenerator(config).generate({"benchmark": result, "timestamp": datetime.now().isoformat()})
|
||||
elif args.test == "all":
|
||||
_run_full_suite(config, console)
|
||||
results = _run_full_suite(config, console)
|
||||
if args.report:
|
||||
ReportGenerator(config).generate(results)
|
||||
has_errors = any("error" in v for v in results.values() if isinstance(v, dict))
|
||||
sys.exit(1 if has_errors else 0)
|
||||
else:
|
||||
_run_test(test_map[args.test], config, console)
|
||||
result = _run_test(test_map[args.test], config, console)
|
||||
if args.report and result:
|
||||
ReportGenerator(config).generate({args.test: result, "timestamp": datetime.now().isoformat()})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
INSTALL_DIR="${H200_TOOLS_DIR:-/opt/h200-test-tools}"
|
||||
INSTALL_DIR="${GPU_TOOLS_DIR:-${H200_TOOLS_DIR:-/opt/h200-test-tools}}"
|
||||
JOBS="${MAKE_JOBS:-$(nproc)}"
|
||||
VERBOSE="${VERBOSE:-0}"
|
||||
|
||||
@ -171,7 +171,7 @@ check_rdma_tools() {
|
||||
print_summary() {
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo " H200 Test Suite - Installation Summary"
|
||||
echo " GPU Test Suite - Installation Summary"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
echo " Install directory: $INSTALL_DIR"
|
||||
@ -211,7 +211,7 @@ print_summary() {
|
||||
main() {
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo " H200 Test Suite - Dependency Installer"
|
||||
echo " GPU Test Suite - Dependency Installer"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
|
||||
|
||||
@ -14,7 +14,7 @@ HTML_TEMPLATE = """<!DOCTYPE html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>H200 Test Report - {timestamp}</title>
|
||||
<title>GPU Test Report - {timestamp}</title>
|
||||
<style>
|
||||
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
|
||||
body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, monospace;
|
||||
@ -45,7 +45,7 @@ HTML_TEMPLATE = """<!DOCTYPE html>
|
||||
</head>
|
||||
<body>
|
||||
<div class="header">
|
||||
<h1>H200 Training Server Test Report</h1>
|
||||
<h1>GPU Training Server Test Report</h1>
|
||||
<div class="meta">Generated: {timestamp} | Server: {hostname}</div>
|
||||
</div>
|
||||
{content}
|
||||
@ -67,7 +67,7 @@ class ReportGenerator:
|
||||
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
if not output:
|
||||
output = os.path.join(output_dir, f"h200_report_{timestamp}.{fmt}")
|
||||
output = os.path.join(output_dir, f"gpu_report_{timestamp}.{fmt}")
|
||||
|
||||
if fmt == "json":
|
||||
return self._generate_json(results, output)
|
||||
@ -119,9 +119,9 @@ class ReportGenerator:
|
||||
sections.append(
|
||||
f'<div class="section"><h2>Memory Bandwidth</h2>'
|
||||
f'<div class="metric"><div class="value">{mem.get("d2d_bandwidth_gbps", "N/A")} GB/s</div>'
|
||||
f'<div class="label">D2D (HBM3e)</div></div>'
|
||||
f'<div class="label">D2D (HBM)</div></div>'
|
||||
f'<div class="metric"><div class="value">{mem.get("efficiency_pct", "N/A")}%</div>'
|
||||
f'<div class="label">Efficiency vs Peak ({mem.get("peak_bandwidth_gbps", 989)} GB/s)</div></div>'
|
||||
f'<div class="label">Efficiency vs Peak ({mem.get("peak_bandwidth_gbps", "N/A")} GB/s)</div></div>'
|
||||
f'</div>'
|
||||
)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user