From 52fe96f2f5cf0ed2eebd796fa3c634334ca035e1 Mon Sep 17 00:00:00 2001 From: qinyusen Date: Wed, 6 May 2026 19:31:51 +0800 Subject: [PATCH] refactor: replace hardcoded H200 specs with dynamic GPU detection Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus --- modules/benchmark.py | 36 ++++++++++++++++-------------------- modules/gpu_info.py | 12 ++++++++++-- modules/health_check.py | 9 +++++++-- modules/nccl_test.py | 11 +++++++++-- 4 files changed, 42 insertions(+), 26 deletions(-) diff --git a/modules/benchmark.py b/modules/benchmark.py index ae9830c..9f32201 100644 --- a/modules/benchmark.py +++ b/modules/benchmark.py @@ -11,14 +11,7 @@ from rich.console import Console from rich.table import Table from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn -H200_SPECS = { - "memory_bandwidth_gbps": 989.0, - "fp32_tflops": 67.0, - "tf32_tflops": 989.0, - "fp16_tflops": 989.0, - "bf16_tflops": 989.0, - "fp8_tflops": 1979.0, -} +from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label TORCH_AVAILABLE = False try: @@ -36,6 +29,9 @@ class Benchmark: self.console = Console() self.bench_cfg = config.get("benchmark", {}) self.tools_dir = config.get("tools", {}).get("install_dir", "/opt/h200-test-tools") + self.gpu_type = detect_gpu_type() + self.specs = get_gpu_specs(self.gpu_type) + self.gpu_label = get_gpu_label(self.gpu_type) def run(self) -> dict: results = {} @@ -144,7 +140,7 @@ class Benchmark: ) h2d_bw = results_by_test.get("host_to_device_memcpy_read_ce", 0) d2h_bw = results_by_test.get("device_to_host_memcpy_write_ce", 0) - efficiency = (d2d_bw / H200_SPECS["memory_bandwidth_gbps"]) * 100 if d2d_bw else 0 + efficiency = (d2d_bw / self.specs["memory_bandwidth_gbps"]) * 100 if d2d_bw else 0 return { "memory": { @@ -152,7 +148,7 @@ class Benchmark: "h2d_bandwidth_gbps": round(h2d_bw, 1), "d2h_bandwidth_gbps": round(d2h_bw, 1), "d2d_bandwidth_gbps": round(d2d_bw, 1), - "peak_bandwidth_gbps": H200_SPECS["memory_bandwidth_gbps"], + "peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"], "efficiency_pct": round(efficiency, 1), "results_by_test": results_by_test, "per_gpu": per_gpu_d2d, @@ -220,7 +216,7 @@ class Benchmark: progress.advance(task) best_d2d = max(v["d2d_gbps"] for v in bandwidth_by_size.values()) - efficiency = (best_d2d / H200_SPECS["memory_bandwidth_gbps"]) * 100 + efficiency = (best_d2d / self.specs["memory_bandwidth_gbps"]) * 100 return { "memory": { @@ -228,7 +224,7 @@ class Benchmark: "h2d_bandwidth_gbps": round(max(v["h2d_gbps"] for v in bandwidth_by_size.values()), 1), "d2h_bandwidth_gbps": round(max(v["d2h_gbps"] for v in bandwidth_by_size.values()), 1), "d2d_bandwidth_gbps": round(best_d2d, 1), - "peak_bandwidth_gbps": H200_SPECS["memory_bandwidth_gbps"], + "peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"], "efficiency_pct": round(efficiency, 1), "test_sizes_mb": test_sizes_mb, "bandwidth_by_size": bandwidth_by_size, @@ -251,11 +247,11 @@ class Benchmark: self.console.print(f"[cyan]Compute Benchmark - {gpu_count} GPU(s)[/cyan]") dtype_map = { - "fp32": (torch.float32, H200_SPECS["fp32_tflops"]), - "tf32": ("tf32", H200_SPECS["tf32_tflops"]), - "fp16": (torch.float16, H200_SPECS["fp16_tflops"]), - "bf16": (torch.bfloat16, H200_SPECS["bf16_tflops"]), - "fp8": (torch.float8_e4m3fn, H200_SPECS["fp8_tflops"]), + "fp32": (torch.float32, self.specs["fp32_tflops"]), + "tf32": ("tf32", self.specs["tf32_tflops"]), + "fp16": (torch.float16, self.specs["fp16_tflops"]), + "bf16": (torch.bfloat16, self.specs["bf16_tflops"]), + "fp8": (torch.float8_e4m3fn, self.specs["fp8_tflops"]), } results_by_dtype = {} @@ -351,7 +347,7 @@ class Benchmark: table = Table(box=None, padding=(0, 1)) table.add_column("Metric", style="bold") table.add_column("Value", justify="right") - table.add_column("Peak (H200)", justify="right") + table.add_column("Peak", justify="right") table.add_column("Efficiency", justify="right") for label, achieved, peak in [ @@ -385,7 +381,7 @@ class Benchmark: t2.add_column("D2H (GB/s)", justify="right") t2.add_column("D2D (GB/s)", justify="right") for sz, vals in sorted(by_size.items(), key=lambda x: int(x[0])): - peak = H200_SPECS["memory_bandwidth_gbps"] + peak = mem["peak_bandwidth_gbps"] d2d_eff = (vals["d2d_gbps"] / peak) * 100 ec = "green" if d2d_eff >= 80 else ("yellow" if d2d_eff >= 50 else "red") t2.add_row(sz, f"{vals['h2d_gbps']:.1f}", f"{vals['d2h_gbps']:.1f}", @@ -399,7 +395,7 @@ class Benchmark: table = Table(box=None, padding=(0, 1)) table.add_column("DType", style="bold") table.add_column("Achieved (TFLOPS)", justify="right") - table.add_column("Peak (H200)", justify="right") + table.add_column("Peak", justify="right") table.add_column("Efficiency", justify="right") peak = comp.get("peak_tflops", {}) diff --git a/modules/gpu_info.py b/modules/gpu_info.py index bb94e33..b7af28d 100644 --- a/modules/gpu_info.py +++ b/modules/gpu_info.py @@ -1,4 +1,4 @@ -"""GPU information detection module for NVIDIA H200.""" +"""GPU information detection module for NVIDIA datacenter GPUs (H100/H200/B200/B300).""" import subprocess import shutil @@ -10,12 +10,17 @@ from rich.table import Table from rich.panel import Panel from rich.text import Text +from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label + class GPUInfo: def __init__(self, config: dict): self.config = config self.console = Console() + self.gpu_type = detect_gpu_type() + self.specs = get_gpu_specs(self.gpu_type) + self.gpu_label = get_gpu_label(self.gpu_type) def _run_smi(self, query: str, fmt: str = "csv,noheader,nounits") -> Optional[str]: if not shutil.which("nvidia-smi"): @@ -116,6 +121,8 @@ class GPUInfo: "gpus": gpus, "topology": topology, "timestamp": datetime.now().isoformat(), + "detected_gpu_type": self.gpu_type, + "gpu_label": self.gpu_label, } def _get_topology(self) -> str: @@ -139,6 +146,7 @@ class GPUInfo: c.print(f" Driver Version : {results.get('driver_version', 'N/A')}") c.print(f" CUDA Version : {results.get('cuda_version', 'N/A')}") c.print(f" GPU Count : {results.get('gpu_count', 0)}") + c.print(f" Detected GPU : {results.get('gpu_label', 'Unknown')} ({results.get('detected_gpu_type', 'unknown')})") c.print(f" Timestamp : {results.get('timestamp', 'N/A')}") gpus = results.get("gpus", []) @@ -158,7 +166,7 @@ class GPUInfo: for g in gpus: name = g["name"] - if "H200" in name: + if any(k in name for k in ("H100", "H200", "B200", "B300")): name = f"[bold green]{name}[/bold green]" vram = f"{g['vram_used_mb']}/{g['vram_total_mb']} MB" temp = f"{g['temperature']}°C" diff --git a/modules/health_check.py b/modules/health_check.py index 2b0ad15..516936f 100644 --- a/modules/health_check.py +++ b/modules/health_check.py @@ -1,4 +1,4 @@ -"""Hardware health monitoring module for NVIDIA H200.""" +"""Hardware health monitoring module for NVIDIA datacenter GPUs (H100/H200/B200/B300).""" import subprocess import shutil @@ -11,6 +11,8 @@ from rich.table import Table from rich.panel import Panel from rich.text import Text +from modules.gpu_specs import detect_gpu_type, get_gpu_specs + class HealthCheck: @@ -18,6 +20,8 @@ class HealthCheck: self.config = config self.console = Console() self.health_cfg = config.get("health", {}) + self.gpu_type = detect_gpu_type() + self.specs = get_gpu_specs(self.gpu_type) def _run_smi(self, query: str) -> Optional[str]: if not shutil.which("nvidia-smi"): @@ -79,7 +83,7 @@ class HealthCheck: temp_warn = self.health_cfg.get("temp_warning", 80) temp_crit = self.health_cfg.get("temp_critical", 90) - power_lim = self.health_cfg.get("power_limit", 700) + power_lim = self.health_cfg.get("power_limit", self.specs.get("tdp_watts", 700)) gpu_health = [] overall_pass = True @@ -150,6 +154,7 @@ class HealthCheck: "gpu_health": gpu_health, "system_health": system_health, "timestamp": datetime.now().isoformat(), + "detected_gpu_type": self.gpu_type, } def _check_system(self) -> dict: diff --git a/modules/nccl_test.py b/modules/nccl_test.py index 6028e06..f043b71 100644 --- a/modules/nccl_test.py +++ b/modules/nccl_test.py @@ -12,6 +12,8 @@ from rich.console import Console from rich.table import Table from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn +from modules.gpu_specs import detect_gpu_type, get_gpu_specs + TORCH_AVAILABLE = False try: import torch @@ -28,6 +30,8 @@ class NCCLTest: self.console = Console() self.nccl_cfg = config.get("nccl", {}) self.tools_dir = config.get("tools", {}).get("install_dir", "/opt/h200-test-tools") + self.gpu_type = detect_gpu_type() + self.specs = get_gpu_specs(self.gpu_type) def _find_nccl_test(self, name: str) -> Optional[str]: p = shutil.which(name) @@ -81,7 +85,8 @@ class NCCLTest: tests.append(("sendrecv_perf", "SendRecv")) results = {} - min_bw = self.nccl_cfg.get("min_bandwidth_gbps", 400) + default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4 + min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw)) with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), @@ -109,6 +114,7 @@ class NCCLTest: "tests": results, "gpu_count": gpu_count, "timestamp": datetime.now().isoformat(), + "detected_gpu_type": self.gpu_type, } def _run_one_nccl_test(self, binary_name: str, label: str, @@ -187,7 +193,8 @@ class NCCLTest: def _run_torchrun_fallback(self, gpu_count: int) -> dict: self.console.print("[cyan]Using torchrun fallback for NCCL test[/cyan]") - min_bw = self.nccl_cfg.get("min_bandwidth_gbps", 400) + default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4 + min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw)) size_mb = 64 elements = size_mb * 1024 * 1024 // 4 iters = 20