refactor: replace hardcoded H200 specs with dynamic GPU detection

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-05-06 19:31:51 +08:00 · 2026-05-06 19:31:51 +08:00 · 52fe96f2f5
commit 52fe96f2f5
parent 98e4977e28
4 changed files with 42 additions and 26 deletions
--- a/modules/benchmark.py
+++ b/modules/benchmark.py
@@ -11,14 +11,7 @@ from rich.console import Console
 from rich.table import Table
 from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn

-H200_SPECS = {
-    "memory_bandwidth_gbps": 989.0,
-    "fp32_tflops": 67.0,
-    "tf32_tflops": 989.0,
-    "fp16_tflops": 989.0,
-    "bf16_tflops": 989.0,
-    "fp8_tflops": 1979.0,
-}
+from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label

 TORCH_AVAILABLE = False
 try:
@@ -36,6 +29,9 @@ class Benchmark:
        self.console = Console()
        self.bench_cfg = config.get("benchmark", {})
        self.tools_dir = config.get("tools", {}).get("install_dir", "/opt/h200-test-tools")
+        self.gpu_type = detect_gpu_type()
+        self.specs = get_gpu_specs(self.gpu_type)
+        self.gpu_label = get_gpu_label(self.gpu_type)

    def run(self) -> dict:
        results = {}
@@ -144,7 +140,7 @@ class Benchmark:
        )
        h2d_bw = results_by_test.get("host_to_device_memcpy_read_ce", 0)
        d2h_bw = results_by_test.get("device_to_host_memcpy_write_ce", 0)
-        efficiency = (d2d_bw / H200_SPECS["memory_bandwidth_gbps"]) * 100 if d2d_bw else 0
+        efficiency = (d2d_bw / self.specs["memory_bandwidth_gbps"]) * 100 if d2d_bw else 0

        return {
            "memory": {
@@ -152,7 +148,7 @@ class Benchmark:
                "h2d_bandwidth_gbps": round(h2d_bw, 1),
                "d2h_bandwidth_gbps": round(d2h_bw, 1),
                "d2d_bandwidth_gbps": round(d2d_bw, 1),
-                "peak_bandwidth_gbps": H200_SPECS["memory_bandwidth_gbps"],
+                "peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"],
                "efficiency_pct": round(efficiency, 1),
                "results_by_test": results_by_test,
                "per_gpu": per_gpu_d2d,
@@ -220,7 +216,7 @@ class Benchmark:
                progress.advance(task)

        best_d2d = max(v["d2d_gbps"] for v in bandwidth_by_size.values())
-        efficiency = (best_d2d / H200_SPECS["memory_bandwidth_gbps"]) * 100
+        efficiency = (best_d2d / self.specs["memory_bandwidth_gbps"]) * 100

        return {
            "memory": {
@@ -228,7 +224,7 @@ class Benchmark:
                "h2d_bandwidth_gbps": round(max(v["h2d_gbps"] for v in bandwidth_by_size.values()), 1),
                "d2h_bandwidth_gbps": round(max(v["d2h_gbps"] for v in bandwidth_by_size.values()), 1),
                "d2d_bandwidth_gbps": round(best_d2d, 1),
-                "peak_bandwidth_gbps": H200_SPECS["memory_bandwidth_gbps"],
+                "peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"],
                "efficiency_pct": round(efficiency, 1),
                "test_sizes_mb": test_sizes_mb,
                "bandwidth_by_size": bandwidth_by_size,
@@ -251,11 +247,11 @@ class Benchmark:
        self.console.print(f"[cyan]Compute Benchmark - {gpu_count} GPU(s)[/cyan]")

        dtype_map = {
-            "fp32": (torch.float32, H200_SPECS["fp32_tflops"]),
-            "tf32": ("tf32", H200_SPECS["tf32_tflops"]),
-            "fp16": (torch.float16, H200_SPECS["fp16_tflops"]),
-            "bf16": (torch.bfloat16, H200_SPECS["bf16_tflops"]),
-            "fp8": (torch.float8_e4m3fn, H200_SPECS["fp8_tflops"]),
+            "fp32": (torch.float32, self.specs["fp32_tflops"]),
+            "tf32": ("tf32", self.specs["tf32_tflops"]),
+            "fp16": (torch.float16, self.specs["fp16_tflops"]),
+            "bf16": (torch.bfloat16, self.specs["bf16_tflops"]),
+            "fp8": (torch.float8_e4m3fn, self.specs["fp8_tflops"]),
        }

        results_by_dtype = {}
@@ -351,7 +347,7 @@ class Benchmark:
            table = Table(box=None, padding=(0, 1))
            table.add_column("Metric", style="bold")
            table.add_column("Value", justify="right")
-            table.add_column("Peak (H200)", justify="right")
+            table.add_column("Peak", justify="right")
            table.add_column("Efficiency", justify="right")

            for label, achieved, peak in [
@@ -385,7 +381,7 @@ class Benchmark:
                t2.add_column("D2H (GB/s)", justify="right")
                t2.add_column("D2D (GB/s)", justify="right")
                for sz, vals in sorted(by_size.items(), key=lambda x: int(x[0])):
-                    peak = H200_SPECS["memory_bandwidth_gbps"]
+                    peak = mem["peak_bandwidth_gbps"]
                    d2d_eff = (vals["d2d_gbps"] / peak) * 100
                    ec = "green" if d2d_eff >= 80 else ("yellow" if d2d_eff >= 50 else "red")
                    t2.add_row(sz, f"{vals['h2d_gbps']:.1f}", f"{vals['d2h_gbps']:.1f}",
@@ -399,7 +395,7 @@ class Benchmark:
            table = Table(box=None, padding=(0, 1))
            table.add_column("DType", style="bold")
            table.add_column("Achieved (TFLOPS)", justify="right")
-            table.add_column("Peak (H200)", justify="right")
+            table.add_column("Peak", justify="right")
            table.add_column("Efficiency", justify="right")

            peak = comp.get("peak_tflops", {})
--- a/modules/gpu_info.py
+++ b/modules/gpu_info.py
@@ -1,4 +1,4 @@
-"""GPU information detection module for NVIDIA H200."""
+"""GPU information detection module for NVIDIA datacenter GPUs (H100/H200/B200/B300)."""

 import subprocess
 import shutil
@@ -10,12 +10,17 @@ from rich.table import Table
 from rich.panel import Panel
 from rich.text import Text

+from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label
+

 class GPUInfo:

    def __init__(self, config: dict):
        self.config = config
        self.console = Console()
+        self.gpu_type = detect_gpu_type()
+        self.specs = get_gpu_specs(self.gpu_type)
+        self.gpu_label = get_gpu_label(self.gpu_type)

    def _run_smi(self, query: str, fmt: str = "csv,noheader,nounits") -> Optional[str]:
        if not shutil.which("nvidia-smi"):
@@ -116,6 +121,8 @@ class GPUInfo:
            "gpus": gpus,
            "topology": topology,
            "timestamp": datetime.now().isoformat(),
+            "detected_gpu_type": self.gpu_type,
+            "gpu_label": self.gpu_label,
        }

    def _get_topology(self) -> str:
@@ -139,6 +146,7 @@ class GPUInfo:
        c.print(f"  Driver Version : {results.get('driver_version', 'N/A')}")
        c.print(f"  CUDA Version   : {results.get('cuda_version', 'N/A')}")
        c.print(f"  GPU Count      : {results.get('gpu_count', 0)}")
+        c.print(f"  Detected GPU   : {results.get('gpu_label', 'Unknown')} ({results.get('detected_gpu_type', 'unknown')})")
        c.print(f"  Timestamp      : {results.get('timestamp', 'N/A')}")

        gpus = results.get("gpus", [])
@@ -158,7 +166,7 @@ class GPUInfo:

        for g in gpus:
            name = g["name"]
-            if "H200" in name:
+            if any(k in name for k in ("H100", "H200", "B200", "B300")):
                name = f"[bold green]{name}[/bold green]"
            vram = f"{g['vram_used_mb']}/{g['vram_total_mb']} MB"
            temp = f"{g['temperature']}°C"
--- a/modules/health_check.py
+++ b/modules/health_check.py
@@ -1,4 +1,4 @@
-"""Hardware health monitoring module for NVIDIA H200."""
+"""Hardware health monitoring module for NVIDIA datacenter GPUs (H100/H200/B200/B300)."""

 import subprocess
 import shutil
@@ -11,6 +11,8 @@ from rich.table import Table
 from rich.panel import Panel
 from rich.text import Text

+from modules.gpu_specs import detect_gpu_type, get_gpu_specs
+

 class HealthCheck:

@@ -18,6 +20,8 @@ class HealthCheck:
        self.config = config
        self.console = Console()
        self.health_cfg = config.get("health", {})
+        self.gpu_type = detect_gpu_type()
+        self.specs = get_gpu_specs(self.gpu_type)

    def _run_smi(self, query: str) -> Optional[str]:
        if not shutil.which("nvidia-smi"):
@@ -79,7 +83,7 @@ class HealthCheck:

        temp_warn = self.health_cfg.get("temp_warning", 80)
        temp_crit = self.health_cfg.get("temp_critical", 90)
-        power_lim = self.health_cfg.get("power_limit", 700)
+        power_lim = self.health_cfg.get("power_limit", self.specs.get("tdp_watts", 700))

        gpu_health = []
        overall_pass = True
@@ -150,6 +154,7 @@ class HealthCheck:
            "gpu_health": gpu_health,
            "system_health": system_health,
            "timestamp": datetime.now().isoformat(),
+            "detected_gpu_type": self.gpu_type,
        }

    def _check_system(self) -> dict:
--- a/modules/nccl_test.py
+++ b/modules/nccl_test.py
@@ -12,6 +12,8 @@ from rich.console import Console
 from rich.table import Table
 from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn

+from modules.gpu_specs import detect_gpu_type, get_gpu_specs
+
 TORCH_AVAILABLE = False
 try:
    import torch
@@ -28,6 +30,8 @@ class NCCLTest:
        self.console = Console()
        self.nccl_cfg = config.get("nccl", {})
        self.tools_dir = config.get("tools", {}).get("install_dir", "/opt/h200-test-tools")
+        self.gpu_type = detect_gpu_type()
+        self.specs = get_gpu_specs(self.gpu_type)

    def _find_nccl_test(self, name: str) -> Optional[str]:
        p = shutil.which(name)
@@ -81,7 +85,8 @@ class NCCLTest:
            tests.append(("sendrecv_perf", "SendRecv"))

        results = {}
-        min_bw = self.nccl_cfg.get("min_bandwidth_gbps", 400)
+        default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4
+        min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw))

        with Progress(
            SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
@@ -109,6 +114,7 @@ class NCCLTest:
            "tests": results,
            "gpu_count": gpu_count,
            "timestamp": datetime.now().isoformat(),
+            "detected_gpu_type": self.gpu_type,
        }

    def _run_one_nccl_test(self, binary_name: str, label: str,
@@ -187,7 +193,8 @@ class NCCLTest:

    def _run_torchrun_fallback(self, gpu_count: int) -> dict:
        self.console.print("[cyan]Using torchrun fallback for NCCL test[/cyan]")
-        min_bw = self.nccl_cfg.get("min_bandwidth_gbps", 400)
+        default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4
+        min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw))
        size_mb = 64
        elements = size_mb * 1024 * 1024 // 4
        iters = 20