From 52fe96f2f5cf0ed2eebd796fa3c634334ca035e1 Mon Sep 17 00:00:00 2001
From: qinyusen <qinyusen@users.noreply.github.com>
Date: Wed, 6 May 2026 19:31:51 +0800
Subject: [PATCH] refactor: replace hardcoded H200 specs with dynamic GPU
 detection

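Benchmark, GPUInfo, HealthCheck and NCCLTest now call detect_gpu_type()
and get_gpu_specs() from modules.gpu_specs in their constructors instead
of relying on the module-level H200_SPECS table or fixed H200 numbers.
Each module also records the detected GPU type in its results dict.

Default changes when the config does not set a value:
- health.power_limit falls back to specs["tdp_watts"] (700 if absent).
- nccl.min_bandwidth_gbps falls back to 40% of
  specs["nvlink_bandwidth_gbps"] (360 when the spec key is absent; the
  previous fixed default was 400).

Note: modules/gpu_specs.py is not among the four files in this patch, so
it is expected to already exist in the tree.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)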

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
---
 modules/benchmark.py    | 36 ++++++++++++++++--------------------
 modules/gpu_info.py     | 12 ++++++++++--
 modules/health_check.py |  9 +++++++--
 modules/nccl_test.py    | 11 +++++++++--
 4 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/modules/benchmark.py b/modules/benchmark.py
index ae9830c..9f32201 100644
--- a/modules/benchmark.py
+++ b/modules/benchmark.py
@@ -11,14 +11,7 @@ from rich.console import Console
 from rich.table import Table
 from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
 
-H200_SPECS = {
-    "memory_bandwidth_gbps": 989.0,
-    "fp32_tflops": 67.0,
-    "tf32_tflops": 989.0,
-    "fp16_tflops": 989.0,
-    "bf16_tflops": 989.0,
-    "fp8_tflops": 1979.0,
-}
+from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label
 
 TORCH_AVAILABLE = False
 try:
@@ -36,6 +29,9 @@ class Benchmark:
         self.console = Console()
         self.bench_cfg = config.get("benchmark", {})
         self.tools_dir = config.get("tools", {}).get("install_dir", "/opt/h200-test-tools")
+        self.gpu_type = detect_gpu_type()
+        self.specs = get_gpu_specs(self.gpu_type)
+        self.gpu_label = get_gpu_label(self.gpu_type)
 
     def run(self) -> dict:
         results = {}
@@ -144,7 +140,7 @@ class Benchmark:
         )
         h2d_bw = results_by_test.get("host_to_device_memcpy_read_ce", 0)
         d2h_bw = results_by_test.get("device_to_host_memcpy_write_ce", 0)
-        efficiency = (d2d_bw / H200_SPECS["memory_bandwidth_gbps"]) * 100 if d2d_bw else 0
+        efficiency = (d2d_bw / self.specs["memory_bandwidth_gbps"]) * 100 if d2d_bw else 0
 
         return {
             "memory": {
@@ -152,7 +148,7 @@ class Benchmark:
                 "h2d_bandwidth_gbps": round(h2d_bw, 1),
                 "d2h_bandwidth_gbps": round(d2h_bw, 1),
                 "d2d_bandwidth_gbps": round(d2d_bw, 1),
-                "peak_bandwidth_gbps": H200_SPECS["memory_bandwidth_gbps"],
+                "peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"],
                 "efficiency_pct": round(efficiency, 1),
                 "results_by_test": results_by_test,
                 "per_gpu": per_gpu_d2d,
@@ -220,7 +216,7 @@ class Benchmark:
                 progress.advance(task)
 
         best_d2d = max(v["d2d_gbps"] for v in bandwidth_by_size.values())
-        efficiency = (best_d2d / H200_SPECS["memory_bandwidth_gbps"]) * 100
+        efficiency = (best_d2d / self.specs["memory_bandwidth_gbps"]) * 100
 
         return {
             "memory": {
@@ -228,7 +224,7 @@ class Benchmark:
                 "h2d_bandwidth_gbps": round(max(v["h2d_gbps"] for v in bandwidth_by_size.values()), 1),
                 "d2h_bandwidth_gbps": round(max(v["d2h_gbps"] for v in bandwidth_by_size.values()), 1),
                 "d2d_bandwidth_gbps": round(best_d2d, 1),
-                "peak_bandwidth_gbps": H200_SPECS["memory_bandwidth_gbps"],
+                "peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"],
                 "efficiency_pct": round(efficiency, 1),
                 "test_sizes_mb": test_sizes_mb,
                 "bandwidth_by_size": bandwidth_by_size,
@@ -251,11 +247,11 @@ class Benchmark:
         self.console.print(f"[cyan]Compute Benchmark - {gpu_count} GPU(s)[/cyan]")
 
         dtype_map = {
-            "fp32": (torch.float32, H200_SPECS["fp32_tflops"]),
-            "tf32": ("tf32", H200_SPECS["tf32_tflops"]),
-            "fp16": (torch.float16, H200_SPECS["fp16_tflops"]),
-            "bf16": (torch.bfloat16, H200_SPECS["bf16_tflops"]),
-            "fp8": (torch.float8_e4m3fn, H200_SPECS["fp8_tflops"]),
+            "fp32": (torch.float32, self.specs["fp32_tflops"]),
+            "tf32": ("tf32", self.specs["tf32_tflops"]),
+            "fp16": (torch.float16, self.specs["fp16_tflops"]),
+            "bf16": (torch.bfloat16, self.specs["bf16_tflops"]),
+            "fp8": (torch.float8_e4m3fn, self.specs["fp8_tflops"]),
         }
 
         results_by_dtype = {}
@@ -351,7 +347,7 @@ class Benchmark:
             table = Table(box=None, padding=(0, 1))
             table.add_column("Metric", style="bold")
             table.add_column("Value", justify="right")
-            table.add_column("Peak (H200)", justify="right")
+            table.add_column("Peak", justify="right")
             table.add_column("Efficiency", justify="right")
 
             for label, achieved, peak in [
@@ -385,7 +381,7 @@ class Benchmark:
                 t2.add_column("D2H (GB/s)", justify="right")
                 t2.add_column("D2D (GB/s)", justify="right")
                 for sz, vals in sorted(by_size.items(), key=lambda x: int(x[0])):
-                    peak = H200_SPECS["memory_bandwidth_gbps"]
+                    peak = mem["peak_bandwidth_gbps"]
                     d2d_eff = (vals["d2d_gbps"] / peak) * 100
                     ec = "green" if d2d_eff >= 80 else ("yellow" if d2d_eff >= 50 else "red")
                     t2.add_row(sz, f"{vals['h2d_gbps']:.1f}", f"{vals['d2h_gbps']:.1f}",
@@ -399,7 +395,7 @@ class Benchmark:
             table = Table(box=None, padding=(0, 1))
             table.add_column("DType", style="bold")
             table.add_column("Achieved (TFLOPS)", justify="right")
-            table.add_column("Peak (H200)", justify="right")
+            table.add_column("Peak", justify="right")
             table.add_column("Efficiency", justify="right")
 
             peak = comp.get("peak_tflops", {})
diff --git a/modules/gpu_info.py b/modules/gpu_info.py
index bb94e33..b7af28d 100644
--- a/modules/gpu_info.py
+++ b/modules/gpu_info.py
@@ -1,4 +1,4 @@
-"""GPU information detection module for NVIDIA H200."""
+"""GPU information detection module for NVIDIA datacenter GPUs (H100/H200/B200/B300)."""
 
 import subprocess
 import shutil
@@ -10,12 +10,17 @@ from rich.table import Table
 from rich.panel import Panel
 from rich.text import Text
 
+from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label
+
 
 class GPUInfo:
 
     def __init__(self, config: dict):
         self.config = config
         self.console = Console()
+        self.gpu_type = detect_gpu_type()
+        self.specs = get_gpu_specs(self.gpu_type)
+        self.gpu_label = get_gpu_label(self.gpu_type)
 
     def _run_smi(self, query: str, fmt: str = "csv,noheader,nounits") -> Optional[str]:
         if not shutil.which("nvidia-smi"):
@@ -116,6 +121,8 @@ class GPUInfo:
             "gpus": gpus,
             "topology": topology,
             "timestamp": datetime.now().isoformat(),
+            "detected_gpu_type": self.gpu_type,
+            "gpu_label": self.gpu_label,
         }
 
     def _get_topology(self) -> str:
@@ -139,6 +146,7 @@ class GPUInfo:
         c.print(f"  Driver Version : {results.get('driver_version', 'N/A')}")
         c.print(f"  CUDA Version   : {results.get('cuda_version', 'N/A')}")
         c.print(f"  GPU Count      : {results.get('gpu_count', 0)}")
+        c.print(f"  Detected GPU   : {results.get('gpu_label', 'Unknown')} ({results.get('detected_gpu_type', 'unknown')})")
         c.print(f"  Timestamp      : {results.get('timestamp', 'N/A')}")
 
         gpus = results.get("gpus", [])
@@ -158,7 +166,7 @@ class GPUInfo:
 
         for g in gpus:
             name = g["name"]
-            if "H200" in name:
+            if any(k in name for k in ("H100", "H200", "B200", "B300")):
                 name = f"[bold green]{name}[/bold green]"
             vram = f"{g['vram_used_mb']}/{g['vram_total_mb']} MB"
             temp = f"{g['temperature']}°C"
diff --git a/modules/health_check.py b/modules/health_check.py
index 2b0ad15..516936f 100644
--- a/modules/health_check.py
+++ b/modules/health_check.py
@@ -1,4 +1,4 @@
-"""Hardware health monitoring module for NVIDIA H200."""
+"""Hardware health monitoring module for NVIDIA datacenter GPUs (H100/H200/B200/B300)."""
 
 import subprocess
 import shutil
@@ -11,6 +11,8 @@ from rich.table import Table
 from rich.panel import Panel
 from rich.text import Text
 
+from modules.gpu_specs import detect_gpu_type, get_gpu_specs
+
 
 class HealthCheck:
 
@@ -18,6 +20,8 @@ class HealthCheck:
         self.config = config
         self.console = Console()
         self.health_cfg = config.get("health", {})
+        self.gpu_type = detect_gpu_type()
+        self.specs = get_gpu_specs(self.gpu_type)
 
     def _run_smi(self, query: str) -> Optional[str]:
         if not shutil.which("nvidia-smi"):
@@ -79,7 +83,7 @@ class HealthCheck:
 
         temp_warn = self.health_cfg.get("temp_warning", 80)
         temp_crit = self.health_cfg.get("temp_critical", 90)
-        power_lim = self.health_cfg.get("power_limit", 700)
+        power_lim = self.health_cfg.get("power_limit", self.specs.get("tdp_watts", 700))
 
         gpu_health = []
         overall_pass = True
@@ -150,6 +154,7 @@ class HealthCheck:
             "gpu_health": gpu_health,
             "system_health": system_health,
             "timestamp": datetime.now().isoformat(),
+            "detected_gpu_type": self.gpu_type,
         }
 
     def _check_system(self) -> dict:
diff --git a/modules/nccl_test.py b/modules/nccl_test.py
index 6028e06..f043b71 100644
--- a/modules/nccl_test.py
+++ b/modules/nccl_test.py
@@ -12,6 +12,8 @@ from rich.console import Console
 from rich.table import Table
 from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
 
+from modules.gpu_specs import detect_gpu_type, get_gpu_specs
+
 TORCH_AVAILABLE = False
 try:
     import torch
@@ -28,6 +30,8 @@ class NCCLTest:
         self.console = Console()
         self.nccl_cfg = config.get("nccl", {})
         self.tools_dir = config.get("tools", {}).get("install_dir", "/opt/h200-test-tools")
+        self.gpu_type = detect_gpu_type()
+        self.specs = get_gpu_specs(self.gpu_type)
 
     def _find_nccl_test(self, name: str) -> Optional[str]:
         p = shutil.which(name)
@@ -81,7 +85,8 @@ class NCCLTest:
             tests.append(("sendrecv_perf", "SendRecv"))
 
         results = {}
-        min_bw = self.nccl_cfg.get("min_bandwidth_gbps", 400)
+        default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4
+        min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw))
 
         with Progress(
             SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
@@ -109,6 +114,7 @@ class NCCLTest:
             "tests": results,
             "gpu_count": gpu_count,
             "timestamp": datetime.now().isoformat(),
+            "detected_gpu_type": self.gpu_type,
         }
 
     def _run_one_nccl_test(self, binary_name: str, label: str,
@@ -187,7 +193,8 @@ class NCCLTest:
 
     def _run_torchrun_fallback(self, gpu_count: int) -> dict:
         self.console.print("[cyan]Using torchrun fallback for NCCL test[/cyan]")
-        min_bw = self.nccl_cfg.get("min_bandwidth_gbps", 400)
+        default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4
+        min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw))
         size_mb = 64
         elements = size_mb * 1024 * 1024 // 4
         iters = 20