refactor: replace hardcoded H200 specs with dynamic GPU detection

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
qinyusen 2026-05-06 19:31:51 +08:00
parent 98e4977e28
commit 52fe96f2f5
4 changed files with 42 additions and 26 deletions

View File

@ -11,14 +11,7 @@ from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
H200_SPECS = {
"memory_bandwidth_gbps": 989.0,
"fp32_tflops": 67.0,
"tf32_tflops": 989.0,
"fp16_tflops": 989.0,
"bf16_tflops": 989.0,
"fp8_tflops": 1979.0,
}
from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label
TORCH_AVAILABLE = False
try:
@ -36,6 +29,9 @@ class Benchmark:
self.console = Console()
self.bench_cfg = config.get("benchmark", {})
self.tools_dir = config.get("tools", {}).get("install_dir", "/opt/h200-test-tools")
self.gpu_type = detect_gpu_type()
self.specs = get_gpu_specs(self.gpu_type)
self.gpu_label = get_gpu_label(self.gpu_type)
def run(self) -> dict:
results = {}
@ -144,7 +140,7 @@ class Benchmark:
)
h2d_bw = results_by_test.get("host_to_device_memcpy_read_ce", 0)
d2h_bw = results_by_test.get("device_to_host_memcpy_write_ce", 0)
efficiency = (d2d_bw / H200_SPECS["memory_bandwidth_gbps"]) * 100 if d2d_bw else 0
efficiency = (d2d_bw / self.specs["memory_bandwidth_gbps"]) * 100 if d2d_bw else 0
return {
"memory": {
@ -152,7 +148,7 @@ class Benchmark:
"h2d_bandwidth_gbps": round(h2d_bw, 1),
"d2h_bandwidth_gbps": round(d2h_bw, 1),
"d2d_bandwidth_gbps": round(d2d_bw, 1),
"peak_bandwidth_gbps": H200_SPECS["memory_bandwidth_gbps"],
"peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"],
"efficiency_pct": round(efficiency, 1),
"results_by_test": results_by_test,
"per_gpu": per_gpu_d2d,
@ -220,7 +216,7 @@ class Benchmark:
progress.advance(task)
best_d2d = max(v["d2d_gbps"] for v in bandwidth_by_size.values())
efficiency = (best_d2d / H200_SPECS["memory_bandwidth_gbps"]) * 100
efficiency = (best_d2d / self.specs["memory_bandwidth_gbps"]) * 100
return {
"memory": {
@ -228,7 +224,7 @@ class Benchmark:
"h2d_bandwidth_gbps": round(max(v["h2d_gbps"] for v in bandwidth_by_size.values()), 1),
"d2h_bandwidth_gbps": round(max(v["d2h_gbps"] for v in bandwidth_by_size.values()), 1),
"d2d_bandwidth_gbps": round(best_d2d, 1),
"peak_bandwidth_gbps": H200_SPECS["memory_bandwidth_gbps"],
"peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"],
"efficiency_pct": round(efficiency, 1),
"test_sizes_mb": test_sizes_mb,
"bandwidth_by_size": bandwidth_by_size,
@ -251,11 +247,11 @@ class Benchmark:
self.console.print(f"[cyan]Compute Benchmark - {gpu_count} GPU(s)[/cyan]")
dtype_map = {
"fp32": (torch.float32, H200_SPECS["fp32_tflops"]),
"tf32": ("tf32", H200_SPECS["tf32_tflops"]),
"fp16": (torch.float16, H200_SPECS["fp16_tflops"]),
"bf16": (torch.bfloat16, H200_SPECS["bf16_tflops"]),
"fp8": (torch.float8_e4m3fn, H200_SPECS["fp8_tflops"]),
"fp32": (torch.float32, self.specs["fp32_tflops"]),
"tf32": ("tf32", self.specs["tf32_tflops"]),
"fp16": (torch.float16, self.specs["fp16_tflops"]),
"bf16": (torch.bfloat16, self.specs["bf16_tflops"]),
"fp8": (torch.float8_e4m3fn, self.specs["fp8_tflops"]),
}
results_by_dtype = {}
@ -351,7 +347,7 @@ class Benchmark:
table = Table(box=None, padding=(0, 1))
table.add_column("Metric", style="bold")
table.add_column("Value", justify="right")
table.add_column("Peak (H200)", justify="right")
table.add_column("Peak", justify="right")
table.add_column("Efficiency", justify="right")
for label, achieved, peak in [
@ -385,7 +381,7 @@ class Benchmark:
t2.add_column("D2H (GB/s)", justify="right")
t2.add_column("D2D (GB/s)", justify="right")
for sz, vals in sorted(by_size.items(), key=lambda x: int(x[0])):
peak = H200_SPECS["memory_bandwidth_gbps"]
peak = mem["peak_bandwidth_gbps"]
d2d_eff = (vals["d2d_gbps"] / peak) * 100
ec = "green" if d2d_eff >= 80 else ("yellow" if d2d_eff >= 50 else "red")
t2.add_row(sz, f"{vals['h2d_gbps']:.1f}", f"{vals['d2h_gbps']:.1f}",
@ -399,7 +395,7 @@ class Benchmark:
table = Table(box=None, padding=(0, 1))
table.add_column("DType", style="bold")
table.add_column("Achieved (TFLOPS)", justify="right")
table.add_column("Peak (H200)", justify="right")
table.add_column("Peak", justify="right")
table.add_column("Efficiency", justify="right")
peak = comp.get("peak_tflops", {})

View File

@ -1,4 +1,4 @@
"""GPU information detection module for NVIDIA H200."""
"""GPU information detection module for NVIDIA datacenter GPUs (H100/H200/B200/B300)."""
import subprocess
import shutil
@ -10,12 +10,17 @@ from rich.table import Table
from rich.panel import Panel
from rich.text import Text
from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label
class GPUInfo:
# Set up the GPU info collector: keep the caller's config, create a rich
# Console for output, and resolve the installed GPU model once up front via
# the helpers imported from modules.gpu_specs.
def __init__(self, config: dict):
self.config = config
self.console = Console()
# gpu_type and gpu_label are echoed back in the results dict under the
# "detected_gpu_type" and "gpu_label" keys; specs is whatever
# get_gpu_specs() returns for that type.
# NOTE(review): the behavior of these helpers for an unrecognized GPU is
# defined in modules.gpu_specs and is not visible here — confirm the fallback.
self.gpu_type = detect_gpu_type()
self.specs = get_gpu_specs(self.gpu_type)
self.gpu_label = get_gpu_label(self.gpu_type)
def _run_smi(self, query: str, fmt: str = "csv,noheader,nounits") -> Optional[str]:
if not shutil.which("nvidia-smi"):
@ -116,6 +121,8 @@ class GPUInfo:
"gpus": gpus,
"topology": topology,
"timestamp": datetime.now().isoformat(),
"detected_gpu_type": self.gpu_type,
"gpu_label": self.gpu_label,
}
def _get_topology(self) -> str:
@ -139,6 +146,7 @@ class GPUInfo:
c.print(f" Driver Version : {results.get('driver_version', 'N/A')}")
c.print(f" CUDA Version : {results.get('cuda_version', 'N/A')}")
c.print(f" GPU Count : {results.get('gpu_count', 0)}")
c.print(f" Detected GPU : {results.get('gpu_label', 'Unknown')} ({results.get('detected_gpu_type', 'unknown')})")
c.print(f" Timestamp : {results.get('timestamp', 'N/A')}")
gpus = results.get("gpus", [])
@ -158,7 +166,7 @@ class GPUInfo:
for g in gpus:
name = g["name"]
if "H200" in name:
if any(k in name for k in ("H100", "H200", "B200", "B300")):
name = f"[bold green]{name}[/bold green]"
vram = f"{g['vram_used_mb']}/{g['vram_total_mb']} MB"
temp = f"{g['temperature']}°C"

View File

@ -1,4 +1,4 @@
"""Hardware health monitoring module for NVIDIA H200."""
"""Hardware health monitoring module for NVIDIA datacenter GPUs (H100/H200/B200/B300)."""
import subprocess
import shutil
@ -11,6 +11,8 @@ from rich.table import Table
from rich.panel import Panel
from rich.text import Text
from modules.gpu_specs import detect_gpu_type, get_gpu_specs
class HealthCheck:
@ -18,6 +20,8 @@ class HealthCheck:
self.config = config
self.console = Console()
self.health_cfg = config.get("health", {})
self.gpu_type = detect_gpu_type()
self.specs = get_gpu_specs(self.gpu_type)
def _run_smi(self, query: str) -> Optional[str]:
if not shutil.which("nvidia-smi"):
@ -79,7 +83,7 @@ class HealthCheck:
temp_warn = self.health_cfg.get("temp_warning", 80)
temp_crit = self.health_cfg.get("temp_critical", 90)
power_lim = self.health_cfg.get("power_limit", 700)
power_lim = self.health_cfg.get("power_limit", self.specs.get("tdp_watts", 700))
gpu_health = []
overall_pass = True
@ -150,6 +154,7 @@ class HealthCheck:
"gpu_health": gpu_health,
"system_health": system_health,
"timestamp": datetime.now().isoformat(),
"detected_gpu_type": self.gpu_type,
}
def _check_system(self) -> dict:

View File

@ -12,6 +12,8 @@ from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
from modules.gpu_specs import detect_gpu_type, get_gpu_specs
TORCH_AVAILABLE = False
try:
import torch
@ -28,6 +30,8 @@ class NCCLTest:
self.console = Console()
self.nccl_cfg = config.get("nccl", {})
self.tools_dir = config.get("tools", {}).get("install_dir", "/opt/h200-test-tools")
self.gpu_type = detect_gpu_type()
self.specs = get_gpu_specs(self.gpu_type)
def _find_nccl_test(self, name: str) -> Optional[str]:
p = shutil.which(name)
@ -81,7 +85,8 @@ class NCCLTest:
tests.append(("sendrecv_perf", "SendRecv"))
results = {}
min_bw = self.nccl_cfg.get("min_bandwidth_gbps", 400)
default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4
min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw))
with Progress(
SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
@ -109,6 +114,7 @@ class NCCLTest:
"tests": results,
"gpu_count": gpu_count,
"timestamp": datetime.now().isoformat(),
"detected_gpu_type": self.gpu_type,
}
def _run_one_nccl_test(self, binary_name: str, label: str,
@ -187,7 +193,8 @@ class NCCLTest:
def _run_torchrun_fallback(self, gpu_count: int) -> dict:
self.console.print("[cyan]Using torchrun fallback for NCCL test[/cyan]")
min_bw = self.nccl_cfg.get("min_bandwidth_gbps", 400)
default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4
min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw))
size_mb = 64
elements = size_mb * 1024 * 1024 // 4
iters = 20