refactor: replace hardcoded H200 specs with dynamic GPU detection
Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
98e4977e28
commit
52fe96f2f5
@ -11,14 +11,7 @@ from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn
|
||||
|
||||
H200_SPECS = {
|
||||
"memory_bandwidth_gbps": 989.0,
|
||||
"fp32_tflops": 67.0,
|
||||
"tf32_tflops": 989.0,
|
||||
"fp16_tflops": 989.0,
|
||||
"bf16_tflops": 989.0,
|
||||
"fp8_tflops": 1979.0,
|
||||
}
|
||||
from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label
|
||||
|
||||
TORCH_AVAILABLE = False
|
||||
try:
|
||||
@ -36,6 +29,9 @@ class Benchmark:
|
||||
self.console = Console()
|
||||
self.bench_cfg = config.get("benchmark", {})
|
||||
self.tools_dir = config.get("tools", {}).get("install_dir", "/opt/h200-test-tools")
|
||||
self.gpu_type = detect_gpu_type()
|
||||
self.specs = get_gpu_specs(self.gpu_type)
|
||||
self.gpu_label = get_gpu_label(self.gpu_type)
|
||||
|
||||
def run(self) -> dict:
|
||||
results = {}
|
||||
@ -144,7 +140,7 @@ class Benchmark:
|
||||
)
|
||||
h2d_bw = results_by_test.get("host_to_device_memcpy_read_ce", 0)
|
||||
d2h_bw = results_by_test.get("device_to_host_memcpy_write_ce", 0)
|
||||
efficiency = (d2d_bw / H200_SPECS["memory_bandwidth_gbps"]) * 100 if d2d_bw else 0
|
||||
efficiency = (d2d_bw / self.specs["memory_bandwidth_gbps"]) * 100 if d2d_bw else 0
|
||||
|
||||
return {
|
||||
"memory": {
|
||||
@ -152,7 +148,7 @@ class Benchmark:
|
||||
"h2d_bandwidth_gbps": round(h2d_bw, 1),
|
||||
"d2h_bandwidth_gbps": round(d2h_bw, 1),
|
||||
"d2d_bandwidth_gbps": round(d2d_bw, 1),
|
||||
"peak_bandwidth_gbps": H200_SPECS["memory_bandwidth_gbps"],
|
||||
"peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"],
|
||||
"efficiency_pct": round(efficiency, 1),
|
||||
"results_by_test": results_by_test,
|
||||
"per_gpu": per_gpu_d2d,
|
||||
@ -220,7 +216,7 @@ class Benchmark:
|
||||
progress.advance(task)
|
||||
|
||||
best_d2d = max(v["d2d_gbps"] for v in bandwidth_by_size.values())
|
||||
efficiency = (best_d2d / H200_SPECS["memory_bandwidth_gbps"]) * 100
|
||||
efficiency = (best_d2d / self.specs["memory_bandwidth_gbps"]) * 100
|
||||
|
||||
return {
|
||||
"memory": {
|
||||
@ -228,7 +224,7 @@ class Benchmark:
|
||||
"h2d_bandwidth_gbps": round(max(v["h2d_gbps"] for v in bandwidth_by_size.values()), 1),
|
||||
"d2h_bandwidth_gbps": round(max(v["d2h_gbps"] for v in bandwidth_by_size.values()), 1),
|
||||
"d2d_bandwidth_gbps": round(best_d2d, 1),
|
||||
"peak_bandwidth_gbps": H200_SPECS["memory_bandwidth_gbps"],
|
||||
"peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"],
|
||||
"efficiency_pct": round(efficiency, 1),
|
||||
"test_sizes_mb": test_sizes_mb,
|
||||
"bandwidth_by_size": bandwidth_by_size,
|
||||
@ -251,11 +247,11 @@ class Benchmark:
|
||||
self.console.print(f"[cyan]Compute Benchmark - {gpu_count} GPU(s)[/cyan]")
|
||||
|
||||
dtype_map = {
|
||||
"fp32": (torch.float32, H200_SPECS["fp32_tflops"]),
|
||||
"tf32": ("tf32", H200_SPECS["tf32_tflops"]),
|
||||
"fp16": (torch.float16, H200_SPECS["fp16_tflops"]),
|
||||
"bf16": (torch.bfloat16, H200_SPECS["bf16_tflops"]),
|
||||
"fp8": (torch.float8_e4m3fn, H200_SPECS["fp8_tflops"]),
|
||||
"fp32": (torch.float32, self.specs["fp32_tflops"]),
|
||||
"tf32": ("tf32", self.specs["tf32_tflops"]),
|
||||
"fp16": (torch.float16, self.specs["fp16_tflops"]),
|
||||
"bf16": (torch.bfloat16, self.specs["bf16_tflops"]),
|
||||
"fp8": (torch.float8_e4m3fn, self.specs["fp8_tflops"]),
|
||||
}
|
||||
|
||||
results_by_dtype = {}
|
||||
@ -351,7 +347,7 @@ class Benchmark:
|
||||
table = Table(box=None, padding=(0, 1))
|
||||
table.add_column("Metric", style="bold")
|
||||
table.add_column("Value", justify="right")
|
||||
table.add_column("Peak (H200)", justify="right")
|
||||
table.add_column("Peak", justify="right")
|
||||
table.add_column("Efficiency", justify="right")
|
||||
|
||||
for label, achieved, peak in [
|
||||
@ -385,7 +381,7 @@ class Benchmark:
|
||||
t2.add_column("D2H (GB/s)", justify="right")
|
||||
t2.add_column("D2D (GB/s)", justify="right")
|
||||
for sz, vals in sorted(by_size.items(), key=lambda x: int(x[0])):
|
||||
peak = H200_SPECS["memory_bandwidth_gbps"]
|
||||
peak = mem["peak_bandwidth_gbps"]
|
||||
d2d_eff = (vals["d2d_gbps"] / peak) * 100
|
||||
ec = "green" if d2d_eff >= 80 else ("yellow" if d2d_eff >= 50 else "red")
|
||||
t2.add_row(sz, f"{vals['h2d_gbps']:.1f}", f"{vals['d2h_gbps']:.1f}",
|
||||
@ -399,7 +395,7 @@ class Benchmark:
|
||||
table = Table(box=None, padding=(0, 1))
|
||||
table.add_column("DType", style="bold")
|
||||
table.add_column("Achieved (TFLOPS)", justify="right")
|
||||
table.add_column("Peak (H200)", justify="right")
|
||||
table.add_column("Peak", justify="right")
|
||||
table.add_column("Efficiency", justify="right")
|
||||
|
||||
peak = comp.get("peak_tflops", {})
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
"""GPU information detection module for NVIDIA H200."""
|
||||
"""GPU information detection module for NVIDIA datacenter GPUs (H100/H200/B200/B300)."""
|
||||
|
||||
import subprocess
|
||||
import shutil
|
||||
@ -10,12 +10,17 @@ from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
from rich.text import Text
|
||||
|
||||
from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label
|
||||
|
||||
|
||||
class GPUInfo:
|
||||
|
||||
def __init__(self, config: dict):
|
||||
self.config = config
|
||||
self.console = Console()
|
||||
self.gpu_type = detect_gpu_type()
|
||||
self.specs = get_gpu_specs(self.gpu_type)
|
||||
self.gpu_label = get_gpu_label(self.gpu_type)
|
||||
|
||||
def _run_smi(self, query: str, fmt: str = "csv,noheader,nounits") -> Optional[str]:
|
||||
if not shutil.which("nvidia-smi"):
|
||||
@ -116,6 +121,8 @@ class GPUInfo:
|
||||
"gpus": gpus,
|
||||
"topology": topology,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"detected_gpu_type": self.gpu_type,
|
||||
"gpu_label": self.gpu_label,
|
||||
}
|
||||
|
||||
def _get_topology(self) -> str:
|
||||
@ -139,6 +146,7 @@ class GPUInfo:
|
||||
c.print(f" Driver Version : {results.get('driver_version', 'N/A')}")
|
||||
c.print(f" CUDA Version : {results.get('cuda_version', 'N/A')}")
|
||||
c.print(f" GPU Count : {results.get('gpu_count', 0)}")
|
||||
c.print(f" Detected GPU : {results.get('gpu_label', 'Unknown')} ({results.get('detected_gpu_type', 'unknown')})")
|
||||
c.print(f" Timestamp : {results.get('timestamp', 'N/A')}")
|
||||
|
||||
gpus = results.get("gpus", [])
|
||||
@ -158,7 +166,7 @@ class GPUInfo:
|
||||
|
||||
for g in gpus:
|
||||
name = g["name"]
|
||||
if "H200" in name:
|
||||
if any(k in name for k in ("H100", "H200", "B200", "B300")):
|
||||
name = f"[bold green]{name}[/bold green]"
|
||||
vram = f"{g['vram_used_mb']}/{g['vram_total_mb']} MB"
|
||||
temp = f"{g['temperature']}°C"
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
"""Hardware health monitoring module for NVIDIA H200."""
|
||||
"""Hardware health monitoring module for NVIDIA datacenter GPUs (H100/H200/B200/B300)."""
|
||||
|
||||
import subprocess
|
||||
import shutil
|
||||
@ -11,6 +11,8 @@ from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
from rich.text import Text
|
||||
|
||||
from modules.gpu_specs import detect_gpu_type, get_gpu_specs
|
||||
|
||||
|
||||
class HealthCheck:
|
||||
|
||||
@ -18,6 +20,8 @@ class HealthCheck:
|
||||
self.config = config
|
||||
self.console = Console()
|
||||
self.health_cfg = config.get("health", {})
|
||||
self.gpu_type = detect_gpu_type()
|
||||
self.specs = get_gpu_specs(self.gpu_type)
|
||||
|
||||
def _run_smi(self, query: str) -> Optional[str]:
|
||||
if not shutil.which("nvidia-smi"):
|
||||
@ -79,7 +83,7 @@ class HealthCheck:
|
||||
|
||||
temp_warn = self.health_cfg.get("temp_warning", 80)
|
||||
temp_crit = self.health_cfg.get("temp_critical", 90)
|
||||
power_lim = self.health_cfg.get("power_limit", 700)
|
||||
power_lim = self.health_cfg.get("power_limit", self.specs.get("tdp_watts", 700))
|
||||
|
||||
gpu_health = []
|
||||
overall_pass = True
|
||||
@ -150,6 +154,7 @@ class HealthCheck:
|
||||
"gpu_health": gpu_health,
|
||||
"system_health": system_health,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"detected_gpu_type": self.gpu_type,
|
||||
}
|
||||
|
||||
def _check_system(self) -> dict:
|
||||
|
||||
@ -12,6 +12,8 @@ from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
|
||||
|
||||
from modules.gpu_specs import detect_gpu_type, get_gpu_specs
|
||||
|
||||
TORCH_AVAILABLE = False
|
||||
try:
|
||||
import torch
|
||||
@ -28,6 +30,8 @@ class NCCLTest:
|
||||
self.console = Console()
|
||||
self.nccl_cfg = config.get("nccl", {})
|
||||
self.tools_dir = config.get("tools", {}).get("install_dir", "/opt/h200-test-tools")
|
||||
self.gpu_type = detect_gpu_type()
|
||||
self.specs = get_gpu_specs(self.gpu_type)
|
||||
|
||||
def _find_nccl_test(self, name: str) -> Optional[str]:
|
||||
p = shutil.which(name)
|
||||
@ -81,7 +85,8 @@ class NCCLTest:
|
||||
tests.append(("sendrecv_perf", "SendRecv"))
|
||||
|
||||
results = {}
|
||||
min_bw = self.nccl_cfg.get("min_bandwidth_gbps", 400)
|
||||
default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4
|
||||
min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw))
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
|
||||
@ -109,6 +114,7 @@ class NCCLTest:
|
||||
"tests": results,
|
||||
"gpu_count": gpu_count,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"detected_gpu_type": self.gpu_type,
|
||||
}
|
||||
|
||||
def _run_one_nccl_test(self, binary_name: str, label: str,
|
||||
@ -187,7 +193,8 @@ class NCCLTest:
|
||||
|
||||
def _run_torchrun_fallback(self, gpu_count: int) -> dict:
|
||||
self.console.print("[cyan]Using torchrun fallback for NCCL test[/cyan]")
|
||||
min_bw = self.nccl_cfg.get("min_bandwidth_gbps", 400)
|
||||
default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4
|
||||
min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw))
|
||||
size_mb = 64
|
||||
elements = size_mb * 1024 * 1024 // 4
|
||||
iters = 20
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user