test_gpu_scripts/modules/gpu_specs.py
zulifeng dd77a882f1 feat: 跨机 RDMA 并入 rdma_test.py + H800 算力门槛对齐 H100
- modules/rdma_test.py: 新增 SSH 编排的跨机 RDMA(run_cross_node /
  _cross_node_perftest / 解析器),从 client 端逐设备拉起对端 perftest
  server 跑本地 client,替代已删除的 scripts/rdma_cross_node.sh;两机
  4×NDR400 实测全 PASS(~387-392 Gb/s,~2 µs)。
- configs/default.yaml: 新增 rdma.cross_node 配置块(默认 enabled:false)。
- modules/gpu_specs.py: H800 PASS 门槛对齐 H100 实测地板
  (tf32 400->385, bf16 720->730, fp8 1400->1200);H800=H100 硅片,
  PyTorch tensorwise fp8 天花板 ~1310,原 1400 不可达。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 19:38:43 +08:00

381 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""GPU specifications database for NVIDIA datacenter GPUs."""
import os
import shutil
import subprocess
from typing import List, Optional
# Product-name substrings (as they appear in the upper-cased nvidia-smi name)
# mapped to the internal spec key. Each key is simply the lower-cased pattern.
# Some patterns are substrings of others ("H20" is contained in "H200"), so the
# more specific pattern is listed before the shorter one.
GPU_NAME_PATTERNS = {
    pattern: pattern.lower()
    for pattern in (
        "A100",
        "A800",
        "H100",
        "H800",  # H800 = H100 SXM with NVLink halved (400 GB/s) and FP64 restricted
        "H200",
        "H20",  # H20 / H20-3e is the China-compliance export variant, REDUCED peaks
        "B200",
        "B300",
    )
}
# Specs database — ALL values are DENSE (non-sparse) TFLOPS.
# Keys are the internal GPU type keys (the values of GPU_NAME_PATTERNS).
# Bandwidth fields named *_gbps hold GB/s (gigabytes per second).
GPU_SPECS = {
    "h100": {
        # Peaks below are NVIDIA marketing dense peaks (theoretical Tensor Core max).
        # `compute_pass_thresholds_tflops` carries the absolute PASS thresholds used
        # by report.py — decoupled from peaks so marketing-spec changes (dense vs
        # sparse vs FP8-sparsity) don't shift the validation bar.
        "full_name": "NVIDIA H100 SXM5",
        "architecture": "Hopper",
        "compute_capability": 9.0,
        "hbm_capacity_gb": 80,
        "hbm_type": "HBM3",
        # GB/s (3.35 TB/s) — H100 SXM5 datasheet value; the h800 entry documents
        # its 3350 as "same as H100 SXM", so the two must agree.
        "memory_bandwidth_gbps": 3350,
        "fp32_tflops": 67,
        "tf32_tflops": 495,  # dense (989 sparse)
        "fp16_tflops": 990,  # dense (1979 sparse w/ 2:4)
        "bf16_tflops": 990,  # dense
        "fp8_tflops": 1979,  # dense
        "compute_pass_thresholds_tflops": {
            # Recalibrated 2026-05-25 to the H100 eager-cuBLAS achievable floor (each
            # threshold ~2-4% below the sustained value measured across 16 GPUs via the
            # MAMF shape sweep: fp32 ~52 / tf32 ~405 / fp16 ~732-748 / bf16 ~747-758 /
            # fp8 ~1248-1271). The old marketing/MAMF-derived values (fp32 54, tf32 444,
            # fp16 734, bf16 745, fp8 1400) sat ON or ABOVE what PyTorch cuBLAS reaches
            # on H100, so healthy cards flaked to WARN/FAIL. fp8 1400 in particular was
            # an H200/rowwise-scaling figure; H100 tensorwise _scaled_mm tops out ~1310.
            "fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
            # FP64 63 / INT8 1536 — listed for documentation; benchmark module
            # doesn't currently exercise these dtypes.
        },
        "tdp_watts": 700,
        "nvlink_gen": 4,
        "nvlink_bandwidth_gbps": 900,  # bidirectional
        "pcie_gen": 5,
        "min_driver_version": "535",
        "min_cuda_version": "12.1",
    },
    "h200": {
        "full_name": "NVIDIA H200 SXM",
        "architecture": "Hopper",
        "compute_capability": 9.0,
        "hbm_capacity_gb": 141,
        "hbm_type": "HBM3e",
        # GB/s (4.8 TB/s). Do not confuse with 989, which is the sparse TF32
        # TFLOPS figure, not a bandwidth.
        "memory_bandwidth_gbps": 4800,
        "fp32_tflops": 67,
        "tf32_tflops": 495,  # dense
        "fp16_tflops": 990,  # dense
        "bf16_tflops": 990,  # dense
        "fp8_tflops": 1979,  # dense
        # PASS thresholds aligned with H200_production_acceptance.md v2 (2026-05-21):
        # calibrated against Semianalysis & stas00 MAMF — H200 shares H100 SMs so
        # achievable TFLOPS in PyTorch is in the same band.
        "compute_pass_thresholds_tflops": {
            "fp32": 50, "tf32": 400, "fp16": 720, "bf16": 720, "fp8": 1400,
        },
        "tdp_watts": 700,
        "nvlink_gen": 4,
        "nvlink_bandwidth_gbps": 900,
        "pcie_gen": 5,
        "min_driver_version": "545",
        "min_cuda_version": "12.4",
    },
    "h800": {
        # H800 = China-compliance export variant of H100 SXM5. SAME chip / SMs /
        # clocks / HBM as H100 SXM5 — Tensor Core peaks (FP16 / BF16 / FP8 / TF32 /
        # FP32) are identical to H100. Two restrictions vs H100:
        #   1. NVLink bandwidth halved: 400 GB/s bidirectional (vs H100 900 GB/s)
        #   2. FP64 throughput severely cut to ~1 TFLOPS (vs H100 34/67 TFLOPS)
        # All other interfaces (PCIe Gen5, NVSwitch, HBM3 80GB @ 3.35 TB/s) match H100.
        # NCCL multi-GPU thresholds MUST be downscaled because NVLink BW is halved.
        "full_name": "NVIDIA H800 SXM5",
        "architecture": "Hopper",
        "compute_capability": 9.0,
        "hbm_capacity_gb": 80,
        "hbm_type": "HBM3",
        "memory_bandwidth_gbps": 3350,  # GB/s (3.35 TB/s) — same as H100 SXM
        "fp32_tflops": 67,
        "tf32_tflops": 495,  # dense (same as H100)
        "fp16_tflops": 990,  # dense (same as H100)
        "bf16_tflops": 990,  # dense (same as H100)
        "fp8_tflops": 1979,  # dense (same as H100)
        # Tensor Core peaks identical to H100, so PASS thresholds reuse the H100
        # eager-cuBLAS calibration (2026-05-25). Measured on 8×H800: fp32 ~52 /
        # tf32 ~420 / fp16 ~741 / bf16 ~745 / fp8 ~1249 — all clear these. fp8 was
        # 1400 (an H200/rowwise-scaling figure) which PyTorch tensorwise _scaled_mm
        # can't reach on H100-class silicon (~1310 ceiling); lowered to 1200 to match
        # h100. FP64 deliberately NOT listed — H800 is restricted to ~1 TFLOPS FP64.
        "compute_pass_thresholds_tflops": {
            "fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
        },
        "tdp_watts": 700,
        "nvlink_gen": 4,
        "nvlink_bandwidth_gbps": 400,  # bidirectional — HALF of H100 (export restriction)
        "pcie_gen": 5,
        "min_driver_version": "535",
        "min_cuda_version": "12.1",
    },
    "h20": {
        # China-compliance export variant of H200 (reported as "H20" / "H20-3e" by nvidia-smi).
        # Same silicon family / HBM as H200, but Tensor Core peaks are throttled.
        # Peaks below are sourced from supplier / NVIDIA China and confirmed against
        # measured throughput on 8x H20-3e (FP16 ~741, BF16 ~770, FP8 ~1328 TFLOPS).
        # No `compute_pass_thresholds_tflops` entry is defined for this GPU.
        "full_name": "NVIDIA H20 / H20-3e",
        "architecture": "Hopper",
        "compute_capability": 9.0,
        "hbm_capacity_gb": 141,
        "hbm_type": "HBM3e",
        "memory_bandwidth_gbps": 4800,
        "fp32_tflops": 54,  # China spec (matches measured ~51-52)
        "tf32_tflops": 372,  # ~75% of H200 (matches measured ~362)
        "fp16_tflops": 744,  # dense, China spec
        "bf16_tflops": 739,  # dense, China spec
        "fp8_tflops": 1420,  # dense, China spec
        "tdp_watts": 700,
        "nvlink_gen": 4,
        "nvlink_bandwidth_gbps": 900,
        "pcie_gen": 5,
        "min_driver_version": "535",
        "min_cuda_version": "12.1",
    },
    "b200": {
        "full_name": "NVIDIA B200 SXM",
        "architecture": "Blackwell",
        "compute_capability": 10.0,
        "hbm_capacity_gb": 180,
        "hbm_type": "HBM3e",
        "memory_bandwidth_gbps": 8000,  # GB/s (8 TB/s)
        "fp32_tflops": 90,
        "tf32_tflops": 1125,  # dense
        "fp16_tflops": 2250,  # dense
        "bf16_tflops": 2250,  # dense
        "fp8_tflops": 4500,  # dense
        "tdp_watts": 1000,
        "nvlink_gen": 5,
        "nvlink_bandwidth_gbps": 1800,
        "pcie_gen": 5,
        # NOTE(review): Blackwell support is commonly cited as starting with
        # CUDA 12.8 / R570 drivers — confirm these minimums against release notes.
        "min_driver_version": "550",
        "min_cuda_version": "12.4",
    },
    "a100": {
        "full_name": "NVIDIA A100 SXM",
        "architecture": "Ampere",
        "compute_capability": 8.0,
        "hbm_capacity_gb": 80,
        "hbm_type": "HBM2e",
        "memory_bandwidth_gbps": 2039,  # GB/s (~2.0 TB/s)
        "fp32_tflops": 19.5,
        "tf32_tflops": 156,  # dense
        "fp16_tflops": 312,  # dense
        "bf16_tflops": 312,  # dense
        "fp8_tflops": 0,  # Ampere has no FP8
        "tdp_watts": 400,
        "nvlink_gen": 3,
        "nvlink_bandwidth_gbps": 600,  # bidirectional
        "pcie_gen": 4,
        "min_driver_version": "470",
        "min_cuda_version": "11.0",
    },
    "a800": {
        # A800 = export variant of A100: same compute peaks, reduced NVLink bandwidth.
        "full_name": "NVIDIA A800 SXM",
        "architecture": "Ampere",
        "compute_capability": 8.0,
        "hbm_capacity_gb": 80,
        "hbm_type": "HBM2e",
        "memory_bandwidth_gbps": 2039,  # GB/s (~2.0 TB/s)
        "fp32_tflops": 19.5,
        "tf32_tflops": 156,  # dense
        "fp16_tflops": 312,  # dense
        "bf16_tflops": 312,  # dense
        "fp8_tflops": 0,  # Ampere has no FP8
        "tdp_watts": 400,
        "nvlink_gen": 3,
        # bidirectional — NVLink 3 limited to 400 GB/s (vs 600 GB/s on A100).
        # Was 600, i.e. identical to A100, which contradicted the "limited" note.
        "nvlink_bandwidth_gbps": 400,
        "pcie_gen": 4,
        "min_driver_version": "470",
        "min_cuda_version": "11.0",
    },
    "b300": {
        "full_name": "NVIDIA B300 SXM (Blackwell Ultra)",
        "architecture": "Blackwell Ultra",
        # NOTE(review): verify — Blackwell Ultra parts may report a compute
        # capability other than 10.0.
        "compute_capability": 10.0,
        "hbm_capacity_gb": 288,
        "hbm_type": "HBM3e",
        "memory_bandwidth_gbps": 8000,  # GB/s (8 TB/s)
        "fp32_tflops": 125,
        "tf32_tflops": 1750,  # dense (estimated)
        "fp16_tflops": 3500,  # dense
        "bf16_tflops": 3500,  # dense
        "fp8_tflops": 7000,  # dense
        "tdp_watts": 1200,
        "nvlink_gen": 5,
        "nvlink_bandwidth_gbps": 1800,
        "pcie_gen": 5,
        # NOTE(review): same caveat as b200 — confirm the minimum driver / CUDA
        # versions that actually support this GPU.
        "min_driver_version": "550",
        "min_cuda_version": "12.4",
    },
}
# Fallback for unknown / unsupported GPUs
_UNKNOWN_SPECS = {
"full_name": "Unknown GPU",
"architecture": "unknown",
"compute_capability": 0.0,
"hbm_capacity_gb": 0,
"hbm_type": "unknown",
"memory_bandwidth_gbps": 0,
"fp32_tflops": 0,
"tf32_tflops": 0,
"fp16_tflops": 0,
"bf16_tflops": 0,
"fp8_tflops": 0,
"compute_pass_thresholds_tflops": {}, # empty => report.py falls back to 80% of peak
"tdp_watts": 700,
"nvlink_gen": 0,
"nvlink_bandwidth_gbps": 0,
"pcie_gen": 0,
"min_driver_version": "",
"min_cuda_version": "",
}
def detect_gpu_type() -> str:
    """Detect the GPU type via nvidia-smi and return its internal key (e.g. 'h200').

    Only the first GPU reported by nvidia-smi is inspected.

    Returns:
        The matching value from GPU_NAME_PATTERNS, or 'unknown' when nvidia-smi
        is missing, fails, times out, prints nothing, or reports a name that
        matches no known pattern.
    """
    nvidia_smi = shutil.which("nvidia-smi")
    if not nvidia_smi:
        return "unknown"

    try:
        result = subprocess.run(
            [nvidia_smi, "--query-gpu=name", "--format=csv,noheader"],
            capture_output=True, text=True, timeout=10,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return "unknown"

    if result.returncode != 0:
        return "unknown"

    # nvidia-smi can exit 0 without listing any GPU; indexing [0] on the empty
    # line list would raise IndexError, so treat that case as "unknown" too.
    lines = result.stdout.strip().splitlines()
    if not lines:
        return "unknown"
    gpu_name = lines[0].strip().upper()

    # Iterate longest-pattern-first so "H200" doesn't get matched by "H20".
    by_length_desc = sorted(GPU_NAME_PATTERNS.items(), key=lambda kv: -len(kv[0]))
    for pattern, key in by_length_desc:
        if pattern in gpu_name:
            return key
    return "unknown"
def get_gpu_specs(gpu_type: str = None) -> dict:
    """Look up the spec record for a GPU type key.

    When ``gpu_type`` is None the type is auto-detected with detect_gpu_type().
    A key found in GPU_SPECS yields that entry itself (not a copy); any other
    key yields a fresh shallow copy of the zero-peak _UNKNOWN_SPECS fallback.
    """
    key = detect_gpu_type() if gpu_type is None else gpu_type
    try:
        return GPU_SPECS[key]
    except KeyError:
        return dict(_UNKNOWN_SPECS)
def get_supported_gpus() -> list:
    """List every GPU type key present in GPU_SPECS, in definition order."""
    return [gpu_key for gpu_key in GPU_SPECS]
def get_gpu_label(gpu_type: str) -> str:
    """Build a short display label (e.g. 'H200 SXM') for use in tables.

    The label is the entry's ``full_name`` with the "NVIDIA " prefix removed.
    Keys missing from GPU_SPECS produce "Unknown GPU".
    """
    entry = GPU_SPECS.get(gpu_type)
    if not entry:
        return "Unknown GPU"
    # Drop the vendor prefix so table columns stay narrow.
    return entry["full_name"].replace("NVIDIA ", "")
# ---------------------------------------------------------------------------
# Tools path resolution
# ---------------------------------------------------------------------------
def resolve_tools_dir(config: dict) -> str:
    """Resolve the tools installation directory with a smart fallback.

    Priority: GPU_TOOLS_DIR env > config value > /opt/gpu-test-tools > /tmp/gpu-test-tools

    Args:
        config: Parsed configuration mapping. The directory is read from
            ``config["tools"]["install_dir"]``. A missing or empty (None)
            ``config`` or ``tools`` section is treated as "not configured"
            rather than raising.

    Returns:
        The directory path to use. The directory is not created here.
    """
    # 1. Env var override
    env_dir = os.environ.get("GPU_TOOLS_DIR")
    if env_dir:
        return env_dir

    # 2. Config value if explicitly set. A YAML `tools:` key with no children
    #    parses to None, so guard before calling .get() on the section.
    tools_section = (config or {}).get("tools") or {}
    cfg_dir = tools_section.get("install_dir") if isinstance(tools_section, dict) else None
    if cfg_dir:
        return cfg_dir

    # 3. /opt/gpu-test-tools if it already exists or /opt is writable
    default = "/opt/gpu-test-tools"
    if os.path.isdir(default) or os.access("/opt", os.W_OK):
        return default

    # 4. Fallback to /tmp
    return "/tmp/gpu-test-tools"
# ---------------------------------------------------------------------------
# Driver / CUDA compatibility validation
# ---------------------------------------------------------------------------
def _query_nvidia_smi(field: str) -> Optional[str]:
    """Run ``nvidia-smi --query-gpu=<field>`` and return the first output line.

    Returns None when nvidia-smi is not on PATH, the call fails or times out,
    the exit code is non-zero, or nothing is printed.
    """
    binary = shutil.which("nvidia-smi")
    if not binary:
        return None

    command = [binary, f"--query-gpu={field}", "--format=csv,noheader,nounits"]
    try:
        proc = subprocess.run(command, capture_output=True, text=True, timeout=10)
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None

    output = proc.stdout.strip()
    if proc.returncode != 0 or not output:
        return None
    # One line is printed per GPU; only the first one is reported.
    return output.splitlines()[0].strip()
def _version_lt(actual: str, minimum: str) -> bool:
"""Return True if actual version < minimum (numeric dotted comparison)."""
def to_tuple(v: str):
parts = []
for p in v.split("."):
try:
parts.append(int(p))
except ValueError:
break
return tuple(parts) if parts else (0,)
return to_tuple(actual) < to_tuple(minimum)
def validate_driver_compatibility(gpu_type: str) -> List[str]:
    """Compare the installed driver / CUDA versions with the GPU's minimums.

    The minimums come from the ``min_driver_version`` and ``min_cuda_version``
    fields of the spec record for ``gpu_type``; the installed versions are read
    through nvidia-smi. A check is skipped when either side of the comparison
    is unavailable.

    Returns:
        A list of warning strings, empty when nothing is below its minimum.
    """
    spec = get_gpu_specs(gpu_type)
    required_driver = spec.get("min_driver_version", "")
    required_cuda = spec.get("min_cuda_version", "")

    # Nothing to compare against (e.g. the unknown-GPU fallback record).
    if not (required_driver or required_cuda):
        return []

    installed_driver = _query_nvidia_smi("driver_version")
    # nvidia-smi reports the highest CUDA version supported by the driver.
    # NOTE(review): confirm "cuda_version" is an accepted --query-gpu field on
    # the deployed nvidia-smi; a rejected query yields None and this check is
    # then silently skipped.
    installed_cuda = _query_nvidia_smi("cuda_version")
    label = get_gpu_label(gpu_type)

    found: List[str] = []
    if installed_driver and required_driver:
        if _version_lt(installed_driver, required_driver):
            found.append(
                f"Driver {installed_driver} < minimum {required_driver} required for {label}"
            )
    if installed_cuda and required_cuda:
        if _version_lt(installed_cuda, required_cuda):
            found.append(
                f"CUDA {installed_cuda} < minimum {required_cuda} required for {label}"
            )
    return found