- modules/rdma_test.py: 新增 SSH 编排的跨机 RDMA(run_cross_node / _cross_node_perftest / 解析器),从 client 端逐设备拉起对端 perftest server 跑本地 client,替代已删除的 scripts/rdma_cross_node.sh;两机 4×NDR400 实测全 PASS(~387-392 Gb/s,~2 µs)。 - configs/default.yaml: 新增 rdma.cross_node 配置块(默认 enabled:false)。 - modules/gpu_specs.py: H800 PASS 门槛对齐 H100 实测地板 (tf32 400->385, bf16 720->730, fp8 1400->1200);H800=H100 硅片, PyTorch tensorwise fp8 天花板 ~1310,原 1400 不可达。 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
381 lines
15 KiB
Python
381 lines
15 KiB
Python
"""GPU specifications database for NVIDIA datacenter GPUs."""
|
||
|
||
import os
|
||
import shutil
|
||
import subprocess
|
||
from typing import List, Optional
|
||
|
||
# Product-name substring (upper case) -> internal spec key.
# Every internal key is simply the lower-cased pattern, so the table is built
# from the pattern list. Overlapping patterns stay listed longest-first
# ("H200" ahead of "H20") so that an in-order scan of this mapping cannot
# mistake an H200 for an H20.
GPU_NAME_PATTERNS = {
    pattern: pattern.lower()
    for pattern in (
        "A100",
        "A800",
        "H100",
        "H800",  # H800 = H100 SXM with NVLink halved (400 GB/s) and FP64 restricted
        "H200",
        "H20",   # H20 / H20-3e is the China-compliance export variant, REDUCED peaks
        "B200",
        "B300",
    )
}
|
||
|
||
# Specs database — ALL compute values are DENSE (non-sparse) TFLOPS.
# NOTE: despite the `_gbps` suffix, `memory_bandwidth_gbps` and
# `nvlink_bandwidth_gbps` are expressed in GB/s (see the inline comments).
# Entries without `compute_pass_thresholds_tflops` carry peaks only.
GPU_SPECS = {
    "h100": {
        # Peaks below are NVIDIA marketing dense peaks (theoretical Tensor Core max).
        # `compute_pass_thresholds_tflops` carries the absolute PASS thresholds used
        # by report.py — decoupled from peaks so marketing-spec changes (dense vs
        # sparse vs FP8-sparsity) don't shift the validation bar.
        "full_name": "NVIDIA H100 SXM5",
        "architecture": "Hopper",
        "compute_capability": 9.0,
        "hbm_capacity_gb": 80,
        "hbm_type": "HBM3",
        "memory_bandwidth_gbps": 3400,  # GB/s (3.4 TB/s)
        "fp32_tflops": 67,
        "tf32_tflops": 495,   # dense (989 sparse)
        "fp16_tflops": 990,   # dense (1979 sparse w/ 2:4)
        "bf16_tflops": 990,   # dense
        "fp8_tflops": 1979,   # dense
        "compute_pass_thresholds_tflops": {
            # Recalibrated 2026-05-25 to the H100 eager-cuBLAS achievable floor (each
            # threshold ~2-4% below the sustained value measured across 16 GPUs via the
            # MAMF shape sweep: fp32 ~52 / tf32 ~405 / fp16 ~732-748 / bf16 ~747-758 /
            # fp8 ~1248-1271). The old marketing/MAMF-derived values (fp32 54, tf32 444,
            # fp16 734, bf16 745, fp8 1400) sat ON or ABOVE what PyTorch cuBLAS reaches
            # on H100, so healthy cards flaked to WARN/FAIL. fp8 1400 in particular was
            # an H200/rowwise-scaling figure; H100 tensorwise _scaled_mm tops out ~1310.
            "fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
            # FP64 63 / INT8 1536 — listed for documentation; benchmark module
            # doesn't currently exercise these dtypes.
        },
        "tdp_watts": 700,
        "nvlink_gen": 4,
        "nvlink_bandwidth_gbps": 900,  # bidirectional
        "pcie_gen": 5,
        "min_driver_version": "535",
        "min_cuda_version": "12.1",
    },
    "h200": {
        "full_name": "NVIDIA H200 SXM",
        "architecture": "Hopper",
        "compute_capability": 9.0,
        "hbm_capacity_gb": 141,
        "hbm_type": "HBM3e",
        "memory_bandwidth_gbps": 4800,  # GB/s (4.8 TB/s) — THIS IS THE CORRECT VALUE, NOT 989!
        "fp32_tflops": 67,
        "tf32_tflops": 495,   # dense
        "fp16_tflops": 990,   # dense
        "bf16_tflops": 990,   # dense
        "fp8_tflops": 1979,   # dense
        # PASS thresholds aligned with H200_production_acceptance.md v2 (2026-05-21):
        # calibrated against Semianalysis & stas00 MAMF — H200 shares H100 SMs so
        # achievable TFLOPS in PyTorch is in the same band.
        "compute_pass_thresholds_tflops": {
            "fp32": 50, "tf32": 400, "fp16": 720, "bf16": 720, "fp8": 1400,
        },
        "tdp_watts": 700,
        "nvlink_gen": 4,
        "nvlink_bandwidth_gbps": 900,
        "pcie_gen": 5,
        "min_driver_version": "545",
        "min_cuda_version": "12.4",
    },
    "h800": {
        # H800 = China-compliance export variant of H100 SXM5. SAME chip / SMs /
        # clocks / HBM as H100 SXM5 — Tensor Core peaks (FP16 / BF16 / FP8 / TF32 /
        # FP32) are identical to H100. Two restrictions vs H100:
        #   1. NVLink bandwidth halved: 400 GB/s bidirectional (vs H100 900 GB/s)
        #   2. FP64 throughput severely cut to ~1 TFLOPS (vs H100 34/67 TFLOPS)
        # All other interfaces (PCIe Gen5, NVSwitch, HBM3 80GB @ 3.35 TB/s) match H100.
        # NCCL multi-GPU thresholds MUST be downscaled because NVLink BW is halved.
        "full_name": "NVIDIA H800 SXM5",
        "architecture": "Hopper",
        "compute_capability": 9.0,
        "hbm_capacity_gb": 80,
        "hbm_type": "HBM3",
        # GB/s (3.35 TB/s) — same HBM as H100 SXM; note the "h100" entry in this
        # table records the rounded figure 3400 instead.
        "memory_bandwidth_gbps": 3350,
        "fp32_tflops": 67,
        "tf32_tflops": 495,   # dense (same as H100)
        "fp16_tflops": 990,   # dense (same as H100)
        "bf16_tflops": 990,   # dense (same as H100)
        "fp8_tflops": 1979,   # dense (same as H100)
        # Tensor Core peaks identical to H100, so PASS thresholds reuse the H100
        # eager-cuBLAS calibration (2026-05-25). Measured on 8×H800: fp32 ~52 /
        # tf32 ~420 / fp16 ~741 / bf16 ~745 / fp8 ~1249 — all clear these. fp8 was
        # 1400 (an H200/rowwise-scaling figure) which PyTorch tensorwise _scaled_mm
        # can't reach on H100-class silicon (~1310 ceiling); lowered to 1200 to match
        # h100. FP64 deliberately NOT listed — H800 is restricted to ~1 TFLOPS FP64.
        "compute_pass_thresholds_tflops": {
            "fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200,
        },
        "tdp_watts": 700,
        "nvlink_gen": 4,
        "nvlink_bandwidth_gbps": 400,  # bidirectional — HALF of H100 (export restriction)
        "pcie_gen": 5,
        "min_driver_version": "535",
        "min_cuda_version": "12.1",
    },
    "h20": {
        # China-compliance export variant of H200 (reported as "H20" / "H20-3e" by nvidia-smi).
        # Same silicon family / HBM as H200, but Tensor Core peaks are throttled.
        # Peaks below are sourced from supplier / NVIDIA China and confirmed against
        # measured throughput on 8x H20-3e (FP16 ~741, BF16 ~770, FP8 ~1328 TFLOPS).
        "full_name": "NVIDIA H20 / H20-3e",
        "architecture": "Hopper",
        "compute_capability": 9.0,
        "hbm_capacity_gb": 141,
        "hbm_type": "HBM3e",
        "memory_bandwidth_gbps": 4800,
        "fp32_tflops": 54,    # China spec (matches measured ~51-52)
        "tf32_tflops": 372,   # ~75% of H200 (matches measured ~362)
        "fp16_tflops": 744,   # dense, China spec
        "bf16_tflops": 739,   # dense, China spec
        "fp8_tflops": 1420,   # dense, China spec
        "tdp_watts": 700,
        "nvlink_gen": 4,
        "nvlink_bandwidth_gbps": 900,
        "pcie_gen": 5,
        "min_driver_version": "535",
        "min_cuda_version": "12.1",
    },
    "b200": {
        "full_name": "NVIDIA B200 SXM",
        "architecture": "Blackwell",
        "compute_capability": 10.0,
        "hbm_capacity_gb": 180,
        "hbm_type": "HBM3e",
        "memory_bandwidth_gbps": 8000,  # GB/s (8 TB/s)
        "fp32_tflops": 90,
        "tf32_tflops": 1125,  # dense
        "fp16_tflops": 2250,  # dense
        "bf16_tflops": 2250,  # dense
        "fp8_tflops": 4500,   # dense
        "tdp_watts": 1000,
        "nvlink_gen": 5,
        "nvlink_bandwidth_gbps": 1800,
        "pcie_gen": 5,
        "min_driver_version": "550",
        "min_cuda_version": "12.4",
    },
    "a100": {
        "full_name": "NVIDIA A100 SXM",
        "architecture": "Ampere",
        "compute_capability": 8.0,
        "hbm_capacity_gb": 80,
        "hbm_type": "HBM2e",
        "memory_bandwidth_gbps": 2039,  # GB/s (~2.0 TB/s)
        "fp32_tflops": 19.5,
        "tf32_tflops": 156,   # dense
        "fp16_tflops": 312,   # dense
        "bf16_tflops": 312,   # dense
        "fp8_tflops": 0,      # Ampere has no FP8
        "tdp_watts": 400,
        "nvlink_gen": 3,
        "nvlink_bandwidth_gbps": 600,  # bidirectional
        "pcie_gen": 4,
        "min_driver_version": "470",
        "min_cuda_version": "11.0",
    },
    "a800": {
        # A800 = export variant of A100: identical compute/HBM figures, but the
        # NVLink 3 bandwidth is capped at 400 GB/s (A100: 600 GB/s).
        "full_name": "NVIDIA A800 SXM",
        "architecture": "Ampere",
        "compute_capability": 8.0,
        "hbm_capacity_gb": 80,
        "hbm_type": "HBM2e",
        "memory_bandwidth_gbps": 2039,  # GB/s (~2.0 TB/s)
        "fp32_tflops": 19.5,
        "tf32_tflops": 156,   # dense
        "fp16_tflops": 312,   # dense
        "bf16_tflops": 312,   # dense
        "fp8_tflops": 0,      # Ampere has no FP8
        "tdp_watts": 400,
        "nvlink_gen": 3,
        # Was 600 (the A100 value) even though the entry was annotated as
        # "limited vs A100"; corrected to the export-limited figure.
        "nvlink_bandwidth_gbps": 400,  # bidirectional (NVLink 3, limited vs A100's 600)
        "pcie_gen": 4,
        "min_driver_version": "470",
        "min_cuda_version": "11.0",
    },
    "b300": {
        "full_name": "NVIDIA B300 SXM (Blackwell Ultra)",
        "architecture": "Blackwell Ultra",
        "compute_capability": 10.0,
        "hbm_capacity_gb": 288,
        "hbm_type": "HBM3e",
        "memory_bandwidth_gbps": 8000,  # GB/s (8 TB/s)
        "fp32_tflops": 125,
        "tf32_tflops": 1750,  # dense (estimated)
        "fp16_tflops": 3500,  # dense
        "bf16_tflops": 3500,  # dense
        "fp8_tflops": 7000,   # dense
        "tdp_watts": 1200,
        "nvlink_gen": 5,
        "nvlink_bandwidth_gbps": 1800,
        "pcie_gen": 5,
        "min_driver_version": "550",
        "min_cuda_version": "12.4",
    },
}
|
||
|
||
# Placeholder spec for GPUs that have no entry in the database: names are
# "unknown"/empty and every peak is zero, so nothing is claimed about the card.
_UNKNOWN_SPECS = dict(
    full_name="Unknown GPU",
    architecture="unknown",
    compute_capability=0.0,
    hbm_capacity_gb=0,
    hbm_type="unknown",
    memory_bandwidth_gbps=0,
    fp32_tflops=0,
    tf32_tflops=0,
    fp16_tflops=0,
    bf16_tflops=0,
    fp8_tflops=0,
    compute_pass_thresholds_tflops={},  # empty => report.py falls back to 80% of peak
    tdp_watts=700,
    nvlink_gen=0,
    nvlink_bandwidth_gbps=0,
    pcie_gen=0,
    min_driver_version="",
    min_cuda_version="",
)
|
||
|
||
|
||
def detect_gpu_type() -> str:
    """Detect GPU type via nvidia-smi and return the internal key (e.g. 'h200').

    Only the first line of nvidia-smi's name listing is inspected.

    Returns:
        The GPU_NAME_PATTERNS value whose pattern occurs in the upper-cased
        product name, or 'unknown' if nvidia-smi is unavailable, cannot be run,
        times out, exits non-zero, prints nothing, or names an unrecognized GPU.
    """
    nvidia_smi = shutil.which("nvidia-smi")
    if not nvidia_smi:
        return "unknown"

    # Keep the try body limited to the call that can actually raise these.
    try:
        r = subprocess.run(
            [nvidia_smi, "--query-gpu=name", "--format=csv,noheader"],
            capture_output=True, text=True, timeout=10,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return "unknown"

    if r.returncode != 0:
        return "unknown"

    lines = r.stdout.strip().splitlines()
    if not lines:
        # Exit code 0 with empty output: indexing [0] here used to raise an
        # uncaught IndexError instead of reporting 'unknown'.
        return "unknown"
    first_line = lines[0].strip().upper()

    # Iterate longest-pattern-first so "H200" doesn't get matched by "H20".
    for pattern, key in sorted(GPU_NAME_PATTERNS.items(), key=lambda kv: -len(kv[0])):
        if pattern in first_line:
            return key
    return "unknown"
|
||
|
||
|
||
def get_gpu_specs(gpu_type: Optional[str] = None) -> dict:
    """Return specs dict for the given gpu_type, auto-detecting if None.

    Args:
        gpu_type: Internal GPU key such as 'h200'. When None, the key is
            obtained from detect_gpu_type().

    Returns:
        The GPU_SPECS entry for a known key (the table's own dict object, not
        a copy), or an independent deep copy of the 'unknown' fallback with
        zero peaks for unsupported GPUs.
    """
    import copy  # function-scope import keeps this block self-contained

    if gpu_type is None:
        gpu_type = detect_gpu_type()

    specs = GPU_SPECS.get(gpu_type)
    if specs is not None:
        return specs
    # Deep copy: a shallow dict() copy would still share the nested
    # compute_pass_thresholds_tflops dict with the module-level template.
    return copy.deepcopy(_UNKNOWN_SPECS)
|
||
|
||
|
||
def get_supported_gpus() -> list:
    """Return the internal keys of every GPU present in GPU_SPECS, as a new list."""
    return [gpu_key for gpu_key in GPU_SPECS]
|
||
|
||
|
||
def get_gpu_label(gpu_type: str) -> str:
    """Return a short display label (e.g. 'H200 SXM') for use in tables.

    The label is the entry's full name with the "NVIDIA " vendor text removed;
    keys without a GPU_SPECS entry yield 'Unknown GPU'.
    """
    entry = GPU_SPECS.get(gpu_type)
    if not entry:
        return "Unknown GPU"
    # Drop the vendor text so the label stays compact.
    label = entry["full_name"].replace("NVIDIA ", "")
    return label
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Tools path resolution
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def resolve_tools_dir(config: dict) -> str:
    """Resolve tools installation directory with smart fallback.

    Priority: GPU_TOOLS_DIR env > config value > /opt/gpu-test-tools > /tmp/gpu-test-tools

    Args:
        config: Parsed configuration; the directory is read from
            config["tools"]["install_dir"]. A missing or empty config, a
            missing/None/non-dict "tools" section and an empty "install_dir"
            are all treated as "not configured".

    Returns:
        The directory path chosen by the priority order above.
    """
    # 1. Env var override
    env_dir = os.environ.get("GPU_TOOLS_DIR")
    if env_dir:
        return env_dir

    # 2. Config value if explicitly set. A "tools:" key with no children loads
    #    as None, so the section must be type-checked before calling .get on it.
    tools_cfg = config.get("tools") if config else None
    cfg_dir = tools_cfg.get("install_dir") if isinstance(tools_cfg, dict) else None
    if cfg_dir:
        return cfg_dir

    # 3. /opt/gpu-test-tools if it already exists or /opt is writable
    default = "/opt/gpu-test-tools"
    if os.path.isdir(default) or os.access("/opt", os.W_OK):
        return default

    # 4. Fallback to /tmp
    return "/tmp/gpu-test-tools"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Driver / CUDA compatibility validation
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _query_nvidia_smi(field: str) -> Optional[str]:
    """Run `nvidia-smi --query-gpu=<field>` and return the first output line.

    Returns None when nvidia-smi is not on PATH, cannot be executed, times out
    (10 s), exits non-zero, or prints only whitespace.
    """
    binary = shutil.which("nvidia-smi")
    if not binary:
        return None

    command = [binary, f"--query-gpu={field}", "--format=csv,noheader,nounits"]
    try:
        proc = subprocess.run(command, capture_output=True, text=True, timeout=10)
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return None

    output = proc.stdout.strip()
    if proc.returncode != 0 or not output:
        return None
    # Only the first line of the listing is reported.
    return output.splitlines()[0].strip()
|
||
|
||
|
||
def _version_lt(actual: str, minimum: str) -> bool:
    """Return True if actual version < minimum (numeric dotted comparison).

    Each version is split on "." and parsed left to right; parsing stops at the
    first component that is not an integer, and a version with no leading
    integer component counts as 0. The shorter version is zero-padded before
    comparing, so "12" and "12.0" are treated as equal.
    """
    def to_parts(v: str) -> list:
        parts = []
        for p in v.split("."):
            try:
                parts.append(int(p))
            except ValueError:
                break
        return parts or [0]

    actual_parts = to_parts(actual)
    minimum_parts = to_parts(minimum)

    # Without padding, (12,) < (12, 0) is True and "12" would be reported as
    # older than "12.0".
    width = max(len(actual_parts), len(minimum_parts))
    actual_parts += [0] * (width - len(actual_parts))
    minimum_parts += [0] * (width - len(minimum_parts))
    return actual_parts < minimum_parts
|
||
|
||
|
||
def validate_driver_compatibility(gpu_type: str) -> List[str]:
    """Compare the installed driver/CUDA versions with the GPU's minimums.

    The minimums come from the spec entry for gpu_type and the installed
    versions from nvidia-smi. A check is skipped when either side of it is
    missing (no minimum listed, or nvidia-smi gave no value).

    Returns a list of warning strings (empty if everything is fine).
    """
    specs = get_gpu_specs(gpu_type)
    required_driver = specs.get("min_driver_version", "")
    required_cuda = specs.get("min_cuda_version", "")

    warnings: List[str] = []
    if not (required_driver or required_cuda):
        # Nothing to validate against (e.g. the unknown-GPU fallback).
        return warnings

    installed_driver = _query_nvidia_smi("driver_version")
    # nvidia-smi reports the highest CUDA version supported by the driver
    installed_cuda = _query_nvidia_smi("cuda_version")
    gpu_label = get_gpu_label(gpu_type)

    driver_too_old = bool(
        installed_driver and required_driver
        and _version_lt(installed_driver, required_driver)
    )
    if driver_too_old:
        warnings.append(
            f"Driver {installed_driver} < minimum {required_driver} required for {gpu_label}"
        )

    cuda_too_old = bool(
        installed_cuda and required_cuda
        and _version_lt(installed_cuda, required_cuda)
    )
    if cuda_too_old:
        warnings.append(
            f"CUDA {installed_cuda} < minimum {required_cuda} required for {gpu_label}"
        )

    return warnings
|