"""GPU specifications database for NVIDIA datacenter GPUs.""" import os import shutil import subprocess from typing import List, Optional # GPU name patterns -> internal key mapping # Order matters: longer/more-specific patterns must come before shorter ones. GPU_NAME_PATTERNS = { "A100": "a100", "A800": "a800", "H100": "h100", "H800": "h800", # H800 = H100 SXM with NVLink halved (400 GB/s) and FP64 restricted "H200": "h200", "H20": "h20", # H20 / H20-3e is the China-compliance export variant, REDUCED peaks "B200": "b200", "B300": "b300", } # Specs database — ALL values are DENSE (non-sparse) TFLOPS GPU_SPECS = { "h100": { # Peaks below are NVIDIA marketing dense peaks (theoretical Tensor Core max). # `compute_pass_thresholds_tflops` carries the absolute PASS thresholds used # by report.py — decoupled from peaks so marketing-spec changes (dense vs # sparse vs FP8-sparsity) don't shift the validation bar. "full_name": "NVIDIA H100 SXM5", "architecture": "Hopper", "compute_capability": 9.0, "hbm_capacity_gb": 80, "hbm_type": "HBM3", "memory_bandwidth_gbps": 3400, # GB/s (3.4 TB/s) "fp32_tflops": 67, "tf32_tflops": 495, # dense (989 sparse) "fp16_tflops": 990, # dense (1979 sparse w/ 2:4) "bf16_tflops": 990, # dense "fp8_tflops": 1979, # dense "compute_pass_thresholds_tflops": { # Recalibrated 2026-05-25 to the H100 eager-cuBLAS achievable floor (each # threshold ~2-4% below the sustained value measured across 16 GPUs via the # MAMF shape sweep: fp32 ~52 / tf32 ~405 / fp16 ~732-748 / bf16 ~747-758 / # fp8 ~1248-1271). The old marketing/MAMF-derived values (fp32 54, tf32 444, # fp16 734, bf16 745, fp8 1400) sat ON or ABOVE what PyTorch cuBLAS reaches # on H100, so healthy cards flaked to WARN/FAIL. fp8 1400 in particular was # an H200/rowwise-scaling figure; H100 tensorwise _scaled_mm tops out ~1310. "fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200, # FP64 63 / INT8 1536 — listed for documentation; benchmark module # doesn't currently exercise these dtypes. }, "tdp_watts": 700, "nvlink_gen": 4, "nvlink_bandwidth_gbps": 900, # bidirectional "pcie_gen": 5, "min_driver_version": "535", "min_cuda_version": "12.1", }, "h200": { "full_name": "NVIDIA H200 SXM", "architecture": "Hopper", "compute_capability": 9.0, "hbm_capacity_gb": 141, "hbm_type": "HBM3e", "memory_bandwidth_gbps": 4800, # GB/s (4.8 TB/s) — THIS IS THE CORRECT VALUE, NOT 989! "fp32_tflops": 67, "tf32_tflops": 495, # dense "fp16_tflops": 990, # dense "bf16_tflops": 990, # dense "fp8_tflops": 1979, # dense # PASS thresholds aligned with H200_production_acceptance.md v2 (2026-05-21): # calibrated against Semianalysis & stas00 MAMF — H200 shares H100 SMs so # achievable TFLOPS in PyTorch is in the same band. "compute_pass_thresholds_tflops": { "fp32": 50, "tf32": 400, "fp16": 720, "bf16": 720, "fp8": 1400, }, "tdp_watts": 700, "nvlink_gen": 4, "nvlink_bandwidth_gbps": 900, "pcie_gen": 5, "min_driver_version": "545", "min_cuda_version": "12.4", }, "h800": { # H800 = China-compliance export variant of H100 SXM5. SAME chip / SMs / # clocks / HBM as H100 SXM5 — Tensor Core peaks (FP16 / BF16 / FP8 / TF32 / # FP32) are identical to H100. Two restrictions vs H100: # 1. NVLink bandwidth halved: 400 GB/s bidirectional (vs H100 900 GB/s) # 2. FP64 throughput severely cut to ~1 TFLOPS (vs H100 34/67 TFLOPS) # All other interfaces (PCIe Gen5, NVSwitch, HBM3 80GB @ 3.35 TB/s) match H100. # NCCL multi-GPU thresholds MUST be downscaled because NVLink BW is halved. "full_name": "NVIDIA H800 SXM5", "architecture": "Hopper", "compute_capability": 9.0, "hbm_capacity_gb": 80, "hbm_type": "HBM3", "memory_bandwidth_gbps": 3350, # GB/s (3.35 TB/s) — same as H100 SXM "fp32_tflops": 67, "tf32_tflops": 495, # dense (same as H100) "fp16_tflops": 990, # dense (same as H100) "bf16_tflops": 990, # dense (same as H100) "fp8_tflops": 1979, # dense (same as H100) # Tensor Core peaks identical to H100, so PASS thresholds reuse the H100 # eager-cuBLAS calibration (2026-05-25). Measured on 8×H800: fp32 ~52 / # tf32 ~420 / fp16 ~741 / bf16 ~745 / fp8 ~1249 — all clear these. fp8 was # 1400 (an H200/rowwise-scaling figure) which PyTorch tensorwise _scaled_mm # can't reach on H100-class silicon (~1310 ceiling); lowered to 1200 to match # h100. FP64 deliberately NOT listed — H800 is restricted to ~1 TFLOPS FP64. "compute_pass_thresholds_tflops": { "fp32": 50, "tf32": 385, "fp16": 720, "bf16": 730, "fp8": 1200, }, "tdp_watts": 700, "nvlink_gen": 4, "nvlink_bandwidth_gbps": 400, # bidirectional — HALF of H100 (export restriction) "pcie_gen": 5, "min_driver_version": "535", "min_cuda_version": "12.1", }, "h20": { # China-compliance export variant of H200 (reported as "H20" / "H20-3e" by nvidia-smi). # Same silicon family / HBM as H200, but Tensor Core peaks are throttled. # Peaks below are sourced from supplier / NVIDIA China and confirmed against # measured throughput on 8x H20-3e (FP16 ~741, BF16 ~770, FP8 ~1328 TFLOPS). "full_name": "NVIDIA H20 / H20-3e", "architecture": "Hopper", "compute_capability": 9.0, "hbm_capacity_gb": 141, "hbm_type": "HBM3e", "memory_bandwidth_gbps": 4800, "fp32_tflops": 54, # China spec (matches measured ~51-52) "tf32_tflops": 372, # ~75% of H200 (matches measured ~362) "fp16_tflops": 744, # dense, China spec "bf16_tflops": 739, # dense, China spec "fp8_tflops": 1420, # dense, China spec "tdp_watts": 700, "nvlink_gen": 4, "nvlink_bandwidth_gbps": 900, "pcie_gen": 5, "min_driver_version": "535", "min_cuda_version": "12.1", }, "b200": { "full_name": "NVIDIA B200 SXM", "architecture": "Blackwell", "compute_capability": 10.0, "hbm_capacity_gb": 180, "hbm_type": "HBM3e", "memory_bandwidth_gbps": 8000, # GB/s (8 TB/s) "fp32_tflops": 90, "tf32_tflops": 1125, # dense "fp16_tflops": 2250, # dense "bf16_tflops": 2250, # dense "fp8_tflops": 4500, # dense "tdp_watts": 1000, "nvlink_gen": 5, "nvlink_bandwidth_gbps": 1800, "pcie_gen": 5, "min_driver_version": "550", "min_cuda_version": "12.4", }, "a100": { "full_name": "NVIDIA A100 SXM", "architecture": "Ampere", "compute_capability": 8.0, "hbm_capacity_gb": 80, "hbm_type": "HBM2e", "memory_bandwidth_gbps": 2039, # GB/s (~2.0 TB/s) "fp32_tflops": 19.5, "tf32_tflops": 156, # dense "fp16_tflops": 312, # dense "bf16_tflops": 312, # dense "fp8_tflops": 0, # Ampere has no FP8 "tdp_watts": 400, "nvlink_gen": 3, "nvlink_bandwidth_gbps": 600, # bidirectional "pcie_gen": 4, "min_driver_version": "470", "min_cuda_version": "11.0", }, "a800": { "full_name": "NVIDIA A800 SXM", "architecture": "Ampere", "compute_capability": 8.0, "hbm_capacity_gb": 80, "hbm_type": "HBM2e", "memory_bandwidth_gbps": 2039, # GB/s (~2.0 TB/s) "fp32_tflops": 19.5, "tf32_tflops": 156, # dense "fp16_tflops": 312, # dense "bf16_tflops": 312, # dense "fp8_tflops": 0, # Ampere has no FP8 "tdp_watts": 400, "nvlink_gen": 3, "nvlink_bandwidth_gbps": 600, # bidirectional (NVLink 3, limited vs A100) "pcie_gen": 4, "min_driver_version": "470", "min_cuda_version": "11.0", }, "b300": { "full_name": "NVIDIA B300 SXM (Blackwell Ultra)", "architecture": "Blackwell Ultra", "compute_capability": 10.0, "hbm_capacity_gb": 288, "hbm_type": "HBM3e", "memory_bandwidth_gbps": 8000, # GB/s (8 TB/s) "fp32_tflops": 125, "tf32_tflops": 1750, # dense (estimated) "fp16_tflops": 3500, # dense "bf16_tflops": 3500, # dense "fp8_tflops": 7000, # dense "tdp_watts": 1200, "nvlink_gen": 5, "nvlink_bandwidth_gbps": 1800, "pcie_gen": 5, "min_driver_version": "550", "min_cuda_version": "12.4", }, } # Fallback for unknown / unsupported GPUs _UNKNOWN_SPECS = { "full_name": "Unknown GPU", "architecture": "unknown", "compute_capability": 0.0, "hbm_capacity_gb": 0, "hbm_type": "unknown", "memory_bandwidth_gbps": 0, "fp32_tflops": 0, "tf32_tflops": 0, "fp16_tflops": 0, "bf16_tflops": 0, "fp8_tflops": 0, "compute_pass_thresholds_tflops": {}, # empty => report.py falls back to 80% of peak "tdp_watts": 700, "nvlink_gen": 0, "nvlink_bandwidth_gbps": 0, "pcie_gen": 0, "min_driver_version": "", "min_cuda_version": "", } def detect_gpu_type() -> str: """Detect GPU type via nvidia-smi and return the internal key (e.g. 'h200'). Returns 'unknown' if nvidia-smi is unavailable or the GPU is not recognized. """ nvidia_smi = shutil.which("nvidia-smi") if not nvidia_smi: return "unknown" try: r = subprocess.run( [nvidia_smi, "--query-gpu=name", "--format=csv,noheader"], capture_output=True, text=True, timeout=10, ) if r.returncode != 0: return "unknown" first_line = r.stdout.strip().splitlines()[0].strip().upper() # Iterate longest-pattern-first so "H200" doesn't get matched by "H20". for pattern, key in sorted(GPU_NAME_PATTERNS.items(), key=lambda kv: -len(kv[0])): if pattern in first_line: return key return "unknown" except (subprocess.TimeoutExpired, FileNotFoundError, OSError): return "unknown" def get_gpu_specs(gpu_type: str = None) -> dict: """Return specs dict for the given gpu_type, auto-detecting if None. Returns a minimal 'unknown' fallback dict with zero peaks for unsupported GPUs. """ if gpu_type is None: gpu_type = detect_gpu_type() return GPU_SPECS.get(gpu_type, dict(_UNKNOWN_SPECS)) def get_supported_gpus() -> list: """Return list of supported GPU type keys.""" return list(GPU_SPECS.keys()) def get_gpu_label(gpu_type: str) -> str: """Return a short human-readable label like 'H200 SXM' for display in tables.""" specs = GPU_SPECS.get(gpu_type) if specs: full = specs["full_name"] # Strip the "NVIDIA " prefix for display return full.replace("NVIDIA ", "") return "Unknown GPU" # --------------------------------------------------------------------------- # Tools path resolution # --------------------------------------------------------------------------- def resolve_tools_dir(config: dict) -> str: """Resolve tools installation directory with smart fallback. Priority: GPU_TOOLS_DIR env > config value > /opt/gpu-test-tools > /tmp/gpu-test-tools """ # 1. Env var override env_dir = os.environ.get("GPU_TOOLS_DIR") if env_dir: return env_dir # 2. Config value if explicitly set cfg_dir = config.get("tools", {}).get("install_dir", "") if cfg_dir: return cfg_dir # 3. /opt/gpu-test-tools if it already exists or /opt is writable default = "/opt/gpu-test-tools" if os.path.isdir(default) or os.access("/opt", os.W_OK): return default # 4. Fallback to /tmp return "/tmp/gpu-test-tools" # --------------------------------------------------------------------------- # Driver / CUDA compatibility validation # --------------------------------------------------------------------------- def _query_nvidia_smi(field: str) -> Optional[str]: """Query a single nvidia-smi field, return value string or None.""" nvidia_smi = shutil.which("nvidia-smi") if not nvidia_smi: return None try: r = subprocess.run( [nvidia_smi, f"--query-gpu={field}", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=10, ) if r.returncode == 0 and r.stdout.strip(): return r.stdout.strip().splitlines()[0].strip() except (subprocess.TimeoutExpired, FileNotFoundError, OSError): pass return None def _version_lt(actual: str, minimum: str) -> bool: """Return True if actual version < minimum (numeric dotted comparison).""" def to_tuple(v: str): parts = [] for p in v.split("."): try: parts.append(int(p)) except ValueError: break return tuple(parts) if parts else (0,) return to_tuple(actual) < to_tuple(minimum) def validate_driver_compatibility(gpu_type: str) -> List[str]: """Check if current driver/CUDA meets minimum requirements for the detected GPU. Returns a list of warning strings (empty if everything is fine). """ specs = get_gpu_specs(gpu_type) warnings: List[str] = [] min_driver = specs.get("min_driver_version", "") min_cuda = specs.get("min_cuda_version", "") if not min_driver and not min_cuda: return warnings actual_driver = _query_nvidia_smi("driver_version") # nvidia-smi reports the highest CUDA version supported by the driver actual_cuda = _query_nvidia_smi("cuda_version") gpu_label = get_gpu_label(gpu_type) if actual_driver and min_driver and _version_lt(actual_driver, min_driver): warnings.append( f"Driver {actual_driver} < minimum {min_driver} required for {gpu_label}" ) if actual_cuda and min_cuda and _version_lt(actual_cuda, min_cuda): warnings.append( f"CUDA {actual_cuda} < minimum {min_cuda} required for {gpu_label}" ) return warnings