"""GPU specifications database for NVIDIA datacenter GPUs.""" import os import shutil import subprocess from typing import List, Optional # GPU name patterns -> internal key mapping # Order matters: longer/more-specific patterns must come before shorter ones. GPU_NAME_PATTERNS = { "A100": "a100", "A800": "a800", "H100": "h100", "H200": "h200", "H20": "h20", # H20 / H20-3e is the China-compliance export variant, REDUCED peaks "B200": "b200", "B300": "b300", } # Specs database — ALL values are DENSE (non-sparse) TFLOPS GPU_SPECS = { "h100": { "full_name": "NVIDIA H100 SXM5", "architecture": "Hopper", "compute_capability": 9.0, "hbm_capacity_gb": 80, "hbm_type": "HBM3", "memory_bandwidth_gbps": 3400, # GB/s (3.4 TB/s) "fp32_tflops": 67, "tf32_tflops": 495, # dense (989 sparse) "fp16_tflops": 990, # dense (1979 sparse w/ 2:4) "bf16_tflops": 990, # dense "fp8_tflops": 1979, # dense "tdp_watts": 700, "nvlink_gen": 4, "nvlink_bandwidth_gbps": 900, # bidirectional "pcie_gen": 5, "min_driver_version": "535", "min_cuda_version": "12.1", }, "h200": { "full_name": "NVIDIA H200 SXM", "architecture": "Hopper", "compute_capability": 9.0, "hbm_capacity_gb": 141, "hbm_type": "HBM3e", "memory_bandwidth_gbps": 4800, # GB/s (4.8 TB/s) — THIS IS THE CORRECT VALUE, NOT 989! "fp32_tflops": 67, "tf32_tflops": 495, # dense "fp16_tflops": 990, # dense "bf16_tflops": 990, # dense "fp8_tflops": 1979, # dense "tdp_watts": 700, "nvlink_gen": 4, "nvlink_bandwidth_gbps": 900, "pcie_gen": 5, "min_driver_version": "535", "min_cuda_version": "12.1", }, "h20": { # China-compliance export variant of H200 (reported as "H20" / "H20-3e" by nvidia-smi). # Same silicon family / HBM as H200, but Tensor Core peaks are throttled. # Peaks below are sourced from supplier / NVIDIA China and confirmed against # measured throughput on 8x H20-3e (FP16 ~741, BF16 ~770, FP8 ~1328 TFLOPS). "full_name": "NVIDIA H20 / H20-3e", "architecture": "Hopper", "compute_capability": 9.0, "hbm_capacity_gb": 141, "hbm_type": "HBM3e", "memory_bandwidth_gbps": 4800, "fp32_tflops": 54, # China spec (matches measured ~51-52) "tf32_tflops": 372, # ~75% of H200 (matches measured ~362) "fp16_tflops": 744, # dense, China spec "bf16_tflops": 739, # dense, China spec "fp8_tflops": 1420, # dense, China spec "tdp_watts": 700, "nvlink_gen": 4, "nvlink_bandwidth_gbps": 900, "pcie_gen": 5, "min_driver_version": "535", "min_cuda_version": "12.1", }, "b200": { "full_name": "NVIDIA B200 SXM", "architecture": "Blackwell", "compute_capability": 10.0, "hbm_capacity_gb": 180, "hbm_type": "HBM3e", "memory_bandwidth_gbps": 8000, # GB/s (8 TB/s) "fp32_tflops": 90, "tf32_tflops": 1125, # dense "fp16_tflops": 2250, # dense "bf16_tflops": 2250, # dense "fp8_tflops": 4500, # dense "tdp_watts": 1000, "nvlink_gen": 5, "nvlink_bandwidth_gbps": 1800, "pcie_gen": 5, "min_driver_version": "550", "min_cuda_version": "12.4", }, "a100": { "full_name": "NVIDIA A100 SXM", "architecture": "Ampere", "compute_capability": 8.0, "hbm_capacity_gb": 80, "hbm_type": "HBM2e", "memory_bandwidth_gbps": 2039, # GB/s (~2.0 TB/s) "fp32_tflops": 19.5, "tf32_tflops": 156, # dense "fp16_tflops": 312, # dense "bf16_tflops": 312, # dense "fp8_tflops": 0, # Ampere has no FP8 "tdp_watts": 400, "nvlink_gen": 3, "nvlink_bandwidth_gbps": 600, # bidirectional "pcie_gen": 4, "min_driver_version": "470", "min_cuda_version": "11.0", }, "a800": { "full_name": "NVIDIA A800 SXM", "architecture": "Ampere", "compute_capability": 8.0, "hbm_capacity_gb": 80, "hbm_type": "HBM2e", "memory_bandwidth_gbps": 2039, # GB/s (~2.0 TB/s) "fp32_tflops": 19.5, "tf32_tflops": 156, # dense "fp16_tflops": 312, # dense "bf16_tflops": 312, # dense "fp8_tflops": 0, # Ampere has no FP8 "tdp_watts": 400, "nvlink_gen": 3, "nvlink_bandwidth_gbps": 600, # bidirectional (NVLink 3, limited vs A100) "pcie_gen": 4, "min_driver_version": "470", "min_cuda_version": "11.0", }, "b300": { "full_name": "NVIDIA B300 SXM (Blackwell Ultra)", "architecture": "Blackwell Ultra", "compute_capability": 10.0, "hbm_capacity_gb": 288, "hbm_type": "HBM3e", "memory_bandwidth_gbps": 8000, # GB/s (8 TB/s) "fp32_tflops": 125, "tf32_tflops": 1750, # dense (estimated) "fp16_tflops": 3500, # dense "bf16_tflops": 3500, # dense "fp8_tflops": 7000, # dense "tdp_watts": 1200, "nvlink_gen": 5, "nvlink_bandwidth_gbps": 1800, "pcie_gen": 5, "min_driver_version": "550", "min_cuda_version": "12.4", }, } # Fallback for unknown / unsupported GPUs _UNKNOWN_SPECS = { "full_name": "Unknown GPU", "architecture": "unknown", "compute_capability": 0.0, "hbm_capacity_gb": 0, "hbm_type": "unknown", "memory_bandwidth_gbps": 0, "fp32_tflops": 0, "tf32_tflops": 0, "fp16_tflops": 0, "bf16_tflops": 0, "fp8_tflops": 0, "tdp_watts": 700, "nvlink_gen": 0, "nvlink_bandwidth_gbps": 0, "pcie_gen": 0, "min_driver_version": "", "min_cuda_version": "", } def detect_gpu_type() -> str: """Detect GPU type via nvidia-smi and return the internal key (e.g. 'h200'). Returns 'unknown' if nvidia-smi is unavailable or the GPU is not recognized. """ nvidia_smi = shutil.which("nvidia-smi") if not nvidia_smi: return "unknown" try: r = subprocess.run( [nvidia_smi, "--query-gpu=name", "--format=csv,noheader"], capture_output=True, text=True, timeout=10, ) if r.returncode != 0: return "unknown" first_line = r.stdout.strip().splitlines()[0].strip().upper() # Iterate longest-pattern-first so "H200" doesn't get matched by "H20". for pattern, key in sorted(GPU_NAME_PATTERNS.items(), key=lambda kv: -len(kv[0])): if pattern in first_line: return key return "unknown" except (subprocess.TimeoutExpired, FileNotFoundError, OSError): return "unknown" def get_gpu_specs(gpu_type: str = None) -> dict: """Return specs dict for the given gpu_type, auto-detecting if None. Returns a minimal 'unknown' fallback dict with zero peaks for unsupported GPUs. """ if gpu_type is None: gpu_type = detect_gpu_type() return GPU_SPECS.get(gpu_type, dict(_UNKNOWN_SPECS)) def get_supported_gpus() -> list: """Return list of supported GPU type keys.""" return list(GPU_SPECS.keys()) def get_gpu_label(gpu_type: str) -> str: """Return a short human-readable label like 'H200 SXM' for display in tables.""" specs = GPU_SPECS.get(gpu_type) if specs: full = specs["full_name"] # Strip the "NVIDIA " prefix for display return full.replace("NVIDIA ", "") return "Unknown GPU" # --------------------------------------------------------------------------- # Tools path resolution # --------------------------------------------------------------------------- def resolve_tools_dir(config: dict) -> str: """Resolve tools installation directory with smart fallback. Priority: GPU_TOOLS_DIR env > config value > /opt/gpu-test-tools > /tmp/gpu-test-tools """ # 1. Env var override env_dir = os.environ.get("GPU_TOOLS_DIR") if env_dir: return env_dir # 2. Config value if explicitly set cfg_dir = config.get("tools", {}).get("install_dir", "") if cfg_dir: return cfg_dir # 3. /opt/gpu-test-tools if it already exists or /opt is writable default = "/opt/gpu-test-tools" if os.path.isdir(default) or os.access("/opt", os.W_OK): return default # 4. Fallback to /tmp return "/tmp/gpu-test-tools" # --------------------------------------------------------------------------- # Driver / CUDA compatibility validation # --------------------------------------------------------------------------- def _query_nvidia_smi(field: str) -> Optional[str]: """Query a single nvidia-smi field, return value string or None.""" nvidia_smi = shutil.which("nvidia-smi") if not nvidia_smi: return None try: r = subprocess.run( [nvidia_smi, f"--query-gpu={field}", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=10, ) if r.returncode == 0 and r.stdout.strip(): return r.stdout.strip().splitlines()[0].strip() except (subprocess.TimeoutExpired, FileNotFoundError, OSError): pass return None def _version_lt(actual: str, minimum: str) -> bool: """Return True if actual version < minimum (numeric dotted comparison).""" def to_tuple(v: str): parts = [] for p in v.split("."): try: parts.append(int(p)) except ValueError: break return tuple(parts) if parts else (0,) return to_tuple(actual) < to_tuple(minimum) def validate_driver_compatibility(gpu_type: str) -> List[str]: """Check if current driver/CUDA meets minimum requirements for the detected GPU. Returns a list of warning strings (empty if everything is fine). """ specs = get_gpu_specs(gpu_type) warnings: List[str] = [] min_driver = specs.get("min_driver_version", "") min_cuda = specs.get("min_cuda_version", "") if not min_driver and not min_cuda: return warnings actual_driver = _query_nvidia_smi("driver_version") # nvidia-smi reports the highest CUDA version supported by the driver actual_cuda = _query_nvidia_smi("cuda_version") gpu_label = get_gpu_label(gpu_type) if actual_driver and min_driver and _version_lt(actual_driver, min_driver): warnings.append( f"Driver {actual_driver} < minimum {min_driver} required for {gpu_label}" ) if actual_cuda and min_cuda and _version_lt(actual_cuda, min_cuda): warnings.append( f"CUDA {actual_cuda} < minimum {min_cuda} required for {gpu_label}" ) return warnings