refactor: remove hardcoding, fix AMP bug, unify English output

- Fix AMP autocast: bf16 now uses torch.amp.autocast (was skipped) - Fix NCCL threshold: unknown GPU gets 10 GB/s floor instead of 0 - Fix PCIe health check: use specs-driven pcie_gen, not hardcoded Gen4 - Remove hardcoded GPU lists: dynamic banner, CLI choices, version - Unknown GPU efficiency displays N/A instead of 0% - Unify all console output to English (stress_test, gpu_tester) - Use importlib.metadata for runtime version resolution - Remove dir="/tmp" from tempfile (use system default) 🤖 Generated with [Qoder][https://qoder.com]
2026-05-07 21:32:35 +08:00 · 2026-05-07 21:32:35 +08:00 · fefef8e03b
commit fefef8e03b
parent f2158f6cd3
8 changed files with 75 additions and 56 deletions
--- a/gpu_tester.py
+++ b/gpu_tester.py
@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""GPU Training Server Test Suite (A100/A800/H100/H200/B200/B300) - Main CLI Entry Point."""
+"""GPU Training Server Test Suite - Main CLI Entry Point."""

 import argparse
 import json
@ -74,17 +74,19 @@ DEFAULT_CONFIG = {
    "tools": {"install_dir": "/opt/gpu-test-tools"},
 }

-BANNER = r"""
-[bold cyan]
-╔══════════════════════════════════════════════════════╗
-║                                                      ║
-║       GPU Training Server Test Suite                 ║
-║       Diagnostics & Benchmarking Tool                ║
-║       Supports: A100 / A800 / H100 / H200 / B200 / B300  ║
-║                                                      ║
-╚══════════════════════════════════════════════════════════╝
-[/bold cyan]
-"""
+def _build_banner() -> str:
+    gpu_list = " / ".join(g.upper() for g in get_supported_gpus())
+    return (
+        "[bold cyan]\n"
+        "╔══════════════════════════════════════════════════════════╗\n"
+        "║                                                          ║\n"
+        "║       GPU Training Server Test Suite                     ║\n"
+        "║       Diagnostics & Benchmarking Tool                    ║\n"
+        f"║       Supports: {gpu_list:<40s} ║\n"
+        "║                                                          ║\n"
+        "╚══════════════════════════════════════════════════════════╝\n"
+        "[/bold cyan]"
+    )


 def load_config() -> dict:
@ -112,7 +114,7 @@ def interactive_menu(config: dict):
    """Run interactive menu loop."""
    console = Console()

-    console.print(BANNER)
+    console.print(_build_banner())

    gpu_type = detect_gpu_type()
    gpu_label = get_gpu_label(gpu_type)
@ -310,7 +312,7 @@ def _run_full_suite(config: dict, console: Console) -> dict:

    # Summary
    console.print("\n" + "=" * 60)
-    # 只统计测试结果，排除 timestamp 等元数据
+    # Only count test results, exclude metadata like timestamp
    test_results = {k: v for k, v in all_results.items() if k != "timestamp"}
    passed = sum(1 for v in test_results.values() if not isinstance(v, dict) or "error" not in v)
    total = len(test_results)
@ -320,8 +322,9 @@ def _run_full_suite(config: dict, console: Console) -> dict:


 def main():
+    gpu_list_str = " / ".join(g.upper() for g in get_supported_gpus())
    parser = argparse.ArgumentParser(
-        description="GPU Training Server Test Suite (A100/A800/H100/H200/B200/B300)",
+        description=f"GPU Training Server Test Suite ({gpu_list_str})",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
 Examples:
@ -349,7 +352,7 @@ Examples:
    parser.add_argument("--config", default=None, help="Path to config YAML file")
    parser.add_argument(
        "--gpu-type",
-        choices=["auto", "a100", "a800", "h100", "h200", "b200", "b300"],
+        choices=["auto"] + get_supported_gpus(),
        default="auto",
        help="Override GPU type detection",
    )
--- a/modules/benchmark.py
+++ b/modules/benchmark.py
@ -151,13 +151,13 @@ class Benchmark:
        # (nvlink_bandwidth_gbps is bidirectional, so per-direction = /2)
        nvlink_bw = self.specs.get("nvlink_bandwidth_gbps", 0)
        d2d_peak = nvlink_bw / 2 if nvlink_bw else 0
-        d2d_efficiency = (d2d_bw / d2d_peak) * 100 if (d2d_bw and d2d_peak) else 0
+        d2d_efficiency = round((d2d_bw / d2d_peak) * 100, 1) if (d2d_bw and d2d_peak) else None

        # H2D/D2H goes through PCIe — estimate peak from PCIe gen
-        pcie_gen = self.specs.get("pcie_gen", 4)
-        pcie_peak = {3: 16, 4: 32, 5: 64, 6: 128}.get(pcie_gen, 32)  # GB/s x16
-        h2d_efficiency = (h2d_bw / pcie_peak) * 100 if (h2d_bw and pcie_peak) else 0
-        d2h_efficiency = (d2h_bw / pcie_peak) * 100 if (d2h_bw and pcie_peak) else 0
+        pcie_gen = self.specs.get("pcie_gen", 0)
+        pcie_peak = {3: 16, 4: 32, 5: 64, 6: 128}.get(pcie_gen, 32) if pcie_gen > 0 else 0  # GB/s x16
+        h2d_efficiency = round((h2d_bw / pcie_peak) * 100, 1) if (h2d_bw and pcie_peak) else None
+        d2h_efficiency = round((d2h_bw / pcie_peak) * 100, 1) if (d2h_bw and pcie_peak) else None

        return {
            "memory": {
@ -165,14 +165,14 @@ class Benchmark:
                "h2d_bandwidth_gbps": round(h2d_bw, 1),
                "d2h_bandwidth_gbps": round(d2h_bw, 1),
                "d2d_bandwidth_gbps": round(d2d_bw, 1),
-                "h2d_peak_gbps": pcie_peak,
-                "d2h_peak_gbps": pcie_peak,
-                "d2d_peak_gbps": round(d2d_peak, 1),
-                "h2d_efficiency_pct": round(h2d_efficiency, 1),
-                "d2h_efficiency_pct": round(d2h_efficiency, 1),
-                "d2d_efficiency_pct": round(d2d_efficiency, 1),
+                "h2d_peak_gbps": pcie_peak if pcie_peak else None,
+                "d2h_peak_gbps": pcie_peak if pcie_peak else None,
+                "d2d_peak_gbps": round(d2d_peak, 1) if d2d_peak else None,
+                "h2d_efficiency_pct": h2d_efficiency,
+                "d2h_efficiency_pct": d2h_efficiency,
+                "d2d_efficiency_pct": d2d_efficiency,
                "peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"],
-                "efficiency_pct": round(d2d_efficiency, 1),
+                "efficiency_pct": d2d_efficiency,
                "results_by_test": results_by_test,
                "per_gpu": [],
            }
@ -276,7 +276,7 @@ class Benchmark:

        best_d2d = max(v["d2d_gbps"] for v in bandwidth_by_size.values())
        peak_bw = self.specs["memory_bandwidth_gbps"]
-        efficiency = (best_d2d / peak_bw) * 100 if peak_bw else 0.0
+        efficiency = round((best_d2d / peak_bw) * 100, 1) if peak_bw else None

        return {
            "memory": {
@ -285,7 +285,7 @@ class Benchmark:
                "d2h_bandwidth_gbps": round(max(v["d2h_gbps"] for v in bandwidth_by_size.values()), 1),
                "d2d_bandwidth_gbps": round(best_d2d, 1),
                "peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"],
-                "efficiency_pct": round(efficiency, 1),
+                "efficiency_pct": efficiency,
                "test_sizes_mb": test_sizes_mb,
                "bandwidth_by_size": bandwidth_by_size,
                "per_gpu": [],
--- a/modules/gpu_specs.py
+++ b/modules/gpu_specs.py
@ -1,4 +1,4 @@
-"""GPU specifications database for NVIDIA datacenter GPUs (A100/A800/H100/H200/B200/B300)."""
+"""GPU specifications database for NVIDIA datacenter GPUs."""

 import os
 import shutil
--- a/modules/health_check.py
+++ b/modules/health_check.py
@ -1,4 +1,4 @@
-"""Hardware health monitoring module for NVIDIA datacenter GPUs (A100/A800/H100/H200/B200/B300)."""
+"""Hardware health monitoring module for NVIDIA datacenter GPUs."""

 import subprocess
 import shutil
@ -115,7 +115,11 @@ class HealthCheck:

            pg = self._safe_int(pcie_gens[i] if i < len(pcie_gens) else 0)
            pw = self._safe_int(pcie_widths[i] if i < len(pcie_widths) else 0)
-            pcie_ok = pg >= 4 and pw >= 8
+            expected_gen = self.specs.get("pcie_gen", 0)
+            if expected_gen > 0:
+                pcie_ok = pg >= expected_gen and pw >= 16
+            else:
+                pcie_ok = pw >= 8  # unknown GPU: just check width
            if not pcie_ok:
                overall_pass = False
            checks["pcie_link"] = {"gen": pg, "width": pw, "status": "PASS" if pcie_ok else "WARN"}
--- a/modules/nccl_test.py
+++ b/modules/nccl_test.py
@ -79,9 +79,17 @@ class NCCLTest:
        if self.nccl_cfg.get("test_sendrecv", False):
            tests.append(("sendrecv_perf", "SendRecv"))

-        default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4
+        nvlink_bw = self.specs.get("nvlink_bandwidth_gbps", 0)
+        if nvlink_bw > 0:
+            default_min_bw = nvlink_bw * 0.4
+        else:
+            # Conservative floor: any working NVLink should exceed 10 GB/s
+            default_min_bw = 10
        min_bw = self.nccl_cfg.get("min_bandwidth_gbps") or round(default_min_bw)

+        if self.gpu_type == "unknown":
+            self.console.print("[yellow]Unknown GPU — using conservative bandwidth thresholds[/yellow]")
+
        # Strategy: try nccl-tests binary directly (single-node, -g N),
        # then mpirun, then torchrun fallback
        results = {}
@ -317,7 +325,7 @@ except Exception as e:
 dist.destroy_process_group()
 """
        import tempfile
-        tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, dir="/tmp")
+        tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False)
        tmp.write(code)
        tmp.close()

--- a/modules/report.py
+++ b/modules/report.py
@ -6,6 +6,12 @@ from datetime import datetime
 from pathlib import Path
 from typing import Optional

+try:
+    from importlib.metadata import version as _pkg_version
+    __version__ = _pkg_version("gpu-server-test-suite")
+except Exception:
+    __version__ = "0.2.0"
+
 from rich.console import Console
 from rich.panel import Panel

@ -368,7 +374,7 @@ class ReportGenerator:

        # --- Footer ---
        lines.append("---")
-        lines.append(f"*Generated by GPU Test Suite v0.2.0*")
+        lines.append(f"*Generated by GPU Test Suite v{__version__}*")

        content = "\n".join(lines)
        with open(output, "w") as f:
--- a/modules/stress_test.py
+++ b/modules/stress_test.py
@ -49,13 +49,13 @@ class StressTest:
        gpu_burn = self._find_gpu_burn()

        if gpu_burn:
-            # 尝试使用 gpu-burn
+            # Try gpu-burn first
            result = self._run_gpu_burn(gpu_burn, duration_sec, use_doubles, use_tensor_cores, target_gpus)
            
-            # 如果 gpu-burn 失败（例如显存不足），自动 fallback 到 PyTorch
+            # If gpu-burn fails (e.g. OOM), auto-fallback to PyTorch
            if not result.get("passed") and result.get("elapsed_sec", 0) < duration_sec * 0.5:
-                self.console.print("\n[yellow]gpu-burn 提前退出（可能显存不足），自动切换到 PyTorch 压力测试[/yellow]")
-                self.console.print("[dim]PyTorch 模式会根据实际可用显存动态调整，更稳定[/dim]\n")
+                self.console.print("\n[yellow]gpu-burn exited early (possible OOM), switching to PyTorch stress test[/yellow]")
+                self.console.print("[dim]PyTorch mode dynamically adapts to available memory[/dim]\n")
                return self._run_pytorch_stress(duration_sec, memory_pct)
            
            return result
@ -134,18 +134,16 @@ class StressTest:
            tensors = {}
            for i in range(gpu_count):
                with torch.cuda.device(i):
-                    # 获取实际可用显存（考虑其他进程已占用的部分）
+                    # Get actual free memory (accounting for other processes)
                    free_mem, total_mem = torch.cuda.mem_get_info(i)
                    
-                    # 根据配置的 memory_pct 计算分配大小
-                    # 例如：memory_pct=90 表示使用总显存的 90%
+                    # Calculate allocation from configured memory_pct
                    target_mem = int(total_mem * memory_pct / 100)
                    
-                    # 但不能超过实际可用显存（留出 5% 安全余量）
+                    # Cap at actual free memory with 5% safety margin
                    alloc_bytes = min(target_mem, int(free_mem * 0.95))
                    
-                    # matmul(A, A.T) 需要 2x 输入显存（输入 + 输出）
-                    # 所以分配 sqrt(alloc_bytes/4/2) 大小的方阵
+                    # matmul(A, A.T) needs 2x input memory (input + output)
                    side = int((alloc_bytes / 4 / 2) ** 0.5)  # float32 = 4 bytes
                    
                    actual_mem_mb = side * side * 4 / 1024 / 1024
@ -153,13 +151,13 @@ class StressTest:
                    free_mem_mb = free_mem / 1024 / 1024
                    
                    self.console.print(
-                        f"  [dim]GPU {i}: 总显存 {total_mem_mb:.0f}MB, 可用 {free_mem_mb:.0f}MB, "
-                        f"分配 {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - "
-                        f"矩阵 {side}x{side}[/dim]"
+                        f"  [dim]GPU {i}: total {total_mem_mb:.0f}MB, free {free_mem_mb:.0f}MB, "
+                        f"alloc {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - "
+                        f"matrix {side}x{side}[/dim]"
                    )
                    tensors[i] = torch.randn(side, side, device=f"cuda:{i}", dtype=torch.float32)

-            self.console.print(f"\n[cyan]开始压力测试，持续 {duration} 秒...[/cyan]")
+            self.console.print(f"\n[cyan]Starting stress test for {duration} seconds...[/cyan]")
            
            elapsed_check = 0
            while time.time() - t0 < duration:
@ -169,10 +167,10 @@ class StressTest:
                        torch.cuda.synchronize()
                time.sleep(0.1)
                
-                # 每 10 秒显示一次进度
+                # Show progress every 10 seconds
                current_elapsed = time.time() - t0
                if int(current_elapsed) != int(elapsed_check) and int(current_elapsed) % 10 == 0:
-                    self.console.print(f"  [dim]已运行 {int(current_elapsed)}s / {duration}s[/dim]")
+                    self.console.print(f"  [dim]Running {int(current_elapsed)}s / {duration}s[/dim]")
                    elapsed_check = current_elapsed

            for i in range(gpu_count):
@ -180,7 +178,7 @@ class StressTest:

        except RuntimeError as e:
            error_msg = str(e)
-            self.console.print(f"\n[red]压力测试出错: {error_msg}[/red]")
+            self.console.print(f"\n[red]Stress test error: {error_msg}[/red]")
            for i in range(gpu_count):
                if i not in gpu_status:
                    gpu_status[i] = "FAIL"
--- a/modules/training_sim.py
+++ b/modules/training_sim.py
@ -77,7 +77,7 @@ class TrainingSim:
            optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

            if dtype in (torch.float16, torch.bfloat16):
-                scaler = torch.cuda.amp.GradScaler(enabled=(dtype == torch.float16))
+                scaler = torch.amp.GradScaler("cuda", enabled=(dtype == torch.float16))

            step_times = []
            mem_usage = []
@ -96,8 +96,8 @@ class TrainingSim:
                    input_ids = input_ids.to(model.device)
                    attention_mask = attention_mask.to(model.device)

-                    if dtype in (torch.float16, torch.bfloat16) and dtype != torch.bfloat16:
-                        with torch.cuda.amp.autocast(dtype=dtype):
+                    if dtype in (torch.float16, torch.bfloat16):
+                        with torch.amp.autocast("cuda", dtype=dtype):
                            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
                            loss = outputs.loss
                    else: