From fefef8e03b9b94fcd2f881afbf8bb072c69d6cf4 Mon Sep 17 00:00:00 2001 From: qinyusen Date: Thu, 7 May 2026 21:32:35 +0800 Subject: [PATCH] refactor: remove hardcoding, fix AMP bug, unify English output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix AMP autocast: bf16 now uses torch.amp.autocast (was skipped) - Fix NCCL threshold: unknown GPU gets 10 GB/s floor instead of 0 - Fix PCIe health check: use specs-driven pcie_gen, not hardcoded Gen4 - Remove hardcoded GPU lists: dynamic banner, CLI choices, version - Unknown GPU efficiency displays N/A instead of 0% - Unify all console output to English (stress_test, gpu_tester) - Use importlib.metadata for runtime version resolution - Remove dir="/tmp" from tempfile (use system default) ๐Ÿค– Generated with [Qoder][https://qoder.com] --- gpu_tester.py | 35 +++++++++++++++++++---------------- modules/benchmark.py | 28 ++++++++++++++-------------- modules/gpu_specs.py | 2 +- modules/health_check.py | 8 ++++++-- modules/nccl_test.py | 12 ++++++++++-- modules/report.py | 8 +++++++- modules/stress_test.py | 32 +++++++++++++++----------------- modules/training_sim.py | 6 +++--- 8 files changed, 75 insertions(+), 56 deletions(-) diff --git a/gpu_tester.py b/gpu_tester.py index b2b6851..4cfa47c 100644 --- a/gpu_tester.py +++ b/gpu_tester.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""GPU Training Server Test Suite (A100/A800/H100/H200/B200/B300) - Main CLI Entry Point.""" +"""GPU Training Server Test Suite - Main CLI Entry Point.""" import argparse import json @@ -74,17 +74,19 @@ DEFAULT_CONFIG = { "tools": {"install_dir": "/opt/gpu-test-tools"}, } -BANNER = r""" -[bold cyan] -โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•— -โ•‘ โ•‘ -โ•‘ GPU Training Server Test Suite โ•‘ -โ•‘ Diagnostics & Benchmarking Tool โ•‘ -โ•‘ Supports: A100 / A800 / H100 / H200 / B200 / B300 โ•‘ -โ•‘ โ•‘ -โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• -[/bold cyan] -""" +def _build_banner() -> str: + gpu_list = " / ".join(g.upper() for g in get_supported_gpus()) + return ( + "[bold cyan]\n" + "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—\n" + "โ•‘ โ•‘\n" + "โ•‘ GPU Training Server Test Suite โ•‘\n" + "โ•‘ Diagnostics & Benchmarking Tool โ•‘\n" + f"โ•‘ Supports: {gpu_list:<40s} โ•‘\n" + "โ•‘ โ•‘\n" + "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•\n" + "[/bold cyan]" + ) def load_config() -> dict: @@ -112,7 +114,7 @@ def interactive_menu(config: dict): """Run interactive menu loop.""" console = Console() - console.print(BANNER) + console.print(_build_banner()) gpu_type = detect_gpu_type() gpu_label = get_gpu_label(gpu_type) @@ -310,7 +312,7 @@ def _run_full_suite(config: dict, console: Console) -> dict: # Summary console.print("\n" + "=" * 60) - # ๅช็ปŸ่ฎกๆต‹่ฏ•็ป“ๆžœ๏ผŒๆŽ’้™ค timestamp ็ญ‰ๅ…ƒๆ•ฐๆฎ + # Only count test results, exclude metadata like timestamp test_results = {k: v for k, v in all_results.items() if k != "timestamp"} passed = sum(1 for v in test_results.values() if not isinstance(v, dict) or "error" not in v) total = len(test_results) @@ -320,8 +322,9 @@ def _run_full_suite(config: dict, console: Console) -> dict: def main(): + gpu_list_str = " / ".join(g.upper() for g in get_supported_gpus()) parser = argparse.ArgumentParser( - description="GPU Training Server Test Suite (A100/A800/H100/H200/B200/B300)", + description=f"GPU Training Server Test Suite ({gpu_list_str})", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: @@ -349,7 +352,7 @@ Examples: parser.add_argument("--config", default=None, help="Path to config YAML file") parser.add_argument( "--gpu-type", - choices=["auto", "a100", "a800", "h100", "h200", "b200", "b300"], + choices=["auto"] + get_supported_gpus(), default="auto", help="Override GPU type detection", ) diff --git a/modules/benchmark.py b/modules/benchmark.py index a87d018..dce8b6c 100644 --- a/modules/benchmark.py +++ b/modules/benchmark.py @@ -151,13 +151,13 @@ class Benchmark: # (nvlink_bandwidth_gbps is bidirectional, so per-direction = /2) nvlink_bw = self.specs.get("nvlink_bandwidth_gbps", 0) d2d_peak = nvlink_bw / 2 if nvlink_bw else 0 - d2d_efficiency = (d2d_bw / d2d_peak) * 100 if (d2d_bw and d2d_peak) else 0 + d2d_efficiency = round((d2d_bw / d2d_peak) * 100, 1) if (d2d_bw and d2d_peak) else None # H2D/D2H goes through PCIe โ€” estimate peak from PCIe gen - pcie_gen = self.specs.get("pcie_gen", 4) - pcie_peak = {3: 16, 4: 32, 5: 64, 6: 128}.get(pcie_gen, 32) # GB/s x16 - h2d_efficiency = (h2d_bw / pcie_peak) * 100 if (h2d_bw and pcie_peak) else 0 - d2h_efficiency = (d2h_bw / pcie_peak) * 100 if (d2h_bw and pcie_peak) else 0 + pcie_gen = self.specs.get("pcie_gen", 0) + pcie_peak = {3: 16, 4: 32, 5: 64, 6: 128}.get(pcie_gen, 32) if pcie_gen > 0 else 0 # GB/s x16 + h2d_efficiency = round((h2d_bw / pcie_peak) * 100, 1) if (h2d_bw and pcie_peak) else None + d2h_efficiency = round((d2h_bw / pcie_peak) * 100, 1) if (d2h_bw and pcie_peak) else None return { "memory": { @@ -165,14 +165,14 @@ class Benchmark: "h2d_bandwidth_gbps": round(h2d_bw, 1), "d2h_bandwidth_gbps": round(d2h_bw, 1), "d2d_bandwidth_gbps": round(d2d_bw, 1), - "h2d_peak_gbps": pcie_peak, - "d2h_peak_gbps": pcie_peak, - "d2d_peak_gbps": round(d2d_peak, 1), - "h2d_efficiency_pct": round(h2d_efficiency, 1), - "d2h_efficiency_pct": round(d2h_efficiency, 1), - "d2d_efficiency_pct": round(d2d_efficiency, 1), + "h2d_peak_gbps": pcie_peak if pcie_peak else None, + "d2h_peak_gbps": pcie_peak if pcie_peak else None, + "d2d_peak_gbps": round(d2d_peak, 1) if d2d_peak else None, + "h2d_efficiency_pct": h2d_efficiency, + "d2h_efficiency_pct": d2h_efficiency, + "d2d_efficiency_pct": d2d_efficiency, "peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"], - "efficiency_pct": round(d2d_efficiency, 1), + "efficiency_pct": d2d_efficiency, "results_by_test": results_by_test, "per_gpu": [], } @@ -276,7 +276,7 @@ class Benchmark: best_d2d = max(v["d2d_gbps"] for v in bandwidth_by_size.values()) peak_bw = self.specs["memory_bandwidth_gbps"] - efficiency = (best_d2d / peak_bw) * 100 if peak_bw else 0.0 + efficiency = round((best_d2d / peak_bw) * 100, 1) if peak_bw else None return { "memory": { @@ -285,7 +285,7 @@ class Benchmark: "d2h_bandwidth_gbps": round(max(v["d2h_gbps"] for v in bandwidth_by_size.values()), 1), "d2d_bandwidth_gbps": round(best_d2d, 1), "peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"], - "efficiency_pct": round(efficiency, 1), + "efficiency_pct": efficiency, "test_sizes_mb": test_sizes_mb, "bandwidth_by_size": bandwidth_by_size, "per_gpu": [], diff --git a/modules/gpu_specs.py b/modules/gpu_specs.py index 4a0190e..f746b84 100644 --- a/modules/gpu_specs.py +++ b/modules/gpu_specs.py @@ -1,4 +1,4 @@ -"""GPU specifications database for NVIDIA datacenter GPUs (A100/A800/H100/H200/B200/B300).""" +"""GPU specifications database for NVIDIA datacenter GPUs.""" import os import shutil diff --git a/modules/health_check.py b/modules/health_check.py index 24c3294..dd64071 100644 --- a/modules/health_check.py +++ b/modules/health_check.py @@ -1,4 +1,4 @@ -"""Hardware health monitoring module for NVIDIA datacenter GPUs (A100/A800/H100/H200/B200/B300).""" +"""Hardware health monitoring module for NVIDIA datacenter GPUs.""" import subprocess import shutil @@ -115,7 +115,11 @@ class HealthCheck: pg = self._safe_int(pcie_gens[i] if i < len(pcie_gens) else 0) pw = self._safe_int(pcie_widths[i] if i < len(pcie_widths) else 0) - pcie_ok = pg >= 4 and pw >= 8 + expected_gen = self.specs.get("pcie_gen", 0) + if expected_gen > 0: + pcie_ok = pg >= expected_gen and pw >= 16 + else: + pcie_ok = pw >= 8 # unknown GPU: just check width if not pcie_ok: overall_pass = False checks["pcie_link"] = {"gen": pg, "width": pw, "status": "PASS" if pcie_ok else "WARN"} diff --git a/modules/nccl_test.py b/modules/nccl_test.py index a513b80..77ab2bd 100644 --- a/modules/nccl_test.py +++ b/modules/nccl_test.py @@ -79,9 +79,17 @@ class NCCLTest: if self.nccl_cfg.get("test_sendrecv", False): tests.append(("sendrecv_perf", "SendRecv")) - default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4 + nvlink_bw = self.specs.get("nvlink_bandwidth_gbps", 0) + if nvlink_bw > 0: + default_min_bw = nvlink_bw * 0.4 + else: + # Conservative floor: any working NVLink should exceed 10 GB/s + default_min_bw = 10 min_bw = self.nccl_cfg.get("min_bandwidth_gbps") or round(default_min_bw) + if self.gpu_type == "unknown": + self.console.print("[yellow]Unknown GPU โ€” using conservative bandwidth thresholds[/yellow]") + # Strategy: try nccl-tests binary directly (single-node, -g N), # then mpirun, then torchrun fallback results = {} @@ -317,7 +325,7 @@ except Exception as e: dist.destroy_process_group() """ import tempfile - tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False, dir="/tmp") + tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) tmp.write(code) tmp.close() diff --git a/modules/report.py b/modules/report.py index 11e335b..9278eda 100644 --- a/modules/report.py +++ b/modules/report.py @@ -6,6 +6,12 @@ from datetime import datetime from pathlib import Path from typing import Optional +try: + from importlib.metadata import version as _pkg_version + __version__ = _pkg_version("gpu-server-test-suite") +except Exception: + __version__ = "0.2.0" + from rich.console import Console from rich.panel import Panel @@ -368,7 +374,7 @@ class ReportGenerator: # --- Footer --- lines.append("---") - lines.append(f"*Generated by GPU Test Suite v0.2.0*") + lines.append(f"*Generated by GPU Test Suite v{__version__}*") content = "\n".join(lines) with open(output, "w") as f: diff --git a/modules/stress_test.py b/modules/stress_test.py index 02647e1..892f95a 100644 --- a/modules/stress_test.py +++ b/modules/stress_test.py @@ -49,13 +49,13 @@ class StressTest: gpu_burn = self._find_gpu_burn() if gpu_burn: - # ๅฐ่ฏ•ไฝฟ็”จ gpu-burn + # Try gpu-burn first result = self._run_gpu_burn(gpu_burn, duration_sec, use_doubles, use_tensor_cores, target_gpus) - # ๅฆ‚ๆžœ gpu-burn ๅคฑ่ดฅ๏ผˆไพ‹ๅฆ‚ๆ˜พๅญ˜ไธ่ถณ๏ผ‰๏ผŒ่‡ชๅŠจ fallback ๅˆฐ PyTorch + # If gpu-burn fails (e.g. OOM), auto-fallback to PyTorch if not result.get("passed") and result.get("elapsed_sec", 0) < duration_sec * 0.5: - self.console.print("\n[yellow]gpu-burn ๆๅ‰้€€ๅ‡บ๏ผˆๅฏ่ƒฝๆ˜พๅญ˜ไธ่ถณ๏ผ‰๏ผŒ่‡ชๅŠจๅˆ‡ๆขๅˆฐ PyTorch ๅŽ‹ๅŠ›ๆต‹่ฏ•[/yellow]") - self.console.print("[dim]PyTorch ๆจกๅผไผšๆ นๆฎๅฎž้™…ๅฏ็”จๆ˜พๅญ˜ๅŠจๆ€่ฐƒๆ•ด๏ผŒๆ›ด็จณๅฎš[/dim]\n") + self.console.print("\n[yellow]gpu-burn exited early (possible OOM), switching to PyTorch stress test[/yellow]") + self.console.print("[dim]PyTorch mode dynamically adapts to available memory[/dim]\n") return self._run_pytorch_stress(duration_sec, memory_pct) return result @@ -134,18 +134,16 @@ class StressTest: tensors = {} for i in range(gpu_count): with torch.cuda.device(i): - # ่Žทๅ–ๅฎž้™…ๅฏ็”จๆ˜พๅญ˜๏ผˆ่€ƒ่™‘ๅ…ถไป–่ฟ›็จ‹ๅทฒๅ ็”จ็š„้ƒจๅˆ†๏ผ‰ + # Get actual free memory (accounting for other processes) free_mem, total_mem = torch.cuda.mem_get_info(i) - # ๆ นๆฎ้…็ฝฎ็š„ memory_pct ่ฎก็ฎ—ๅˆ†้…ๅคงๅฐ - # ไพ‹ๅฆ‚๏ผšmemory_pct=90 ่กจ็คบไฝฟ็”จๆ€ปๆ˜พๅญ˜็š„ 90% + # Calculate allocation from configured memory_pct target_mem = int(total_mem * memory_pct / 100) - # ไฝ†ไธ่ƒฝ่ถ…่ฟ‡ๅฎž้™…ๅฏ็”จๆ˜พๅญ˜๏ผˆ็•™ๅ‡บ 5% ๅฎ‰ๅ…จไฝ™้‡๏ผ‰ + # Cap at actual free memory with 5% safety margin alloc_bytes = min(target_mem, int(free_mem * 0.95)) - # matmul(A, A.T) ้œ€่ฆ 2x ่พ“ๅ…ฅๆ˜พๅญ˜๏ผˆ่พ“ๅ…ฅ + ่พ“ๅ‡บ๏ผ‰ - # ๆ‰€ไปฅๅˆ†้… sqrt(alloc_bytes/4/2) ๅคงๅฐ็š„ๆ–น้˜ต + # matmul(A, A.T) needs 2x input memory (input + output) side = int((alloc_bytes / 4 / 2) ** 0.5) # float32 = 4 bytes actual_mem_mb = side * side * 4 / 1024 / 1024 @@ -153,13 +151,13 @@ class StressTest: free_mem_mb = free_mem / 1024 / 1024 self.console.print( - f" [dim]GPU {i}: ๆ€ปๆ˜พๅญ˜ {total_mem_mb:.0f}MB, ๅฏ็”จ {free_mem_mb:.0f}MB, " - f"ๅˆ†้… {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - " - f"็Ÿฉ้˜ต {side}x{side}[/dim]" + f" [dim]GPU {i}: total {total_mem_mb:.0f}MB, free {free_mem_mb:.0f}MB, " + f"alloc {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - " + f"matrix {side}x{side}[/dim]" ) tensors[i] = torch.randn(side, side, device=f"cuda:{i}", dtype=torch.float32) - self.console.print(f"\n[cyan]ๅผ€ๅง‹ๅŽ‹ๅŠ›ๆต‹่ฏ•๏ผŒๆŒ็ปญ {duration} ็ง’...[/cyan]") + self.console.print(f"\n[cyan]Starting stress test for {duration} seconds...[/cyan]") elapsed_check = 0 while time.time() - t0 < duration: @@ -169,10 +167,10 @@ class StressTest: torch.cuda.synchronize() time.sleep(0.1) - # ๆฏ 10 ็ง’ๆ˜พ็คบไธ€ๆฌก่ฟ›ๅบฆ + # Show progress every 10 seconds current_elapsed = time.time() - t0 if int(current_elapsed) != int(elapsed_check) and int(current_elapsed) % 10 == 0: - self.console.print(f" [dim]ๅทฒ่ฟ่กŒ {int(current_elapsed)}s / {duration}s[/dim]") + self.console.print(f" [dim]Running {int(current_elapsed)}s / {duration}s[/dim]") elapsed_check = current_elapsed for i in range(gpu_count): @@ -180,7 +178,7 @@ class StressTest: except RuntimeError as e: error_msg = str(e) - self.console.print(f"\n[red]ๅŽ‹ๅŠ›ๆต‹่ฏ•ๅ‡บ้”™: {error_msg}[/red]") + self.console.print(f"\n[red]Stress test error: {error_msg}[/red]") for i in range(gpu_count): if i not in gpu_status: gpu_status[i] = "FAIL" diff --git a/modules/training_sim.py b/modules/training_sim.py index 3830a76..dc7f5a3 100644 --- a/modules/training_sim.py +++ b/modules/training_sim.py @@ -77,7 +77,7 @@ class TrainingSim: optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) if dtype in (torch.float16, torch.bfloat16): - scaler = torch.cuda.amp.GradScaler(enabled=(dtype == torch.float16)) + scaler = torch.amp.GradScaler("cuda", enabled=(dtype == torch.float16)) step_times = [] mem_usage = [] @@ -96,8 +96,8 @@ class TrainingSim: input_ids = input_ids.to(model.device) attention_mask = attention_mask.to(model.device) - if dtype in (torch.float16, torch.bfloat16) and dtype != torch.bfloat16: - with torch.cuda.amp.autocast(dtype=dtype): + if dtype in (torch.float16, torch.bfloat16): + with torch.amp.autocast("cuda", dtype=dtype): outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids) loss = outputs.loss else: