test_gpu_scripts/modules/stress_test.py
qinyusen fefef8e03b refactor: remove hardcoding, fix AMP bug, unify English output
- Fix AMP autocast: bf16 now uses torch.amp.autocast (was skipped)
- Fix NCCL threshold: unknown GPU gets 10 GB/s floor instead of 0
- Fix PCIe health check: use specs-driven pcie_gen, not hardcoded Gen4
- Remove hardcoded GPU lists: dynamic banner, CLI choices, version
- Unknown GPU efficiency displays N/A instead of 0%
- Unify all console output to English (stress_test, gpu_tester)
- Use importlib.metadata for runtime version resolution
- Remove dir="/tmp" from tempfile (use system default)

🤖 Generated with [Qoder](https://qoder.com)
2026-05-07 21:32:35 +08:00

240 lines
9.0 KiB
Python

"""GPU stress test module — wraps gpu-burn for long-running stability tests."""
import glob
import os
import shutil
import subprocess
import time
from datetime import datetime
from rich.console import Console
from rich.table import Table
from rich.live import Live
from rich.text import Text
from modules.gpu_specs import resolve_tools_dir
class StressTest:
    """Long-running GPU stability test.

    Prefers the external ``gpu_burn`` binary; when it is missing, or when it
    fails well before the requested duration has elapsed, a PyTorch matmul
    loop is used instead.

    Settings are read from the ``"stress"`` section of the config dict:
    ``duration_sec`` (default 60), ``use_doubles`` (False),
    ``use_tensor_cores`` (True), ``memory_pct`` (90) and ``gpus`` ("all").
    """

    def __init__(self, config: dict):
        """Store the config and resolve the directory holding bundled tools."""
        self.config = config
        self.console = Console()
        self.stress_cfg = config.get("stress", {})
        self.tools_dir = resolve_tools_dir(config)

    def _find_gpu_burn(self) -> str:
        """Locate an executable ``gpu_burn`` binary.

        Search order: ``PATH``, then ``<tools_dir>/gpu-burn/gpu_burn``, then
        any ``gpu_burn`` below ``<tools_dir>/gpu-burn``.

        Returns:
            Path to the binary, or ``""`` when none is found.
        """
        on_path = shutil.which("gpu_burn")
        if on_path:
            return on_path

        root = os.path.join(self.tools_dir, "gpu-burn")
        candidates = [os.path.join(root, "gpu_burn")]
        candidates.extend(
            glob.glob(os.path.join(root, "**", "gpu_burn"), recursive=True)
        )
        for candidate in candidates:
            # isfile() matters: directories usually carry the execute bit, so
            # a folder named "gpu_burn" would otherwise be taken for the binary.
            if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
                return candidate
        return ""

    def run(self) -> dict:
        """Run the stress test and return its result dict.

        gpu-burn is tried first when available. If it fails in less than half
        of the requested duration, the PyTorch implementation is run instead
        and its result is returned.
        """
        cfg = self.stress_cfg
        duration_sec = cfg.get("duration_sec", 60)
        use_doubles = cfg.get("use_doubles", False)
        use_tensor_cores = cfg.get("use_tensor_cores", True)
        memory_pct = cfg.get("memory_pct", 90)
        target_gpus = cfg.get("gpus", "all")

        gpu_burn = self._find_gpu_burn()
        if not gpu_burn:
            self.console.print("[yellow]gpu_burn not found, using PyTorch stress test[/yellow]")
            return self._run_pytorch_stress(duration_sec, memory_pct)

        result = self._run_gpu_burn(
            gpu_burn, duration_sec, use_doubles, use_tensor_cores, target_gpus
        )
        # Only an *early* failure (e.g. OOM at start-up) triggers the fallback.
        # A failure after most of the run, or a timeout, is a real verdict.
        exited_early = result.get("elapsed_sec", 0) < duration_sec * 0.5
        if not result.get("passed") and exited_early:
            self.console.print("\n[yellow]gpu-burn exited early (possible OOM), switching to PyTorch stress test[/yellow]")
            self.console.print("[dim]PyTorch mode dynamically adapts to available memory[/dim]\n")
            return self._run_pytorch_stress(duration_sec, memory_pct)
        return result

    @staticmethod
    def _reports_failure(line: str) -> bool:
        """Return True when a per-GPU result line carries a failure verdict."""
        upper = line.upper()
        return "FAIL" in upper or "FAULTY" in upper

    def _run_gpu_burn(self, gpu_burn: str, duration: int,
                      doubles: bool, tensor_cores: bool, target_gpus: str) -> dict:
        """Execute gpu-burn and summarize its outcome.

        Args:
            gpu_burn: Path to the gpu-burn executable.
            duration: Requested burn time in seconds.
            doubles: Pass ``-d`` when true.
            tensor_cores: Pass ``-tc`` when true.
            target_gpus: ``"all"`` or a value forwarded to ``-i``.

        Returns:
            A dict with ``source``, ``passed``, ``duration_sec``,
            ``elapsed_sec`` and ``timestamp``; successful launches also carry
            ``gpu_results`` and ``raw_output_tail``, failed launches ``error``.
        """
        self.console.print(f"[cyan]GPU Stress Test via gpu-burn ({duration}s)[/cyan]")
        cmd = [gpu_burn]
        if doubles:
            cmd.append("-d")
        if tensor_cores:
            cmd.append("-tc")
        if target_gpus != "all":
            cmd.extend(["-i", str(target_gpus)])
        cmd.append(str(duration))

        t0 = time.time()
        try:
            r = subprocess.run(cmd, capture_output=True, text=True, timeout=duration + 120)
        except subprocess.TimeoutExpired:
            # elapsed_sec is recorded so run() does not mistake a timeout for
            # an early exit and start a second full-length test.
            return {
                "source": "gpu-burn",
                "passed": False,
                "duration_sec": duration,
                "elapsed_sec": round(time.time() - t0, 1),
                "error": "timeout",
                "timestamp": datetime.now().isoformat(),
            }
        except Exception as e:
            # Typically the binary could not be launched at all.
            return {
                "source": "gpu-burn",
                "passed": False,
                "duration_sec": duration,
                "elapsed_sec": round(time.time() - t0, 1),
                "error": str(e),
                "timestamp": datetime.now().isoformat(),
            }

        elapsed = round(time.time() - t0, 1)
        output = r.stdout + r.stderr

        gpu_results = []
        for raw_line in output.split("\n"):
            line = raw_line.strip()
            if "GPU" not in line:
                continue
            upper = line.upper()
            # Besides PASS/FAIL wording, also accept "GPU n: OK" and
            # "GPU n: FAULTY" summary lines.
            if ("PASS" in upper or "FAIL" in upper
                    or upper.endswith((": OK", ": FAULTY"))):
                gpu_results.append(line)

        # A clean exit code is not enough if any GPU was reported as failing.
        any_gpu_failed = any(self._reports_failure(line) for line in gpu_results)
        return {
            "source": "gpu-burn",
            "passed": r.returncode == 0 and not any_gpu_failed,
            "duration_sec": duration,
            "elapsed_sec": elapsed,
            "gpu_results": gpu_results,
            "raw_output_tail": output[-500:] if output else "",
            "timestamp": datetime.now().isoformat(),
        }

    def _run_pytorch_stress(self, duration: int, memory_pct: int = 90) -> dict:
        """Stress every visible CUDA device with repeated float32 matmuls.

        Args:
            duration: How long to keep the devices busy, in seconds.
            memory_pct: Share of each device's total memory to target; the
                allocation is capped at 95% of the memory currently free.

        Returns:
            ``{"error": "pytorch_not_available"}`` when torch or CUDA is
            missing; otherwise a dict with ``source``, ``passed``,
            ``duration_sec``, ``elapsed_sec``, ``gpu_status`` and
            ``timestamp`` (plus ``error`` when a ``RuntimeError`` occurred).
        """
        try:
            import torch
            if not torch.cuda.is_available():
                return {"error": "pytorch_not_available"}
        except ImportError:
            return {"error": "pytorch_not_available"}

        gpu_count = torch.cuda.device_count()
        self.console.print(f"[cyan]PyTorch Stress Test ({duration}s, {gpu_count} GPUs, target {memory_pct}% memory)[/cyan]")

        gpu_status = {}
        tensors = {}
        t0 = time.time()
        try:
            for i in range(gpu_count):
                with torch.cuda.device(i):
                    # Free memory reflects what other processes already hold.
                    free_mem, total_mem = torch.cuda.mem_get_info(i)
                    target_mem = int(total_mem * memory_pct / 100)
                    # Never ask for more than is free; keep a 5% safety margin.
                    alloc_bytes = min(target_mem, int(free_mem * 0.95))
                    # matmul(A, A.T) holds input and output at once, so the
                    # budget is halved; float32 takes 4 bytes per element.
                    side = int((alloc_bytes / 4 / 2) ** 0.5)
                    actual_mem_mb = side * side * 4 / 1024 / 1024
                    total_mem_mb = total_mem / 1024 / 1024
                    free_mem_mb = free_mem / 1024 / 1024
                    self.console.print(
                        f" [dim]GPU {i}: total {total_mem_mb:.0f}MB, free {free_mem_mb:.0f}MB, "
                        f"alloc {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - "
                        f"matrix {side}x{side}[/dim]"
                    )
                    tensors[i] = torch.randn(side, side, device=f"cuda:{i}", dtype=torch.float32)

            self.console.print(f"\n[cyan]Starting stress test for {duration} seconds...[/cyan]")
            last_reported = 0
            while time.time() - t0 < duration:
                # Queue work on every device first so they run concurrently.
                for i in range(gpu_count):
                    with torch.cuda.device(i):
                        tensors[i] = torch.matmul(tensors[i], tensors[i].T)
                # A bare synchronize() waits only on the current device; wait
                # on each one so errors surface and elapsed time is honest.
                for i in range(gpu_count):
                    torch.cuda.synchronize(i)
                time.sleep(0.1)

                # Report progress once per 10-second boundary.
                whole_seconds = int(time.time() - t0)
                if whole_seconds != last_reported and whole_seconds % 10 == 0:
                    self.console.print(f" [dim]Running {whole_seconds}s / {duration}s[/dim]")
                    last_reported = whole_seconds

            for i in range(gpu_count):
                gpu_status[i] = "PASS"
        except RuntimeError as e:
            error_msg = str(e)
            self.console.print(f"\n[red]Stress test error: {error_msg}[/red]")
            for i in range(gpu_count):
                gpu_status.setdefault(i, "FAIL")
            return {
                "source": "pytorch",
                "passed": False,
                "duration_sec": duration,
                "elapsed_sec": round(time.time() - t0, 1),
                "error": error_msg,
                "gpu_status": gpu_status,
                "timestamp": datetime.now().isoformat(),
            }
        finally:
            # Drop the tensor references, then release cached blocks.
            tensors.clear()
            torch.cuda.empty_cache()

        return {
            "source": "pytorch",
            "passed": True,
            "duration_sec": duration,
            "elapsed_sec": round(time.time() - t0, 1),
            "gpu_status": gpu_status,
            "timestamp": datetime.now().isoformat(),
        }

    @staticmethod
    def print_results(results: dict, console: "Console | None" = None):
        """Pretty-print a result dict produced by :meth:`run`.

        A result that carries only an ``error`` (no ``source``) means no test
        was run and is printed as a single error line. Every other result
        gets a verdict, timings, per-GPU details and, if present, the error.

        Args:
            results: Result dict to render.
            console: Object with a rich-style ``print`` method; a new
                ``Console`` is created when omitted.
        """
        c = console if console is not None else Console()

        if "error" in results and "source" not in results:
            c.print(f"[bold red]Error: {results['error']}[/bold red]")
            return

        passed = results.get("passed", False)
        source = results.get("source", "unknown")
        duration = results.get("duration_sec", "?")
        elapsed = results.get("elapsed_sec", "?")
        if passed:
            verdict = "[bold green]✓ Stress Test PASSED[/bold green]"
        else:
            verdict = "[bold red]✗ Stress Test FAILED[/bold red]"
        c.print(f"\n{verdict} [dim](via {source})[/dim]")
        c.print(f" Target duration: {duration}s | Actual: {elapsed}s")

        gpu_results = results.get("gpu_results", [])
        if gpu_results:
            c.print("\n Per-GPU results:")
            for line in gpu_results:
                color = "red" if StressTest._reports_failure(line) else "green"
                c.print(f" [{color}]{line}[/{color}]")

        gpu_status = results.get("gpu_status", {})
        if gpu_status:
            c.print("\n Per-GPU status:")
            for gid, status in sorted(gpu_status.items()):
                color = "green" if status == "PASS" else "red"
                c.print(f" GPU {gid}: [{color}]{status}[/{color}]")

        if results.get("error"):
            c.print(f" [red]Error: {results['error']}[/red]")