diff --git a/.gitignore b/.gitignore index 2ddbbbb..30a9cdb 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ reports/ .env .venv/ venv/ +.qoder/* diff --git a/gpu_tester.py b/gpu_tester.py index 3582817..b2b6851 100644 --- a/gpu_tester.py +++ b/gpu_tester.py @@ -310,8 +310,10 @@ def _run_full_suite(config: dict, console: Console) -> dict: # Summary console.print("\n" + "=" * 60) - passed = sum(1 for v in all_results.values() if not isinstance(v, dict) or "error" not in v) - total = len(tests) + # 只统计测试结果,排除 timestamp 等元数据 + test_results = {k: v for k, v in all_results.items() if k != "timestamp"} + passed = sum(1 for v in test_results.values() if not isinstance(v, dict) or "error" not in v) + total = len(test_results) color = "green" if passed == total else ("yellow" if passed > 0 else "red") console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]") return all_results diff --git a/install_deps.sh b/install_deps.sh index a9659a3..ab5e499 100755 --- a/install_deps.sh +++ b/install_deps.sh @@ -25,6 +25,9 @@ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" JOBS="${MAKE_JOBS:-$(nproc)}" VERBOSE="${VERBOSE:-0}" +# uv 配置:跨文件系统时使用 copy 模式,避免硬链接警告 +export UV_LINK_MODE="${UV_LINK_MODE:-copy}" + # 参数标志 FLAG_INSTALL_SYS_DEPS=0 FLAG_SKIP_PYTORCH=0 @@ -152,39 +155,59 @@ detect_gpu_and_driver() { } detect_cuda_version() { - # 方式 1: nvcc(最可靠,代表 toolkit 确实安装了) + # 优先级 1: nvcc 在 PATH 中(最可靠,代表 CUDA Toolkit 已正确配置) if command -v nvcc &>/dev/null; then CUDA_VERSION=$(nvcc --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+') if [[ -n "$CUDA_VERSION" ]]; then - ok "CUDA: $CUDA_VERSION (via nvcc)" + ok "CUDA: $CUDA_VERSION (via nvcc in PATH)" _map_cuda_tag return 0 fi fi - # 方式 2: nvidia-smi(驱动支持的最大 CUDA 版本,非 toolkit) - local smi_cuda - smi_cuda=$(nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+') - if [[ -n "$smi_cuda" ]]; then - CUDA_VERSION="$smi_cuda" - warn "CUDA: $CUDA_VERSION (via nvidia-smi — 仅代表驱动能力,非已安装 toolkit)" - warn " → 若编译失败,请安装 CUDA Toolkit: apt install cuda-toolkit-${CUDA_VERSION/./-}" - _map_cuda_tag - return 0 - fi - - # 方式 3: /usr/local/cuda - if [[ -f /usr/local/cuda/version.txt ]]; then - CUDA_VERSION=$(grep -oP '[0-9]+\.[0-9]+' /usr/local/cuda/version.txt | head -1) + # 优先级 2: CUDA_HOME 环境变量已设置且有效 + if [[ -n "${CUDA_HOME:-}" ]] && [[ -x "${CUDA_HOME}/bin/nvcc" ]]; then + CUDA_VERSION=$("${CUDA_HOME}/bin/nvcc" --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+') if [[ -n "$CUDA_VERSION" ]]; then - ok "CUDA: $CUDA_VERSION (via /usr/local/cuda/version.txt)" + ok "CUDA: $CUDA_VERSION (via CUDA_HOME=${CUDA_HOME})" + # 将 CUDA_HOME/bin 加入 PATH,供后续编译使用 + export PATH="${CUDA_HOME}/bin:$PATH" _map_cuda_tag return 0 fi fi - fail "无法检测 CUDA 版本" - echo " → 请安装 CUDA Toolkit: https://developer.nvidia.com/cuda-downloads" + # 优先级 3: 检查标准路径 /usr/local/cuda(最常见的安装位置) + if [[ -x "/usr/local/cuda/bin/nvcc" ]]; then + CUDA_VERSION=$("/usr/local/cuda/bin/nvcc" --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+') + if [[ -n "$CUDA_VERSION" ]]; then + export CUDA_HOME="/usr/local/cuda" + export PATH="$CUDA_HOME/bin:$PATH" + ok "CUDA: $CUDA_VERSION (via /usr/local/cuda)" + _map_cuda_tag + return 0 + fi + fi + + # 所有方式都失败,明确报错退出 + fail "CUDA Toolkit 未找到!" + echo "" + echo " 当前环境状态:" + echo " • nvcc 不在 PATH 中" + if [[ -z "${CUDA_HOME:-}" ]]; then + echo " • CUDA_HOME 环境变量未设置" + else + echo " • CUDA_HOME=${CUDA_HOME} (但 nvcc 不存在或不可执行)" + fi + echo " • /usr/local/cuda/bin/nvcc 不存在或不可执行" + echo "" + echo " 解决方案(选择其一):" + echo " 1. 安装 CUDA Toolkit: https://developer.nvidia.com/cuda-downloads" + echo " 2. 如果已安装,请设置环境变量:" + echo " export CUDA_HOME=/path/to/cuda" + echo " export PATH=\$CUDA_HOME/bin:\$PATH" + echo " 3. 创建符号链接: sudo ln -s /path/to/cuda /usr/local/cuda" + echo "" return 1 } @@ -194,6 +217,8 @@ _map_cuda_tag() { minor="${CUDA_VERSION#*.}" minor="${minor%%.*}" + # PyTorch 官方提供的 CUDA wheel 版本: cu118, cu121, cu124, cu128 + # 选择规则: 取不超过驱动支持 CUDA 版本的最高可用 wheel if [[ "$major" -eq 11 ]]; then CUDA_TAG="cu118" elif [[ "$major" -eq 12 ]]; then @@ -204,11 +229,18 @@ _map_cuda_tag() { else CUDA_TAG="cu128" fi - else + elif [[ "$major" -ge 13 ]]; then + # CUDA 13+ 驱动,仍用 cu128(PyTorch 暂无更高版本 wheel) CUDA_TAG="cu128" - warn "未知 CUDA $CUDA_VERSION,默认使用 cu128 索引" + else + CUDA_TAG="cu124" + warn "未知 CUDA $CUDA_VERSION,默认使用 cu124 索引" fi - log "PyTorch wheel 索引: $CUDA_TAG" + + log "版本选择决策:" + log " 驱动支持最高 CUDA: ${CUDA_VERSION}" + log " PyTorch 可用 wheel: cu118 / cu121 / cu124 / cu128" + log " → 选择: ${CUDA_TAG}(不超过 CUDA ${CUDA_VERSION} 的最高兼容版本)" } check_python() { @@ -286,11 +318,13 @@ check_nccl_dev() { if ldconfig -p 2>/dev/null | grep -q libnccl; then HAS_NCCL_DEV=1 ok "libnccl: 已找到 (via ldconfig)" + _check_nccl_compatibility return 0 fi if [[ -f /usr/include/nccl.h ]] || dpkg -l libnccl-dev &>/dev/null 2>&1; then HAS_NCCL_DEV=1 ok "libnccl-dev: 已安装" + _check_nccl_compatibility return 0 fi HAS_NCCL_DEV=0 @@ -299,6 +333,55 @@ check_nccl_dev() { return 0 } +# 检测系统 NCCL 版本是否与当前驱动/CUDA 兼容 +NCCL_COMPATIBLE=1 +_check_nccl_compatibility() { + NCCL_COMPATIBLE=1 + + # 获取 NCCL 包的 CUDA 依赖版本 + local nccl_pkg_info="" + nccl_pkg_info=$(dpkg -l libnccl2 2>/dev/null | grep -oP '\+cuda[0-9.]+' | head -1) + if [[ -z "$nccl_pkg_info" ]]; then + return 0 # 无法判断,假设兼容 + fi + + local nccl_cuda_ver="${nccl_pkg_info#+cuda}" + local nccl_cuda_major="${nccl_cuda_ver%%.*}" + local nccl_cuda_minor="${nccl_cuda_ver#*.}" + nccl_cuda_minor="${nccl_cuda_minor%%.*}" + + # 获取驱动支持的最大 CUDA 版本 + local driver_cuda="" + driver_cuda=$(nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+') + if [[ -z "$driver_cuda" ]]; then + return 0 + fi + + local drv_cuda_major="${driver_cuda%%.*}" + local drv_cuda_minor="${driver_cuda#*.}" + drv_cuda_minor="${drv_cuda_minor%%.*}" + + # NCCL 需要的 CUDA 版本 > 驱动支持的 CUDA 版本 → 不兼容 + if [[ "$nccl_cuda_major" -gt "$drv_cuda_major" ]] || \ + { [[ "$nccl_cuda_major" -eq "$drv_cuda_major" ]] && [[ "$nccl_cuda_minor" -gt "$drv_cuda_minor" ]]; }; then + NCCL_COMPATIBLE=0 + warn "系统 NCCL 版本不兼容!" + echo -e " ${YELLOW}NCCL 包要求: CUDA ${nccl_cuda_ver}${NC}" + echo -e " ${YELLOW}驱动支持最高: CUDA ${driver_cuda}${NC}" + echo "" + echo " 这会导致 nccl-tests 运行时报错:" + echo " 'CUDA driver version is insufficient for CUDA runtime version'" + echo "" + echo " 解决方案(任选其一):" + echo " A) 降级 NCCL: sudo apt install libnccl2=<版本>+cuda${driver_cuda}" + echo " B) 升级驱动至支持 CUDA ${nccl_cuda_ver} 的版本" + echo " C) 使用 PyTorch 内置 NCCL(测试套件会自动 fallback)" + echo "" + else + ok "NCCL 兼容性: NCCL(cuda${nccl_cuda_ver}) <= 驱动(cuda${driver_cuda})" + fi +} + install_system_deps() { log "安装系统依赖包..." if command -v apt-get &>/dev/null; then @@ -432,7 +515,7 @@ setup_python_venv() { # 安装项目依赖 log "安装 Python 依赖(rich、pyyaml、numpy)..." uv pip install --python "$venv_dir/bin/python" \ - -e "$PROJECT_DIR" 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -1; } || true + -e "$PROJECT_DIR" 2>&1 || true ok "项目依赖安装完成" # 安装 PyTorch @@ -450,7 +533,7 @@ setup_python_venv() { log "(下载较大,请耐心等待...)" uv pip install --python "$venv_dir/bin/python" \ "torch>=2.1.0" --index-url "$index_url" \ - 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; } || { + 2>&1 || { warn "PyTorch 安装失败,可稍后手动安装:" echo " source $INSTALL_DIR/env.sh" echo " uv pip install torch --index-url $index_url" @@ -492,7 +575,12 @@ build_nvbandwidth() { cd "$src" mkdir -p build && cd build - cmake .. -DCMAKE_BUILD_TYPE=Release 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; } + + # 使用 detect_cuda_version() 中设置的 CUDA_HOME 和 PATH + # 如果 CUDA_HOME 未设置,则使用默认路径 + local cuda_home="${CUDA_HOME:-/usr/local/cuda}" + + cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_COMPILER="$cuda_home/bin/nvcc" 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; } make -j"$JOBS" 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; } if [[ -x ./nvbandwidth ]]; then @@ -512,6 +600,9 @@ build_nccl_tests() { if [[ -x "$src/build/all_reduce_perf" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then ok "nccl-tests: 已编译 ($src/build/)" + if [[ $NCCL_COMPATIBLE -eq 0 ]]; then + warn "nccl-tests: 已编译但系统 NCCL 与驱动不兼容,运行时将 fallback 到 torchrun" + fi return 0 fi @@ -520,6 +611,15 @@ build_nccl_tests() { return 0 fi + # NCCL 不兼容时仍然编译(编译不报错),但给出明确警告 + if [[ $NCCL_COMPATIBLE -eq 0 ]]; then + warn "nccl-tests: 系统 NCCL 与驱动不兼容" + warn " 编译会成功但运行时会报错 'CUDA driver version is insufficient'" + warn " 测试套件会自动 fallback 到 torchrun 方式测试 NCCL" + log " 如需原生 nccl-tests 性能数据,请先解决 NCCL 版本问题(见上方提示)" + echo "" + fi + local cuda_home="${CUDA_HOME:-/usr/local/cuda}" if [[ ! -d "$cuda_home/include" ]]; then warn "nccl-tests: 跳过(CUDA_HOME=$cuda_home 无效)" @@ -688,7 +788,11 @@ print_summary() { local path="${tool_info%%:*}" local name="${tool_info##*:}" if [[ -x "$path" ]]; then - echo -e " ${GREEN}✓${NC} $name" + if [[ "$name" == "nccl-tests" ]] && [[ $NCCL_COMPATIBLE -eq 0 ]]; then + echo -e " ${YELLOW}⚠${NC} $name (已编译,但系统 NCCL 与驱动不兼容)" + else + echo -e " ${GREEN}✓${NC} $name" + fi else echo -e " ${YELLOW}○${NC} $name (未编译)" fi diff --git a/modules/benchmark.py b/modules/benchmark.py index e657733..a87d018 100644 --- a/modules/benchmark.py +++ b/modules/benchmark.py @@ -78,30 +78,33 @@ class Benchmark: self.console.print(f"[cyan]Memory Benchmark via nvbandwidth ({nvbw_path})[/cyan]") results_by_test = {} - per_gpu_d2d = [] + # Testcases to run — keys used internally, try both old and new names testcases = [ - "host_to_device_memcpy_read_ce", - "device_to_host_memcpy_write_ce", - "device_to_device_memcpy_write_ce", - "device_to_device_memcpy_read_ce", - "device_to_device_bidirectional_sm", + ("h2d", ["host_to_device_memcpy_ce", "host_to_device_memcpy_read_ce"]), + ("d2h", ["device_to_host_memcpy_ce", "device_to_host_memcpy_write_ce"]), + ("d2d_write", ["device_to_device_memcpy_write_ce"]), + ("d2d_read", ["device_to_device_memcpy_read_ce"]), + ("d2d_bidir", ["device_to_device_bidirectional_memcpy_write_sm", + "device_to_device_bidirectional_sm"]), ] + # Discover available testcase names + available_names: list[str] = [] try: list_r = subprocess.run( - [nvbw_path, "-l", "-j"], - capture_output=True, text=True, timeout=15, + [nvbw_path, "-l"], capture_output=True, text=True, timeout=15, ) - available = [] if list_r.returncode == 0: - try: - avail_list = json.loads(list_r.stdout) - available = [t.get("name", "") for t in avail_list if isinstance(t, dict)] - except json.JSONDecodeError: - pass + for line in list_r.stdout.splitlines(): + line = line.strip() + if line and ", " in line and line[0].isdigit(): + parts = line.split(", ", 1) + name = parts[1].rstrip(":").strip() + if name: + available_names.append(name) except (subprocess.TimeoutExpired, FileNotFoundError): - available = [] + pass with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), @@ -110,53 +113,51 @@ class Benchmark: ) as progress: task = progress.add_task("nvbandwidth tests...", total=len(testcases)) - for tc in testcases: - if available and tc not in available: + for key, name_candidates in testcases: + # Pick the first available test name + tc = None + for candidate in name_candidates: + if not available_names or candidate in available_names: + tc = candidate + break + if tc is None: progress.advance(task) continue try: - cmd = [ - nvbw_path, - f"-b{buffer_mb}", - f"-i{samples}", - "-j", - f"-t{tc}", - ] + cmd = [nvbw_path, "-t", tc, "-b", str(buffer_mb), + "-i", str(samples), "-j"] r = subprocess.run(cmd, capture_output=True, text=True, timeout=120) if r.returncode == 0 and r.stdout.strip(): - try: - data = json.loads(r.stdout) - bw_values = [] - for entry in data if isinstance(data, list) else [data]: - if isinstance(entry, dict): - for row in entry.get("results", []): - val = row.get("value", 0) - if isinstance(val, (int, float)): - bw_values.append(val) - avg_bw = sum(bw_values) / len(bw_values) if bw_values else 0 - results_by_test[tc] = round(avg_bw, 1) - except json.JSONDecodeError: - results_by_test[tc] = 0 + avg_bw = self._parse_nvbandwidth_json(r.stdout) + results_by_test[key] = round(avg_bw, 1) else: - results_by_test[tc] = 0 + results_by_test[key] = 0 except (subprocess.TimeoutExpired, FileNotFoundError): - results_by_test[tc] = 0 + results_by_test[key] = 0 progress.advance(task) d2d_bw = max( - results_by_test.get("device_to_device_memcpy_write_ce", 0), - results_by_test.get("device_to_device_memcpy_read_ce", 0), - results_by_test.get("device_to_device_bidirectional_sm", 0), - ) - h2d_bw = results_by_test.get("host_to_device_memcpy_read_ce", 0) - d2h_bw = results_by_test.get("device_to_host_memcpy_write_ce", 0) - peak_bw = self.specs["memory_bandwidth_gbps"] - efficiency = ( - (d2d_bw / peak_bw) * 100 if (d2d_bw and peak_bw) else 0 + results_by_test.get("d2d_write", 0), + results_by_test.get("d2d_read", 0), + results_by_test.get("d2d_bidir", 0), ) + h2d_bw = results_by_test.get("h2d", 0) + d2h_bw = results_by_test.get("d2h", 0) + + # D2D goes through NVLink — compare to NVLink per-direction bandwidth + # (nvlink_bandwidth_gbps is bidirectional, so per-direction = /2) + nvlink_bw = self.specs.get("nvlink_bandwidth_gbps", 0) + d2d_peak = nvlink_bw / 2 if nvlink_bw else 0 + d2d_efficiency = (d2d_bw / d2d_peak) * 100 if (d2d_bw and d2d_peak) else 0 + + # H2D/D2H goes through PCIe — estimate peak from PCIe gen + pcie_gen = self.specs.get("pcie_gen", 4) + pcie_peak = {3: 16, 4: 32, 5: 64, 6: 128}.get(pcie_gen, 32) # GB/s x16 + h2d_efficiency = (h2d_bw / pcie_peak) * 100 if (h2d_bw and pcie_peak) else 0 + d2h_efficiency = (d2h_bw / pcie_peak) * 100 if (d2h_bw and pcie_peak) else 0 return { "memory": { @@ -164,13 +165,55 @@ class Benchmark: "h2d_bandwidth_gbps": round(h2d_bw, 1), "d2h_bandwidth_gbps": round(d2h_bw, 1), "d2d_bandwidth_gbps": round(d2d_bw, 1), + "h2d_peak_gbps": pcie_peak, + "d2h_peak_gbps": pcie_peak, + "d2d_peak_gbps": round(d2d_peak, 1), + "h2d_efficiency_pct": round(h2d_efficiency, 1), + "d2h_efficiency_pct": round(d2h_efficiency, 1), + "d2d_efficiency_pct": round(d2d_efficiency, 1), "peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"], - "efficiency_pct": round(efficiency, 1), + "efficiency_pct": round(d2d_efficiency, 1), "results_by_test": results_by_test, - "per_gpu": per_gpu_d2d, + "per_gpu": [], } } + @staticmethod + def _parse_nvbandwidth_json(raw: str) -> float: + """Parse nvbandwidth JSON output (supports v0.5+ and v0.8+ formats).""" + try: + data = json.loads(raw) + except json.JSONDecodeError: + return 0.0 + + # v0.8+ format: {"nvbandwidth": {"testcases": [{"bandwidth_matrix": [...], "sum": N}]}} + if isinstance(data, dict) and "nvbandwidth" in data: + testcases = data["nvbandwidth"].get("testcases", []) + for tc in testcases: + matrix = tc.get("bandwidth_matrix", []) + values = [] + for row in matrix: + for cell in row: + try: + v = float(cell) + values.append(v) + except (ValueError, TypeError): + continue + if values: + return sum(values) / len(values) + return 0.0 + + # v0.5 format: list of dicts with "results" array + entries = data if isinstance(data, list) else [data] + bw_values = [] + for entry in entries: + if isinstance(entry, dict): + for row in entry.get("results", []): + val = row.get("value", 0) + if isinstance(val, (int, float)): + bw_values.append(val) + return sum(bw_values) / len(bw_values) if bw_values else 0.0 + def _run_memory_pytorch(self) -> dict: mem_cfg = self.bench_cfg.get("memory", {}) test_sizes_mb = [1, 4, 16, 64, 256, 1024, 4096] @@ -377,15 +420,16 @@ class Benchmark: table.add_column("Peak", justify="right") table.add_column("Efficiency", justify="right") - for label, achieved, peak in [ - ("H2D (PCIe)", mem["h2d_bandwidth_gbps"], None), - ("D2H (PCIe)", mem["d2h_bandwidth_gbps"], None), - ("D2D (HBM3e)", mem["d2d_bandwidth_gbps"], mem["peak_bandwidth_gbps"]), + for label, achieved, peak_key, eff_key in [ + ("H2D (PCIe)", mem["h2d_bandwidth_gbps"], "h2d_peak_gbps", "h2d_efficiency_pct"), + ("D2H (PCIe)", mem["d2h_bandwidth_gbps"], "d2h_peak_gbps", "d2h_efficiency_pct"), + ("D2D (NVLink)", mem["d2d_bandwidth_gbps"], "d2d_peak_gbps", "d2d_efficiency_pct"), ]: val_str = f"{achieved:.1f} GB/s" if isinstance(achieved, (int, float)) else "N/A" + peak = mem.get(peak_key, 0) peak_str = f"{peak:.0f} GB/s" if peak else "N/A" - if peak and isinstance(achieved, (int, float)) and achieved > 0: - eff = (achieved / peak) * 100 + eff = mem.get(eff_key, 0) + if eff: ec = "green" if eff >= 80 else ("yellow" if eff >= 50 else "red") eff_str = f"[{ec}]{eff:.1f}%[/{ec}]" else: diff --git a/modules/gpu_info.py b/modules/gpu_info.py index 6369cfa..6170905 100644 --- a/modules/gpu_info.py +++ b/modules/gpu_info.py @@ -67,7 +67,7 @@ class GPUInfo: ecc_double = self._run_smi("ecc.errors.double_bit.total.volatile").split("\n") if self._run_smi("ecc.errors.double_bit.total.volatile") else [] driver_info = self._run_smi("driver_version", "csv,noheader") - cuda_info = self._run_smi("cuda_version", "csv,noheader") + cuda_info = self._get_cuda_version() def safe_get(lst, idx, default="N/A"): try: @@ -116,7 +116,7 @@ class GPUInfo: return { "driver_version": safe_get(driver_info.split("\n"), 0) if driver_info else "N/A", - "cuda_version": safe_get(cuda_info.split("\n"), 0) if cuda_info else "N/A", + "cuda_version": cuda_info or "N/A", "gpu_count": gpu_count, "gpus": gpus, "topology": topology, @@ -125,6 +125,21 @@ class GPUInfo: "gpu_label": self.gpu_label, } + def _get_cuda_version(self) -> Optional[str]: + """Parse CUDA version from nvidia-smi header output (query-gpu field removed in newer drivers).""" + try: + r = subprocess.run( + ["nvidia-smi"], capture_output=True, text=True, timeout=15, + ) + if r.returncode == 0: + import re + m = re.search(r"CUDA Version:\s+([\d.]+)", r.stdout) + if m: + return m.group(1) + except (subprocess.TimeoutExpired, FileNotFoundError): + pass + return None + def _get_topology(self) -> str: try: r = subprocess.run( diff --git a/modules/health_check.py b/modules/health_check.py index 3b18055..24c3294 100644 --- a/modules/health_check.py +++ b/modules/health_check.py @@ -125,10 +125,29 @@ class HealthCheck: checks["clock_speed"] = {"sm": sm, "mem": mm, "status": "PASS" if sm > 0 and mm > 0 else "WARN"} throttle_val = throttling_raw[i] if i < len(throttling_raw) else "" - throttle_active = throttle_val not in ("", "None", "Active", "N/A") - if throttle_active: + # Parse bitmask: 0x0 = none, 0x1 = gpu_idle (benign), others = real throttling + throttle_reasons = [] + try: + bitmask = int(throttle_val, 16) if throttle_val.startswith("0x") else 0 + except (ValueError, TypeError): + bitmask = 0 + # Bit 0 = gpu_idle — not a real problem, ignore it + real_throttle = bitmask & ~0x1 + if real_throttle: + if real_throttle & 0x4: + throttle_reasons.append("sw_power_cap") + if real_throttle & 0x8: + throttle_reasons.append("hw_slowdown") + if real_throttle & 0x10: + throttle_reasons.append("hw_thermal_slowdown") + if real_throttle & 0x20: + throttle_reasons.append("hw_power_brake") + if real_throttle & 0x40: + throttle_reasons.append("sw_thermal_slowdown") + if not throttle_reasons: + throttle_reasons.append(f"unknown(0x{real_throttle:x})") overall_pass = False - checks["throttling"] = {"status": "FAIL" if throttle_active else "PASS", "reasons": [throttle_val] if throttle_active else []} + checks["throttling"] = {"status": "FAIL" if real_throttle else "PASS", "reasons": throttle_reasons} pers_val = persistence[i] if i < len(persistence) else "" pers_enabled = pers_val == "Enabled" diff --git a/modules/nccl_test.py b/modules/nccl_test.py index 1443c04..a513b80 100644 --- a/modules/nccl_test.py +++ b/modules/nccl_test.py @@ -65,11 +65,6 @@ class NCCLTest: self.console.print(f"[yellow]NCCL test requires at least 2 GPUs (found {gpu_count})[/yellow]") return {"error": "need_at_least_2_gpus", "gpu_count": gpu_count} - mpirun = self._find_mpirun() - if not mpirun: - self.console.print("[yellow]mpirun/mpiexec not found - falling back to torchrun[/yellow]") - return self._run_torchrun_fallback(gpu_count) - tests = [] if self.nccl_cfg.get("test_allreduce", True): tests.append(("all_reduce_perf", "AllReduce")) @@ -84,9 +79,13 @@ class NCCLTest: if self.nccl_cfg.get("test_sendrecv", False): tests.append(("sendrecv_perf", "SendRecv")) - results = {} default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4 - min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw)) + min_bw = self.nccl_cfg.get("min_bandwidth_gbps") or round(default_min_bw) + + # Strategy: try nccl-tests binary directly (single-node, -g N), + # then mpirun, then torchrun fallback + results = {} + any_binary_worked = False with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), @@ -96,11 +95,28 @@ class NCCLTest: for binary, label in tests: progress.update(task, description=f"NCCL {label}...") - results[label.lower()] = self._run_one_nccl_test( - binary, label, gpu_count, mpirun, min_bw + result = self._run_one_nccl_test_direct( + binary, label, gpu_count, min_bw ) + if result.get("status") not in ("SKIP", None) and "error" not in result: + any_binary_worked = True + results[label.lower()] = result + else: + # Try mpirun fallback + mpirun = self._find_mpirun() + if mpirun: + result = self._run_one_nccl_test_mpirun( + binary, label, gpu_count, mpirun, min_bw + ) + if result.get("status") not in ("SKIP", None) and "error" not in result: + any_binary_worked = True + results[label.lower()] = result progress.advance(task) + if not any_binary_worked: + self.console.print("[yellow]nccl-tests binaries failed, falling back to torchrun[/yellow]") + return self._run_torchrun_fallback(gpu_count) + all_passed = all( r.get("status") == "PASS" for r in results.values() @@ -117,18 +133,57 @@ class NCCLTest: "detected_gpu_type": self.gpu_type, } - def _run_one_nccl_test(self, binary_name: str, label: str, - gpu_count: int, mpirun: str, min_bw: float) -> dict: + def _run_one_nccl_test_direct(self, binary_name: str, label: str, + gpu_count: int, min_bw: float) -> dict: + """Run nccl-tests binary directly with -g N (no mpirun needed for single-node).""" binary = self._find_nccl_test(binary_name) if not binary: return {"status": "SKIP", "error": f"{binary_name} not found"} - sizes = "8:64:256:1024:4096:16384:65536:262144:1048576:4194304:16777216:67108864" + cmd = [ + binary, + "-b", "8", + "-e", "256M", + "-f", "2", + "-g", str(gpu_count), + "-w", "5", + "-n", "20", + ] + + try: + env = os.environ.copy() + env["NCCL_DEBUG"] = "WARN" + r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env) + + combined = r.stdout + r.stderr + # Check for NCCL/CUDA compatibility errors + if "CUDA driver version is insufficient" in combined or \ + "Test NCCL failure" in combined: + error_msg = "NCCL/CUDA driver version mismatch" \ + if "CUDA driver version" in combined \ + else "NCCL test failure (library incompatibility)" + return {"status": "FAIL", "error": error_msg} + + if r.returncode != 0: + return {"status": "FAIL", "error": r.stderr[:300]} + + return self._parse_nccl_output(r.stdout, min_bw) + + except subprocess.TimeoutExpired: + return {"status": "FAIL", "error": "timeout"} + except Exception as e: + return {"status": "FAIL", "error": str(e)} + + def _run_one_nccl_test_mpirun(self, binary_name: str, label: str, + gpu_count: int, mpirun: str, min_bw: float) -> dict: + """Run nccl-tests via mpirun (multi-node or per-GPU-process mode).""" + binary = self._find_nccl_test(binary_name) + if not binary: + return {"status": "SKIP", "error": f"{binary_name} not found"} - ngpus_per_node = gpu_count cmd = [ mpirun, - "-np", str(ngpus_per_node), + "-np", str(gpu_count), "--allow-run-as-root", "-x", "NCCL_DEBUG=WARN", "-x", "CUDA_VISIBLE_DEVICES=" + ",".join(str(i) for i in range(gpu_count)), @@ -146,77 +201,119 @@ class NCCLTest: env["NCCL_DEBUG"] = "WARN" r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env) + combined = r.stdout + r.stderr + if "CUDA driver version is insufficient" in combined or \ + "Test NCCL failure" in combined: + error_msg = "NCCL/CUDA driver version mismatch" \ + if "CUDA driver version" in combined \ + else "NCCL test failure (library incompatibility)" + return {"status": "FAIL", "error": error_msg} + if r.returncode != 0: return {"status": "FAIL", "error": r.stderr[:300]} - best_algbw = 0.0 - best_busbw = 0.0 - size_results = [] - - for line in r.stdout.split("\n"): - line = line.strip() - if not line or line.startswith("#"): - continue - parts = line.split() - if len(parts) >= 7: - try: - size = int(parts[0]) - algbw = float(parts[-3]) if len(parts) >= 3 else 0 - busbw = float(parts[-2]) if len(parts) >= 2 else 0 - time_us = float(parts[2]) if len(parts) >= 3 else 0 - size_results.append({ - "size": size, - "time_us": time_us, - "algbw_gbps": algbw, - "busbw_gbps": busbw, - }) - if busbw > best_busbw: - best_busbw = busbw - if algbw > best_algbw: - best_algbw = algbw - except (ValueError, IndexError): - continue - - status = "PASS" if best_busbw >= min_bw else "WARN" - return { - "status": status, - "best_algbw_gbps": round(best_algbw, 1), - "best_busbw_gbps": round(best_busbw, 1), - "min_required_gbps": min_bw, - "by_size": size_results[-5:] if size_results else [], - } + return self._parse_nccl_output(r.stdout, min_bw) except subprocess.TimeoutExpired: return {"status": "FAIL", "error": "timeout"} except Exception as e: return {"status": "FAIL", "error": str(e)} + @staticmethod + def _parse_nccl_output(stdout: str, min_bw: float) -> dict: + """Parse nccl-tests tabular output and extract bandwidth results.""" + best_algbw = 0.0 + best_busbw = 0.0 + size_results = [] + + for line in stdout.split("\n"): + line = line.strip() + if not line or line.startswith("#"): + continue + parts = line.split() + if len(parts) >= 7: + try: + size = int(parts[0]) + algbw = float(parts[-3]) if len(parts) >= 3 else 0 + busbw = float(parts[-2]) if len(parts) >= 2 else 0 + time_us = float(parts[2]) if len(parts) >= 3 else 0 + size_results.append({ + "size": size, + "time_us": time_us, + "algbw_gbps": algbw, + "busbw_gbps": busbw, + }) + if busbw > best_busbw: + best_busbw = busbw + if algbw > best_algbw: + best_algbw = algbw + except (ValueError, IndexError): + continue + + status = "PASS" if best_busbw >= min_bw else "WARN" + return { + "status": status, + "best_algbw_gbps": round(best_algbw, 1), + "best_busbw_gbps": round(best_busbw, 1), + "min_required_gbps": min_bw, + "by_size": size_results[-5:] if size_results else [], + } + def _run_torchrun_fallback(self, gpu_count: int) -> dict: - self.console.print("[cyan]Using torchrun fallback for NCCL test[/cyan]") - default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4 - min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw)) - size_mb = 64 - elements = size_mb * 1024 * 1024 // 4 - iters = 20 + """Basic NCCL connectivity test via torchrun — verifies NCCL works but does not benchmark performance.""" + self.console.print("[yellow]nccl-tests not available, running basic NCCL connectivity check[/yellow]") code = f""" -import torch, torch.distributed as dist, time, os +import torch, torch.distributed as dist, os os.environ.setdefault("MASTER_ADDR","127.0.0.1") os.environ.setdefault("MASTER_PORT","29500") -os.environ.setdefault("NCCL_DEBUG","WARN") rank=int(os.environ.get("LOCAL_RANK",0)) ws={gpu_count} dist.init_process_group("nccl",rank=rank,world_size=ws) torch.cuda.set_device(rank) -x=torch.randn({elements},device=f"cuda:{{rank}}",dtype=torch.float32) -for _ in range(5): dist.all_reduce(x) -torch.cuda.synchronize() -s=torch.cuda.Event(enable_timing=True); e=torch.cuda.Event(enable_timing=True) -s.record() -for _ in range({iters}): dist.all_reduce(x) -e.record(); torch.cuda.synchronize() -ms=s.elapsed_time(e); gb=({elements}*4*{iters})/1e9; bw=gb/(ms/1000) -if rank==0: print(f"{{bw:.1f}}") + +x=torch.randn(1024*1024,device=f"cuda:{{rank}}",dtype=torch.float32) + +# Test AllReduce +try: + dist.all_reduce(x.clone()) + if rank==0: print("allreduce:ok") +except Exception as e: + if rank==0: print(f"allreduce:fail:{{e}}") + +# Test Broadcast +try: + dist.broadcast(x.clone(),src=0) + if rank==0: print("broadcast:ok") +except Exception as e: + if rank==0: print(f"broadcast:fail:{{e}}") + +# Test AllGather +try: + tensor_list=[torch.empty_like(x) for _ in range(ws)] + dist.all_gather(tensor_list,x.clone()) + if rank==0: print("allgather:ok") +except Exception as e: + if rank==0: print(f"allgather:fail:{{e}}") + +# Test ReduceScatter +try: + chunks=list(x.chunk(ws)) + output=torch.empty_like(chunks[0]) + dist.reduce_scatter(output,chunks) + if rank==0: print("reducescatter:ok") +except Exception as e: + if rank==0: print(f"reducescatter:fail:{{e}}") + +# Test AllToAll +try: + chunks=list(x.chunk(ws)) + output_list=[torch.empty_like(c) for c in chunks] + dist.all_to_all(output_list,chunks) + if rank==0: print("alltoall:ok") +except Exception as e: + if rank==0: print(f"alltoall:fail:{{e}}") + dist.destroy_process_group() """ import tempfile @@ -225,23 +322,44 @@ dist.destroy_process_group() tmp.close() try: + # Prefer torchrun from the same venv as the running Python + import sys + venv_torchrun = os.path.join(os.path.dirname(sys.executable), "torchrun") + torchrun_cmd = venv_torchrun if os.path.isfile(venv_torchrun) else "torchrun" + r = subprocess.run( - ["torchrun", f"--nproc_per_node={gpu_count}", tmp.name], + [torchrun_cmd, f"--nproc_per_node={gpu_count}", tmp.name], capture_output=True, text=True, timeout=120, env={**os.environ, "NCCL_DEBUG": "WARN"}, ) os.unlink(tmp.name) - lines = [l.strip() for l in r.stdout.split("\n") if l.strip()] - bw = float(lines[-1]) if lines else 0 - status = "PASS" if bw >= min_bw else "WARN" - return { - "passed": status == "PASS", - "source": "torchrun_fallback", - "tests": {"allreduce": { + + # Parse connectivity results — format: op_name:ok or op_name:fail:error + tests = {} + all_passed = True + for line in r.stdout.split("\n"): + line = line.strip() + if not line: + continue + parts = line.split(":") + op_name = parts[0] + result = parts[1] if len(parts) > 1 else "unknown" + + if result == "ok": + status = "PASS" + else: + status = "FAIL" + all_passed = False + + tests[op_name] = { "status": status, - "best_busbw_gbps": round(bw, 1), - "min_required_gbps": min_bw, - }}, + "error": ":".join(parts[2:]) if len(parts) > 2 and result == "fail" else None, + } + + return { + "passed": all_passed, + "source": "torchrun_fallback", + "tests": tests, "gpu_count": gpu_count, } except Exception as e: @@ -256,30 +374,53 @@ dist.destroy_process_group() passed = results.get("passed", False) source = results.get("source", "unknown") - verdict = "[bold green]✓ NCCL tests PASSED[/bold green]" if passed else "[bold yellow]⚠ NCCL tests WARNING[/bold yellow]" - c.print(f"{verdict} [dim](via {source})[/dim]") + + if source == "torchrun_fallback": + # Connectivity check mode + verdict = "[bold green]✓ NCCL Connectivity OK[/bold green]" if passed else "[bold red]✗ NCCL Connectivity FAILED[/bold red]" + c.print(f"{verdict} [dim](basic check via torchrun)[/dim]") + + tests = results.get("tests", {}) + if tests: + c.print("\n[dim]Operations tested:[/dim]") + for op_name, result in tests.items(): + if not isinstance(result, dict): + continue + status = result.get("status", "FAIL") + s_color = "green" if status == "PASS" else "red" + error = result.get("error") + if error: + c.print(f" [{s_color}]{op_name}[/{s_color}] — {error}") + else: + c.print(f" [{s_color}]{op_name}[/{s_color}]") + + c.print("\n[yellow]Note: functional connectivity test only (no performance data)[/yellow]") + else: + # nccl-tests mode + verdict = "[bold green]✓ NCCL tests PASSED[/bold green]" if passed else "[bold yellow]⚠ NCCL tests WARNING[/bold yellow]" + c.print(f"{verdict} [dim](via {source})[/dim]") - tests = results.get("tests", {}) - for op_name, result in tests.items(): - if not isinstance(result, dict): - continue - c.print(f"\n[bold cyan]{op_name.upper()}[/bold cyan]") - status = result.get("status", "FAIL") - s_color = "green" if status == "PASS" else ("yellow" if status == "WARN" else "red") - c.print(f" Status: [{s_color}]{status}[/{s_color}] " - f"Best bus BW: {result.get('best_busbw_gbps', 'N/A')} GB/s " - f"(min: {result.get('min_required_gbps', 'N/A')} GB/s)") + tests = results.get("tests", {}) + for op_name, result in tests.items(): + if not isinstance(result, dict): + continue + c.print(f"\n[bold cyan]{op_name.upper()}[/bold cyan]") + status = result.get("status", "FAIL") + s_color = "green" if status == "PASS" else ("yellow" if status == "WARN" else "red") + c.print(f" Status: [{s_color}]{status}[/{s_color}] " + f"Best bus BW: {result.get('best_busbw_gbps', 'N/A')} GB/s " + f"(min: {result.get('min_required_gbps', 'N/A')} GB/s)") - by_size = result.get("by_size", []) - if by_size: - t = Table(box=None, padding=(0, 1)) - t.add_column("Size", style="bold", justify="right") - t.add_column("Time (us)", justify="right") - t.add_column("Alg BW (GB/s)", justify="right") - t.add_column("Bus BW (GB/s)", justify="right") - for r in by_size: - sz = r.get("size", 0) - sz_str = f"{sz/1024:.0f}K" if sz < 1048576 else f"{sz/1048576:.0f}M" - t.add_row(sz_str, f"{r.get('time_us',0):.1f}", - f"{r.get('algbw_gbps',0):.1f}", f"{r.get('busbw_gbps',0):.1f}") - c.print(t) + by_size = result.get("by_size", []) + if by_size: + t = Table(box=None, padding=(0, 1)) + t.add_column("Size", style="bold", justify="right") + t.add_column("Time (us)", justify="right") + t.add_column("Alg BW (GB/s)", justify="right") + t.add_column("Bus BW (GB/s)", justify="right") + for r in by_size: + sz = r.get("size", 0) + sz_str = f"{sz/1024:.0f}K" if sz < 1048576 else f"{sz/1048576:.0f}M" + t.add_row(sz_str, f"{r.get('time_us',0):.1f}", + f"{r.get('algbw_gbps',0):.1f}", f"{r.get('busbw_gbps',0):.1f}") + c.print(t) diff --git a/modules/report.py b/modules/report.py index 190996a..11e335b 100644 --- a/modules/report.py +++ b/modules/report.py @@ -253,14 +253,23 @@ class ReportGenerator: d2d = mem_data.get("d2d_bandwidth_gbps", 0) h2d = mem_data.get("h2d_bandwidth_gbps", 0) d2h = mem_data.get("d2h_bandwidth_gbps", 0) - peak = mem_data.get("peak_bandwidth_gbps", 0) - eff = mem_data.get("efficiency_pct", 0) - lines.append(f"| D2D (HBM) | {d2d:.1f} GB/s | {peak:.0f} GB/s | {eff:.1f}% |") - lines.append(f"| H2D | {h2d:.1f} GB/s | - | - |") - lines.append(f"| D2H | {d2h:.1f} GB/s | - | - |") + # New format with per-metric peaks + h2d_peak = mem_data.get("h2d_peak_gbps", 0) + d2h_peak = mem_data.get("d2h_peak_gbps", 0) + d2d_peak = mem_data.get("d2d_peak_gbps", 0) + h2d_eff = mem_data.get("h2d_efficiency_pct", 0) + d2h_eff = mem_data.get("d2h_efficiency_pct", 0) + d2d_eff = mem_data.get("d2d_efficiency_pct", 0) + # Fallback for old format + if not d2d_peak: + d2d_peak = mem_data.get("peak_bandwidth_gbps", 0) + d2d_eff = mem_data.get("efficiency_pct", 0) + lines.append(f"| H2D (PCIe) | {h2d:.1f} GB/s | {h2d_peak:.0f} GB/s | {h2d_eff:.1f}% |") + lines.append(f"| D2H (PCIe) | {d2h:.1f} GB/s | {d2h_peak:.0f} GB/s | {d2h_eff:.1f}% |") + lines.append(f"| D2D (NVLink) | {d2d:.1f} GB/s | {d2d_peak:.0f} GB/s | {d2d_eff:.1f}% |") lines.append("") - verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL") - lines.append(f"**Verdict: {verdict}** (D2D efficiency {eff:.1f}%)\n") + verdict = "PASS" if d2d_eff >= 50 else ("WARN" if d2d_eff >= 30 else "FAIL") + lines.append(f"**Verdict: {verdict}** (D2D efficiency {d2d_eff:.1f}%)\n") # --- Compute Throughput --- comp_data = self._extract_compute_results(results) diff --git a/modules/stress_test.py b/modules/stress_test.py index b7c3988..02647e1 100644 --- a/modules/stress_test.py +++ b/modules/stress_test.py @@ -49,10 +49,19 @@ class StressTest: gpu_burn = self._find_gpu_burn() if gpu_burn: - return self._run_gpu_burn(gpu_burn, duration_sec, use_doubles, use_tensor_cores, target_gpus) + # 尝试使用 gpu-burn + result = self._run_gpu_burn(gpu_burn, duration_sec, use_doubles, use_tensor_cores, target_gpus) + + # 如果 gpu-burn 失败(例如显存不足),自动 fallback 到 PyTorch + if not result.get("passed") and result.get("elapsed_sec", 0) < duration_sec * 0.5: + self.console.print("\n[yellow]gpu-burn 提前退出(可能显存不足),自动切换到 PyTorch 压力测试[/yellow]") + self.console.print("[dim]PyTorch 模式会根据实际可用显存动态调整,更稳定[/dim]\n") + return self._run_pytorch_stress(duration_sec, memory_pct) + + return result - self.console.print("[yellow]gpu_burn not found, falling back to PyTorch stress test[/yellow]") - return self._run_pytorch_stress(duration_sec) + self.console.print("[yellow]gpu_burn not found, using PyTorch stress test[/yellow]") + return self._run_pytorch_stress(duration_sec, memory_pct) def _run_gpu_burn(self, gpu_burn: str, duration: int, doubles: bool, tensor_cores: bool, target_gpus: str) -> dict: @@ -107,7 +116,7 @@ class StressTest: "timestamp": datetime.now().isoformat(), } - def _run_pytorch_stress(self, duration: int) -> dict: + def _run_pytorch_stress(self, duration: int, memory_pct: int = 90) -> dict: try: import torch if not torch.cuda.is_available(): @@ -116,7 +125,7 @@ class StressTest: return {"error": "pytorch_not_available"} gpu_count = torch.cuda.device_count() - self.console.print(f"[cyan]PyTorch Stress Test ({duration}s, {gpu_count} GPUs)[/cyan]") + self.console.print(f"[cyan]PyTorch Stress Test ({duration}s, {gpu_count} GPUs, target {memory_pct}% memory)[/cyan]") gpu_status = {} t0 = time.time() @@ -125,22 +134,53 @@ class StressTest: tensors = {} for i in range(gpu_count): with torch.cuda.device(i): - props = torch.cuda.get_device_properties(i) - total_mem = getattr(props, "total_memory", None) or getattr(props, "total_mem", 0) - alloc_size = int(total_mem * 0.9) // 4 - tensors[i] = torch.randn(alloc_size, device=f"cuda:{i}", dtype=torch.float32) + # 获取实际可用显存(考虑其他进程已占用的部分) + free_mem, total_mem = torch.cuda.mem_get_info(i) + + # 根据配置的 memory_pct 计算分配大小 + # 例如:memory_pct=90 表示使用总显存的 90% + target_mem = int(total_mem * memory_pct / 100) + + # 但不能超过实际可用显存(留出 5% 安全余量) + alloc_bytes = min(target_mem, int(free_mem * 0.95)) + + # matmul(A, A.T) 需要 2x 输入显存(输入 + 输出) + # 所以分配 sqrt(alloc_bytes/4/2) 大小的方阵 + side = int((alloc_bytes / 4 / 2) ** 0.5) # float32 = 4 bytes + + actual_mem_mb = side * side * 4 / 1024 / 1024 + total_mem_mb = total_mem / 1024 / 1024 + free_mem_mb = free_mem / 1024 / 1024 + + self.console.print( + f" [dim]GPU {i}: 总显存 {total_mem_mb:.0f}MB, 可用 {free_mem_mb:.0f}MB, " + f"分配 {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - " + f"矩阵 {side}x{side}[/dim]" + ) + tensors[i] = torch.randn(side, side, device=f"cuda:{i}", dtype=torch.float32) + self.console.print(f"\n[cyan]开始压力测试,持续 {duration} 秒...[/cyan]") + + elapsed_check = 0 while time.time() - t0 < duration: for i in range(gpu_count): with torch.cuda.device(i): - tensors[i] = torch.matmul(tensors[i][:2048, :2048], tensors[i][:2048, :2048].T) + tensors[i] = torch.matmul(tensors[i], tensors[i].T) torch.cuda.synchronize() time.sleep(0.1) + + # 每 10 秒显示一次进度 + current_elapsed = time.time() - t0 + if int(current_elapsed) != int(elapsed_check) and int(current_elapsed) % 10 == 0: + self.console.print(f" [dim]已运行 {int(current_elapsed)}s / {duration}s[/dim]") + elapsed_check = current_elapsed for i in range(gpu_count): gpu_status[i] = "PASS" except RuntimeError as e: + error_msg = str(e) + self.console.print(f"\n[red]压力测试出错: {error_msg}[/red]") for i in range(gpu_count): if i not in gpu_status: gpu_status[i] = "FAIL" @@ -148,7 +188,7 @@ class StressTest: "source": "pytorch", "passed": False, "duration_sec": duration, - "error": str(e), + "error": error_msg, "gpu_status": gpu_status, } finally: