fix: resolve stress OOM, D2D efficiency calculation, NCCL execution failures
Key changes: - stress_test: use torch.cuda.mem_get_info() for free memory instead of total, allocate 40% to avoid OOM when other processes occupy GPU memory - benchmark: fix D2D efficiency by comparing to NVLink per-direction bandwidth (not HBM), add H2D/D2H efficiency against PCIe peak - nccl_test: implement direct binary → mpirun → torchrun fallback chain, fix min_bw None bug when YAML value is empty - report: update memory section to use per-metric peak fields - install_deps.sh: add NCCL compatibility detection, enhance CUDA version detection with CUDA_HOME/standard paths, improve _map_cuda_tag logging - gpu_info: parse CUDA version from nvidia-smi header (query field removed in newer drivers) - health_check: parse throttle_reasons bitmask properly, ignore gpu_idle bit - gpu_tester: fix suite summary to exclude metadata keys from pass count 🤖 Generated with [Qoder](https://qoder.com)
This commit is contained in:
parent
24934bc182
commit
f2158f6cd3
1
.gitignore
vendored
1
.gitignore
vendored
@ -13,3 +13,4 @@ reports/
|
||||
.env
|
||||
.venv/
|
||||
venv/
|
||||
.qoder/*
|
||||
|
||||
@ -310,8 +310,10 @@ def _run_full_suite(config: dict, console: Console) -> dict:
|
||||
|
||||
# Summary
|
||||
console.print("\n" + "=" * 60)
|
||||
passed = sum(1 for v in all_results.values() if not isinstance(v, dict) or "error" not in v)
|
||||
total = len(tests)
|
||||
# 只统计测试结果,排除 timestamp 等元数据
|
||||
test_results = {k: v for k, v in all_results.items() if k != "timestamp"}
|
||||
passed = sum(1 for v in test_results.values() if not isinstance(v, dict) or "error" not in v)
|
||||
total = len(test_results)
|
||||
color = "green" if passed == total else ("yellow" if passed > 0 else "red")
|
||||
console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]")
|
||||
return all_results
|
||||
|
||||
156
install_deps.sh
156
install_deps.sh
@ -25,6 +25,9 @@ PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
JOBS="${MAKE_JOBS:-$(nproc)}"
|
||||
VERBOSE="${VERBOSE:-0}"
|
||||
|
||||
# uv 配置:跨文件系统时使用 copy 模式,避免硬链接警告
|
||||
export UV_LINK_MODE="${UV_LINK_MODE:-copy}"
|
||||
|
||||
# 参数标志
|
||||
FLAG_INSTALL_SYS_DEPS=0
|
||||
FLAG_SKIP_PYTORCH=0
|
||||
@ -152,39 +155,59 @@ detect_gpu_and_driver() {
|
||||
}
|
||||
|
||||
detect_cuda_version() {
|
||||
# 方式 1: nvcc(最可靠,代表 toolkit 确实安装了)
|
||||
# 优先级 1: nvcc 在 PATH 中(最可靠,代表 CUDA Toolkit 已正确配置)
|
||||
if command -v nvcc &>/dev/null; then
|
||||
CUDA_VERSION=$(nvcc --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+')
|
||||
if [[ -n "$CUDA_VERSION" ]]; then
|
||||
ok "CUDA: $CUDA_VERSION (via nvcc)"
|
||||
ok "CUDA: $CUDA_VERSION (via nvcc in PATH)"
|
||||
_map_cuda_tag
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
# 方式 2: nvidia-smi(驱动支持的最大 CUDA 版本,非 toolkit)
|
||||
local smi_cuda
|
||||
smi_cuda=$(nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+')
|
||||
if [[ -n "$smi_cuda" ]]; then
|
||||
CUDA_VERSION="$smi_cuda"
|
||||
warn "CUDA: $CUDA_VERSION (via nvidia-smi — 仅代表驱动能力,非已安装 toolkit)"
|
||||
warn " → 若编译失败,请安装 CUDA Toolkit: apt install cuda-toolkit-${CUDA_VERSION/./-}"
|
||||
_map_cuda_tag
|
||||
return 0
|
||||
fi
|
||||
|
||||
# 方式 3: /usr/local/cuda
|
||||
if [[ -f /usr/local/cuda/version.txt ]]; then
|
||||
CUDA_VERSION=$(grep -oP '[0-9]+\.[0-9]+' /usr/local/cuda/version.txt | head -1)
|
||||
# 优先级 2: CUDA_HOME 环境变量已设置且有效
|
||||
if [[ -n "${CUDA_HOME:-}" ]] && [[ -x "${CUDA_HOME}/bin/nvcc" ]]; then
|
||||
CUDA_VERSION=$("${CUDA_HOME}/bin/nvcc" --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+')
|
||||
if [[ -n "$CUDA_VERSION" ]]; then
|
||||
ok "CUDA: $CUDA_VERSION (via /usr/local/cuda/version.txt)"
|
||||
ok "CUDA: $CUDA_VERSION (via CUDA_HOME=${CUDA_HOME})"
|
||||
# 将 CUDA_HOME/bin 加入 PATH,供后续编译使用
|
||||
export PATH="${CUDA_HOME}/bin:$PATH"
|
||||
_map_cuda_tag
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
fail "无法检测 CUDA 版本"
|
||||
echo " → 请安装 CUDA Toolkit: https://developer.nvidia.com/cuda-downloads"
|
||||
# 优先级 3: 检查标准路径 /usr/local/cuda(最常见的安装位置)
|
||||
if [[ -x "/usr/local/cuda/bin/nvcc" ]]; then
|
||||
CUDA_VERSION=$("/usr/local/cuda/bin/nvcc" --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+')
|
||||
if [[ -n "$CUDA_VERSION" ]]; then
|
||||
export CUDA_HOME="/usr/local/cuda"
|
||||
export PATH="$CUDA_HOME/bin:$PATH"
|
||||
ok "CUDA: $CUDA_VERSION (via /usr/local/cuda)"
|
||||
_map_cuda_tag
|
||||
return 0
|
||||
fi
|
||||
fi
|
||||
|
||||
# 所有方式都失败,明确报错退出
|
||||
fail "CUDA Toolkit 未找到!"
|
||||
echo ""
|
||||
echo " 当前环境状态:"
|
||||
echo " • nvcc 不在 PATH 中"
|
||||
if [[ -z "${CUDA_HOME:-}" ]]; then
|
||||
echo " • CUDA_HOME 环境变量未设置"
|
||||
else
|
||||
echo " • CUDA_HOME=${CUDA_HOME} (但 nvcc 不存在或不可执行)"
|
||||
fi
|
||||
echo " • /usr/local/cuda/bin/nvcc 不存在或不可执行"
|
||||
echo ""
|
||||
echo " 解决方案(选择其一):"
|
||||
echo " 1. 安装 CUDA Toolkit: https://developer.nvidia.com/cuda-downloads"
|
||||
echo " 2. 如果已安装,请设置环境变量:"
|
||||
echo " export CUDA_HOME=/path/to/cuda"
|
||||
echo " export PATH=\$CUDA_HOME/bin:\$PATH"
|
||||
echo " 3. 创建符号链接: sudo ln -s /path/to/cuda /usr/local/cuda"
|
||||
echo ""
|
||||
return 1
|
||||
}
|
||||
|
||||
@ -194,6 +217,8 @@ _map_cuda_tag() {
|
||||
minor="${CUDA_VERSION#*.}"
|
||||
minor="${minor%%.*}"
|
||||
|
||||
# PyTorch 官方提供的 CUDA wheel 版本: cu118, cu121, cu124, cu128
|
||||
# 选择规则: 取不超过驱动支持 CUDA 版本的最高可用 wheel
|
||||
if [[ "$major" -eq 11 ]]; then
|
||||
CUDA_TAG="cu118"
|
||||
elif [[ "$major" -eq 12 ]]; then
|
||||
@ -204,11 +229,18 @@ _map_cuda_tag() {
|
||||
else
|
||||
CUDA_TAG="cu128"
|
||||
fi
|
||||
else
|
||||
elif [[ "$major" -ge 13 ]]; then
|
||||
# CUDA 13+ 驱动,仍用 cu128(PyTorch 暂无更高版本 wheel)
|
||||
CUDA_TAG="cu128"
|
||||
warn "未知 CUDA $CUDA_VERSION,默认使用 cu128 索引"
|
||||
else
|
||||
CUDA_TAG="cu124"
|
||||
warn "未知 CUDA $CUDA_VERSION,默认使用 cu124 索引"
|
||||
fi
|
||||
log "PyTorch wheel 索引: $CUDA_TAG"
|
||||
|
||||
log "版本选择决策:"
|
||||
log " 驱动支持最高 CUDA: ${CUDA_VERSION}"
|
||||
log " PyTorch 可用 wheel: cu118 / cu121 / cu124 / cu128"
|
||||
log " → 选择: ${CUDA_TAG}(不超过 CUDA ${CUDA_VERSION} 的最高兼容版本)"
|
||||
}
|
||||
|
||||
check_python() {
|
||||
@ -286,11 +318,13 @@ check_nccl_dev() {
|
||||
if ldconfig -p 2>/dev/null | grep -q libnccl; then
|
||||
HAS_NCCL_DEV=1
|
||||
ok "libnccl: 已找到 (via ldconfig)"
|
||||
_check_nccl_compatibility
|
||||
return 0
|
||||
fi
|
||||
if [[ -f /usr/include/nccl.h ]] || dpkg -l libnccl-dev &>/dev/null 2>&1; then
|
||||
HAS_NCCL_DEV=1
|
||||
ok "libnccl-dev: 已安装"
|
||||
_check_nccl_compatibility
|
||||
return 0
|
||||
fi
|
||||
HAS_NCCL_DEV=0
|
||||
@ -299,6 +333,55 @@ check_nccl_dev() {
|
||||
return 0
|
||||
}
|
||||
|
||||
# 检测系统 NCCL 版本是否与当前驱动/CUDA 兼容
|
||||
NCCL_COMPATIBLE=1
|
||||
# Decide whether the installed libnccl2 package targets a CUDA version that the
# current driver can run, and record the verdict in the global NCCL_COMPATIBLE
# (1 = compatible or undeterminable, 0 = package needs a newer CUDA than the
# driver reports).
#
# Inputs : none (reads `dpkg -l libnccl2` and the `nvidia-smi` banner).
# Outputs: sets NCCL_COMPATIBLE; prints a warning block or an ok line.
# Returns: always 0 — an undeterminable state is treated as "assume compatible".
# Uses `warn`, `ok`, `YELLOW` and `NC`, which are defined outside this block.
_check_nccl_compatibility() {
    NCCL_COMPATIBLE=1

    # CUDA version the NCCL package was built for, taken from the "+cudaX.Y"
    # suffix of the package version. `|| true` keeps a non-matching grep from
    # aborting the script if it runs with `set -e -o pipefail`.
    local nccl_pkg_info=""
    nccl_pkg_info=$(dpkg -l libnccl2 2>/dev/null | grep -oP '\+cuda[0-9.]+' | head -1) || true
    if [[ -z "$nccl_pkg_info" ]]; then
        return 0  # cannot tell; assume compatible
    fi

    local nccl_cuda_ver="${nccl_pkg_info#+cuda}"
    local nccl_cuda_major="${nccl_cuda_ver%%.*}"
    local nccl_cuda_minor="0"
    if [[ "$nccl_cuda_ver" == *.* ]]; then
        # Only derive a minor when one is present; without this guard a bare
        # "12" would yield minor=12.
        nccl_cuda_minor="${nccl_cuda_ver#*.}"
        nccl_cuda_minor="${nccl_cuda_minor%%.*}"
    fi

    # Highest CUDA version the driver reports in the nvidia-smi banner.
    local driver_cuda=""
    driver_cuda=$(nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+') || true
    if [[ -z "$driver_cuda" ]]; then
        return 0
    fi

    local drv_cuda_major="${driver_cuda%%.*}"
    local drv_cuda_minor="${driver_cuda#*.}"
    drv_cuda_minor="${drv_cuda_minor%%.*}"

    # The arithmetic comparisons below error out on empty or non-numeric
    # operands, so treat anything unexpected as "cannot tell".
    local component
    for component in "$nccl_cuda_major" "$nccl_cuda_minor" "$drv_cuda_major" "$drv_cuda_minor"; do
        [[ "$component" =~ ^[0-9]+$ ]] || return 0
    done

    # Incompatible when the package's CUDA version is newer than the driver's.
    if [[ "$nccl_cuda_major" -gt "$drv_cuda_major" ]] || \
       { [[ "$nccl_cuda_major" -eq "$drv_cuda_major" ]] && [[ "$nccl_cuda_minor" -gt "$drv_cuda_minor" ]]; }; then
        NCCL_COMPATIBLE=0
        warn "系统 NCCL 版本不兼容!"
        echo -e " ${YELLOW}NCCL 包要求: CUDA ${nccl_cuda_ver}${NC}"
        echo -e " ${YELLOW}驱动支持最高: CUDA ${driver_cuda}${NC}"
        echo ""
        echo " 这会导致 nccl-tests 运行时报错:"
        echo " 'CUDA driver version is insufficient for CUDA runtime version'"
        echo ""
        echo " 解决方案(任选其一):"
        echo " A) 降级 NCCL: sudo apt install libnccl2=<版本>+cuda${driver_cuda}"
        echo " B) 升级驱动至支持 CUDA ${nccl_cuda_ver} 的版本"
        echo " C) 使用 PyTorch 内置 NCCL(测试套件会自动 fallback)"
        echo ""
    else
        ok "NCCL 兼容性: NCCL(cuda${nccl_cuda_ver}) <= 驱动(cuda${driver_cuda})"
    fi
}
|
||||
|
||||
install_system_deps() {
|
||||
log "安装系统依赖包..."
|
||||
if command -v apt-get &>/dev/null; then
|
||||
@ -432,7 +515,7 @@ setup_python_venv() {
|
||||
# 安装项目依赖
|
||||
log "安装 Python 依赖(rich、pyyaml、numpy)..."
|
||||
uv pip install --python "$venv_dir/bin/python" \
|
||||
-e "$PROJECT_DIR" 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -1; } || true
|
||||
-e "$PROJECT_DIR" 2>&1 || true
|
||||
ok "项目依赖安装完成"
|
||||
|
||||
# 安装 PyTorch
|
||||
@ -450,7 +533,7 @@ setup_python_venv() {
|
||||
log "(下载较大,请耐心等待...)"
|
||||
uv pip install --python "$venv_dir/bin/python" \
|
||||
"torch>=2.1.0" --index-url "$index_url" \
|
||||
2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; } || {
|
||||
2>&1 || {
|
||||
warn "PyTorch 安装失败,可稍后手动安装:"
|
||||
echo " source $INSTALL_DIR/env.sh"
|
||||
echo " uv pip install torch --index-url $index_url"
|
||||
@ -492,7 +575,12 @@ build_nvbandwidth() {
|
||||
|
||||
cd "$src"
|
||||
mkdir -p build && cd build
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; }
|
||||
|
||||
# 使用 detect_cuda_version() 中设置的 CUDA_HOME 和 PATH
|
||||
# 如果 CUDA_HOME 未设置,则使用默认路径
|
||||
local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
|
||||
|
||||
cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_COMPILER="$cuda_home/bin/nvcc" 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; }
|
||||
make -j"$JOBS" 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; }
|
||||
|
||||
if [[ -x ./nvbandwidth ]]; then
|
||||
@ -512,6 +600,9 @@ build_nccl_tests() {
|
||||
|
||||
if [[ -x "$src/build/all_reduce_perf" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
|
||||
ok "nccl-tests: 已编译 ($src/build/)"
|
||||
if [[ $NCCL_COMPATIBLE -eq 0 ]]; then
|
||||
warn "nccl-tests: 已编译但系统 NCCL 与驱动不兼容,运行时将 fallback 到 torchrun"
|
||||
fi
|
||||
return 0
|
||||
fi
|
||||
|
||||
@ -520,6 +611,15 @@ build_nccl_tests() {
|
||||
return 0
|
||||
fi
|
||||
|
||||
# NCCL 不兼容时仍然编译(编译不报错),但给出明确警告
|
||||
if [[ $NCCL_COMPATIBLE -eq 0 ]]; then
|
||||
warn "nccl-tests: 系统 NCCL 与驱动不兼容"
|
||||
warn " 编译会成功但运行时会报错 'CUDA driver version is insufficient'"
|
||||
warn " 测试套件会自动 fallback 到 torchrun 方式测试 NCCL"
|
||||
log " 如需原生 nccl-tests 性能数据,请先解决 NCCL 版本问题(见上方提示)"
|
||||
echo ""
|
||||
fi
|
||||
|
||||
local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
|
||||
if [[ ! -d "$cuda_home/include" ]]; then
|
||||
warn "nccl-tests: 跳过(CUDA_HOME=$cuda_home 无效)"
|
||||
@ -688,7 +788,11 @@ print_summary() {
|
||||
local path="${tool_info%%:*}"
|
||||
local name="${tool_info##*:}"
|
||||
if [[ -x "$path" ]]; then
|
||||
echo -e " ${GREEN}✓${NC} $name"
|
||||
if [[ "$name" == "nccl-tests" ]] && [[ $NCCL_COMPATIBLE -eq 0 ]]; then
|
||||
echo -e " ${YELLOW}⚠${NC} $name (已编译,但系统 NCCL 与驱动不兼容)"
|
||||
else
|
||||
echo -e " ${GREEN}✓${NC} $name"
|
||||
fi
|
||||
else
|
||||
echo -e " ${YELLOW}○${NC} $name (未编译)"
|
||||
fi
|
||||
|
||||
@ -78,30 +78,33 @@ class Benchmark:
|
||||
self.console.print(f"[cyan]Memory Benchmark via nvbandwidth ({nvbw_path})[/cyan]")
|
||||
|
||||
results_by_test = {}
|
||||
per_gpu_d2d = []
|
||||
|
||||
# Testcases to run — keys used internally, try both old and new names
|
||||
testcases = [
|
||||
"host_to_device_memcpy_read_ce",
|
||||
"device_to_host_memcpy_write_ce",
|
||||
"device_to_device_memcpy_write_ce",
|
||||
"device_to_device_memcpy_read_ce",
|
||||
"device_to_device_bidirectional_sm",
|
||||
("h2d", ["host_to_device_memcpy_ce", "host_to_device_memcpy_read_ce"]),
|
||||
("d2h", ["device_to_host_memcpy_ce", "device_to_host_memcpy_write_ce"]),
|
||||
("d2d_write", ["device_to_device_memcpy_write_ce"]),
|
||||
("d2d_read", ["device_to_device_memcpy_read_ce"]),
|
||||
("d2d_bidir", ["device_to_device_bidirectional_memcpy_write_sm",
|
||||
"device_to_device_bidirectional_sm"]),
|
||||
]
|
||||
|
||||
# Discover available testcase names
|
||||
available_names: list[str] = []
|
||||
try:
|
||||
list_r = subprocess.run(
|
||||
[nvbw_path, "-l", "-j"],
|
||||
capture_output=True, text=True, timeout=15,
|
||||
[nvbw_path, "-l"], capture_output=True, text=True, timeout=15,
|
||||
)
|
||||
available = []
|
||||
if list_r.returncode == 0:
|
||||
try:
|
||||
avail_list = json.loads(list_r.stdout)
|
||||
available = [t.get("name", "") for t in avail_list if isinstance(t, dict)]
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
for line in list_r.stdout.splitlines():
|
||||
line = line.strip()
|
||||
if line and ", " in line and line[0].isdigit():
|
||||
parts = line.split(", ", 1)
|
||||
name = parts[1].rstrip(":").strip()
|
||||
if name:
|
||||
available_names.append(name)
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError):
|
||||
available = []
|
||||
pass
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
|
||||
@ -110,53 +113,51 @@ class Benchmark:
|
||||
) as progress:
|
||||
task = progress.add_task("nvbandwidth tests...", total=len(testcases))
|
||||
|
||||
for tc in testcases:
|
||||
if available and tc not in available:
|
||||
for key, name_candidates in testcases:
|
||||
# Pick the first available test name
|
||||
tc = None
|
||||
for candidate in name_candidates:
|
||||
if not available_names or candidate in available_names:
|
||||
tc = candidate
|
||||
break
|
||||
if tc is None:
|
||||
progress.advance(task)
|
||||
continue
|
||||
|
||||
try:
|
||||
cmd = [
|
||||
nvbw_path,
|
||||
f"-b{buffer_mb}",
|
||||
f"-i{samples}",
|
||||
"-j",
|
||||
f"-t{tc}",
|
||||
]
|
||||
cmd = [nvbw_path, "-t", tc, "-b", str(buffer_mb),
|
||||
"-i", str(samples), "-j"]
|
||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
|
||||
|
||||
if r.returncode == 0 and r.stdout.strip():
|
||||
try:
|
||||
data = json.loads(r.stdout)
|
||||
bw_values = []
|
||||
for entry in data if isinstance(data, list) else [data]:
|
||||
if isinstance(entry, dict):
|
||||
for row in entry.get("results", []):
|
||||
val = row.get("value", 0)
|
||||
if isinstance(val, (int, float)):
|
||||
bw_values.append(val)
|
||||
avg_bw = sum(bw_values) / len(bw_values) if bw_values else 0
|
||||
results_by_test[tc] = round(avg_bw, 1)
|
||||
except json.JSONDecodeError:
|
||||
results_by_test[tc] = 0
|
||||
avg_bw = self._parse_nvbandwidth_json(r.stdout)
|
||||
results_by_test[key] = round(avg_bw, 1)
|
||||
else:
|
||||
results_by_test[tc] = 0
|
||||
results_by_test[key] = 0
|
||||
except (subprocess.TimeoutExpired, FileNotFoundError):
|
||||
results_by_test[tc] = 0
|
||||
results_by_test[key] = 0
|
||||
|
||||
progress.advance(task)
|
||||
|
||||
d2d_bw = max(
|
||||
results_by_test.get("device_to_device_memcpy_write_ce", 0),
|
||||
results_by_test.get("device_to_device_memcpy_read_ce", 0),
|
||||
results_by_test.get("device_to_device_bidirectional_sm", 0),
|
||||
)
|
||||
h2d_bw = results_by_test.get("host_to_device_memcpy_read_ce", 0)
|
||||
d2h_bw = results_by_test.get("device_to_host_memcpy_write_ce", 0)
|
||||
peak_bw = self.specs["memory_bandwidth_gbps"]
|
||||
efficiency = (
|
||||
(d2d_bw / peak_bw) * 100 if (d2d_bw and peak_bw) else 0
|
||||
results_by_test.get("d2d_write", 0),
|
||||
results_by_test.get("d2d_read", 0),
|
||||
results_by_test.get("d2d_bidir", 0),
|
||||
)
|
||||
h2d_bw = results_by_test.get("h2d", 0)
|
||||
d2h_bw = results_by_test.get("d2h", 0)
|
||||
|
||||
# D2D goes through NVLink — compare to NVLink per-direction bandwidth
|
||||
# (nvlink_bandwidth_gbps is bidirectional, so per-direction = /2)
|
||||
nvlink_bw = self.specs.get("nvlink_bandwidth_gbps", 0)
|
||||
d2d_peak = nvlink_bw / 2 if nvlink_bw else 0
|
||||
d2d_efficiency = (d2d_bw / d2d_peak) * 100 if (d2d_bw and d2d_peak) else 0
|
||||
|
||||
# H2D/D2H goes through PCIe — estimate peak from PCIe gen
|
||||
pcie_gen = self.specs.get("pcie_gen", 4)
|
||||
pcie_peak = {3: 16, 4: 32, 5: 64, 6: 128}.get(pcie_gen, 32) # GB/s x16
|
||||
h2d_efficiency = (h2d_bw / pcie_peak) * 100 if (h2d_bw and pcie_peak) else 0
|
||||
d2h_efficiency = (d2h_bw / pcie_peak) * 100 if (d2h_bw and pcie_peak) else 0
|
||||
|
||||
return {
|
||||
"memory": {
|
||||
@ -164,13 +165,55 @@ class Benchmark:
|
||||
"h2d_bandwidth_gbps": round(h2d_bw, 1),
|
||||
"d2h_bandwidth_gbps": round(d2h_bw, 1),
|
||||
"d2d_bandwidth_gbps": round(d2d_bw, 1),
|
||||
"h2d_peak_gbps": pcie_peak,
|
||||
"d2h_peak_gbps": pcie_peak,
|
||||
"d2d_peak_gbps": round(d2d_peak, 1),
|
||||
"h2d_efficiency_pct": round(h2d_efficiency, 1),
|
||||
"d2h_efficiency_pct": round(d2h_efficiency, 1),
|
||||
"d2d_efficiency_pct": round(d2d_efficiency, 1),
|
||||
"peak_bandwidth_gbps": self.specs["memory_bandwidth_gbps"],
|
||||
"efficiency_pct": round(efficiency, 1),
|
||||
"efficiency_pct": round(d2d_efficiency, 1),
|
||||
"results_by_test": results_by_test,
|
||||
"per_gpu": per_gpu_d2d,
|
||||
"per_gpu": [],
|
||||
}
|
||||
}
|
||||
|
||||
@staticmethod
def _parse_nvbandwidth_json(raw: str) -> float:
    """Return the mean bandwidth found in an nvbandwidth ``-j`` JSON document.

    Two layouts are recognised (the version labels are the original
    author's):

    * "v0.8+": ``{"nvbandwidth": {"testcases": [{"bandwidth_matrix": ...}]}}``.
      The mean of all cells of the first testcase that has at least one
      cell accepted by ``float()`` is returned; other cells (for example
      ``"N/A"``) are skipped.
    * "v0.5": a dict, or a list of dicts, each with a ``"results"`` list of
      dict rows. The mean of every numeric ``"value"`` is returned; a row
      without a ``"value"`` key counts as ``0``.

    Args:
        raw: Text captured from nvbandwidth's stdout.

    Returns:
        The mean as a float, or ``0.0`` when ``raw`` is not valid JSON, has
        an unexpected shape, or holds no usable number.
    """
    try:
        data = json.loads(raw)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers raw=None.
        return 0.0

    if isinstance(data, dict) and "nvbandwidth" in data:
        section = data["nvbandwidth"]
        testcases = section.get("testcases") if isinstance(section, dict) else None
        if not isinstance(testcases, list):
            return 0.0
        for testcase in testcases:
            if not isinstance(testcase, dict):
                continue
            matrix = testcase.get("bandwidth_matrix")
            if not isinstance(matrix, list):
                continue
            values = []
            for row in matrix:
                # Tolerate a flat matrix by treating a scalar row as one cell.
                cells = row if isinstance(row, list) else [row]
                for cell in cells:
                    try:
                        values.append(float(cell))
                    except (TypeError, ValueError):
                        continue
            if values:
                return sum(values) / len(values)
        return 0.0

    entries = data if isinstance(data, list) else [data]
    bw_values = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        rows = entry.get("results")
        if not isinstance(rows, list):
            continue
        for row in rows:
            if not isinstance(row, dict):
                continue
            val = row.get("value", 0)
            if isinstance(val, (int, float)):
                bw_values.append(val)
    return sum(bw_values) / len(bw_values) if bw_values else 0.0
|
||||
|
||||
def _run_memory_pytorch(self) -> dict:
|
||||
mem_cfg = self.bench_cfg.get("memory", {})
|
||||
test_sizes_mb = [1, 4, 16, 64, 256, 1024, 4096]
|
||||
@ -377,15 +420,16 @@ class Benchmark:
|
||||
table.add_column("Peak", justify="right")
|
||||
table.add_column("Efficiency", justify="right")
|
||||
|
||||
for label, achieved, peak in [
|
||||
("H2D (PCIe)", mem["h2d_bandwidth_gbps"], None),
|
||||
("D2H (PCIe)", mem["d2h_bandwidth_gbps"], None),
|
||||
("D2D (HBM3e)", mem["d2d_bandwidth_gbps"], mem["peak_bandwidth_gbps"]),
|
||||
for label, achieved, peak_key, eff_key in [
|
||||
("H2D (PCIe)", mem["h2d_bandwidth_gbps"], "h2d_peak_gbps", "h2d_efficiency_pct"),
|
||||
("D2H (PCIe)", mem["d2h_bandwidth_gbps"], "d2h_peak_gbps", "d2h_efficiency_pct"),
|
||||
("D2D (NVLink)", mem["d2d_bandwidth_gbps"], "d2d_peak_gbps", "d2d_efficiency_pct"),
|
||||
]:
|
||||
val_str = f"{achieved:.1f} GB/s" if isinstance(achieved, (int, float)) else "N/A"
|
||||
peak = mem.get(peak_key, 0)
|
||||
peak_str = f"{peak:.0f} GB/s" if peak else "N/A"
|
||||
if peak and isinstance(achieved, (int, float)) and achieved > 0:
|
||||
eff = (achieved / peak) * 100
|
||||
eff = mem.get(eff_key, 0)
|
||||
if eff:
|
||||
ec = "green" if eff >= 80 else ("yellow" if eff >= 50 else "red")
|
||||
eff_str = f"[{ec}]{eff:.1f}%[/{ec}]"
|
||||
else:
|
||||
|
||||
@ -67,7 +67,7 @@ class GPUInfo:
|
||||
ecc_double = self._run_smi("ecc.errors.double_bit.total.volatile").split("\n") if self._run_smi("ecc.errors.double_bit.total.volatile") else []
|
||||
|
||||
driver_info = self._run_smi("driver_version", "csv,noheader")
|
||||
cuda_info = self._run_smi("cuda_version", "csv,noheader")
|
||||
cuda_info = self._get_cuda_version()
|
||||
|
||||
def safe_get(lst, idx, default="N/A"):
|
||||
try:
|
||||
@ -116,7 +116,7 @@ class GPUInfo:
|
||||
|
||||
return {
|
||||
"driver_version": safe_get(driver_info.split("\n"), 0) if driver_info else "N/A",
|
||||
"cuda_version": safe_get(cuda_info.split("\n"), 0) if cuda_info else "N/A",
|
||||
"cuda_version": cuda_info or "N/A",
|
||||
"gpu_count": gpu_count,
|
||||
"gpus": gpus,
|
||||
"topology": topology,
|
||||
@ -125,6 +125,21 @@ class GPUInfo:
|
||||
"gpu_label": self.gpu_label,
|
||||
}
|
||||
|
||||
def _get_cuda_version(self) -> Optional[str]:
    """Return the CUDA version shown in the banner of a bare ``nvidia-smi`` run.

    The banner text is searched for ``CUDA Version: <digits and dots>``.
    According to the original author's note, the equivalent ``--query-gpu``
    field was removed in newer drivers, hence the banner parsing.

    Returns:
        The matched version text (e.g. ``"12.4"``), or ``None`` when
        ``nvidia-smi`` cannot be started, times out, exits non-zero, or
        prints no ``CUDA Version:`` field.
    """
    import re  # kept local: a module-level ``re`` import is not visible here

    try:
        completed = subprocess.run(
            ["nvidia-smi"], capture_output=True, text=True, timeout=15,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError includes FileNotFoundError (binary missing) as well as
        # PermissionError (binary present but not executable).
        return None

    if completed.returncode != 0:
        return None
    match = re.search(r"CUDA Version:\s+([\d.]+)", completed.stdout)
    return match.group(1) if match else None
|
||||
|
||||
def _get_topology(self) -> str:
|
||||
try:
|
||||
r = subprocess.run(
|
||||
|
||||
@ -125,10 +125,29 @@ class HealthCheck:
|
||||
checks["clock_speed"] = {"sm": sm, "mem": mm, "status": "PASS" if sm > 0 and mm > 0 else "WARN"}
|
||||
|
||||
throttle_val = throttling_raw[i] if i < len(throttling_raw) else ""
|
||||
throttle_active = throttle_val not in ("", "None", "Active", "N/A")
|
||||
if throttle_active:
|
||||
# Parse bitmask: 0x0 = none, 0x1 = gpu_idle (benign), others = real throttling
|
||||
throttle_reasons = []
|
||||
try:
|
||||
bitmask = int(throttle_val, 16) if throttle_val.startswith("0x") else 0
|
||||
except (ValueError, TypeError):
|
||||
bitmask = 0
|
||||
# Bit 0 = gpu_idle — not a real problem, ignore it
|
||||
real_throttle = bitmask & ~0x1
|
||||
if real_throttle:
|
||||
if real_throttle & 0x4:
|
||||
throttle_reasons.append("sw_power_cap")
|
||||
if real_throttle & 0x8:
|
||||
throttle_reasons.append("hw_slowdown")
|
||||
if real_throttle & 0x10:
|
||||
throttle_reasons.append("hw_thermal_slowdown")
|
||||
if real_throttle & 0x20:
|
||||
throttle_reasons.append("hw_power_brake")
|
||||
if real_throttle & 0x40:
|
||||
throttle_reasons.append("sw_thermal_slowdown")
|
||||
if not throttle_reasons:
|
||||
throttle_reasons.append(f"unknown(0x{real_throttle:x})")
|
||||
overall_pass = False
|
||||
checks["throttling"] = {"status": "FAIL" if throttle_active else "PASS", "reasons": [throttle_val] if throttle_active else []}
|
||||
checks["throttling"] = {"status": "FAIL" if real_throttle else "PASS", "reasons": throttle_reasons}
|
||||
|
||||
pers_val = persistence[i] if i < len(persistence) else ""
|
||||
pers_enabled = pers_val == "Enabled"
|
||||
|
||||
@ -65,11 +65,6 @@ class NCCLTest:
|
||||
self.console.print(f"[yellow]NCCL test requires at least 2 GPUs (found {gpu_count})[/yellow]")
|
||||
return {"error": "need_at_least_2_gpus", "gpu_count": gpu_count}
|
||||
|
||||
mpirun = self._find_mpirun()
|
||||
if not mpirun:
|
||||
self.console.print("[yellow]mpirun/mpiexec not found - falling back to torchrun[/yellow]")
|
||||
return self._run_torchrun_fallback(gpu_count)
|
||||
|
||||
tests = []
|
||||
if self.nccl_cfg.get("test_allreduce", True):
|
||||
tests.append(("all_reduce_perf", "AllReduce"))
|
||||
@ -84,9 +79,13 @@ class NCCLTest:
|
||||
if self.nccl_cfg.get("test_sendrecv", False):
|
||||
tests.append(("sendrecv_perf", "SendRecv"))
|
||||
|
||||
results = {}
|
||||
default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4
|
||||
min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw))
|
||||
min_bw = self.nccl_cfg.get("min_bandwidth_gbps") or round(default_min_bw)
|
||||
|
||||
# Strategy: try nccl-tests binary directly (single-node, -g N),
|
||||
# then mpirun, then torchrun fallback
|
||||
results = {}
|
||||
any_binary_worked = False
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
|
||||
@ -96,11 +95,28 @@ class NCCLTest:
|
||||
|
||||
for binary, label in tests:
|
||||
progress.update(task, description=f"NCCL {label}...")
|
||||
results[label.lower()] = self._run_one_nccl_test(
|
||||
binary, label, gpu_count, mpirun, min_bw
|
||||
result = self._run_one_nccl_test_direct(
|
||||
binary, label, gpu_count, min_bw
|
||||
)
|
||||
if result.get("status") not in ("SKIP", None) and "error" not in result:
|
||||
any_binary_worked = True
|
||||
results[label.lower()] = result
|
||||
else:
|
||||
# Try mpirun fallback
|
||||
mpirun = self._find_mpirun()
|
||||
if mpirun:
|
||||
result = self._run_one_nccl_test_mpirun(
|
||||
binary, label, gpu_count, mpirun, min_bw
|
||||
)
|
||||
if result.get("status") not in ("SKIP", None) and "error" not in result:
|
||||
any_binary_worked = True
|
||||
results[label.lower()] = result
|
||||
progress.advance(task)
|
||||
|
||||
if not any_binary_worked:
|
||||
self.console.print("[yellow]nccl-tests binaries failed, falling back to torchrun[/yellow]")
|
||||
return self._run_torchrun_fallback(gpu_count)
|
||||
|
||||
all_passed = all(
|
||||
r.get("status") == "PASS"
|
||||
for r in results.values()
|
||||
@ -117,18 +133,57 @@ class NCCLTest:
|
||||
"detected_gpu_type": self.gpu_type,
|
||||
}
|
||||
|
||||
def _run_one_nccl_test(self, binary_name: str, label: str,
|
||||
gpu_count: int, mpirun: str, min_bw: float) -> dict:
|
||||
def _run_one_nccl_test_direct(self, binary_name: str, label: str,
                              gpu_count: int, min_bw: float) -> dict:
    """Run one nccl-tests binary directly with ``-g <gpu_count>`` (no mpirun).

    The binary is located through ``self._find_nccl_test`` and started with
    ``-b 8 -e 256M -f 2 -w 5 -n 20`` (per the nccl-tests README: minimum
    size, maximum size, size step factor, warm-up iterations, timed
    iterations) and ``NCCL_DEBUG=WARN`` in its environment.

    Args:
        binary_name: Name of the nccl-tests executable, e.g. ``all_reduce_perf``.
        label: Human-readable test name; accepted for signature parity with
            the mpirun variant and not used here.
        gpu_count: Value passed to ``-g``.
        min_bw: Threshold forwarded to ``self._parse_nccl_output``.

    Returns:
        ``{"status": "SKIP", "error": ...}`` when the binary is not found;
        ``{"status": "FAIL", "error": ...}`` on a recognised NCCL/CUDA error
        message, a non-zero exit code, a timeout (180 s) or any other
        exception; otherwise whatever ``self._parse_nccl_output`` returns.
    """
    binary = self._find_nccl_test(binary_name)
    if not binary:
        return {"status": "SKIP", "error": f"{binary_name} not found"}

    cmd = [
        binary,
        "-b", "8",
        "-e", "256M",
        "-f", "2",
        "-g", str(gpu_count),
        "-w", "5",
        "-n", "20",
    ]

    try:
        env = os.environ.copy()
        env["NCCL_DEBUG"] = "WARN"
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env)

        # These messages are checked before the exit code so the caller gets
        # a specific reason rather than a raw stderr excerpt.
        combined = r.stdout + r.stderr
        if ("CUDA driver version is insufficient" in combined
                or "Test NCCL failure" in combined):
            if "CUDA driver version" in combined:
                error_msg = "NCCL/CUDA driver version mismatch"
            else:
                error_msg = "NCCL test failure (library incompatibility)"
            return {"status": "FAIL", "error": error_msg}

        if r.returncode != 0:
            # Never report an empty reason: fall back to the tail of stdout,
            # then to the exit code, when stderr is empty.
            detail = r.stderr[:300] or r.stdout[-300:] or f"exit code {r.returncode}"
            return {"status": "FAIL", "error": detail}

        return self._parse_nccl_output(r.stdout, min_bw)

    except subprocess.TimeoutExpired:
        return {"status": "FAIL", "error": "timeout"}
    except Exception as e:
        # Deliberate best-effort boundary: any launch or parse failure is
        # reported as a failed test instead of aborting the suite.
        return {"status": "FAIL", "error": str(e)}
|
||||
|
||||
def _run_one_nccl_test_mpirun(self, binary_name: str, label: str,
|
||||
gpu_count: int, mpirun: str, min_bw: float) -> dict:
|
||||
"""Run nccl-tests via mpirun (multi-node or per-GPU-process mode)."""
|
||||
binary = self._find_nccl_test(binary_name)
|
||||
if not binary:
|
||||
return {"status": "SKIP", "error": f"{binary_name} not found"}
|
||||
|
||||
ngpus_per_node = gpu_count
|
||||
cmd = [
|
||||
mpirun,
|
||||
"-np", str(ngpus_per_node),
|
||||
"-np", str(gpu_count),
|
||||
"--allow-run-as-root",
|
||||
"-x", "NCCL_DEBUG=WARN",
|
||||
"-x", "CUDA_VISIBLE_DEVICES=" + ",".join(str(i) for i in range(gpu_count)),
|
||||
@ -146,77 +201,119 @@ class NCCLTest:
|
||||
env["NCCL_DEBUG"] = "WARN"
|
||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env)
|
||||
|
||||
combined = r.stdout + r.stderr
|
||||
if "CUDA driver version is insufficient" in combined or \
|
||||
"Test NCCL failure" in combined:
|
||||
error_msg = "NCCL/CUDA driver version mismatch" \
|
||||
if "CUDA driver version" in combined \
|
||||
else "NCCL test failure (library incompatibility)"
|
||||
return {"status": "FAIL", "error": error_msg}
|
||||
|
||||
if r.returncode != 0:
|
||||
return {"status": "FAIL", "error": r.stderr[:300]}
|
||||
|
||||
best_algbw = 0.0
|
||||
best_busbw = 0.0
|
||||
size_results = []
|
||||
|
||||
for line in r.stdout.split("\n"):
|
||||
line = line.strip()
|
||||
if not line or line.startswith("#"):
|
||||
continue
|
||||
parts = line.split()
|
||||
if len(parts) >= 7:
|
||||
try:
|
||||
size = int(parts[0])
|
||||
algbw = float(parts[-3]) if len(parts) >= 3 else 0
|
||||
busbw = float(parts[-2]) if len(parts) >= 2 else 0
|
||||
time_us = float(parts[2]) if len(parts) >= 3 else 0
|
||||
size_results.append({
|
||||
"size": size,
|
||||
"time_us": time_us,
|
||||
"algbw_gbps": algbw,
|
||||
"busbw_gbps": busbw,
|
||||
})
|
||||
if busbw > best_busbw:
|
||||
best_busbw = busbw
|
||||
if algbw > best_algbw:
|
||||
best_algbw = algbw
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
status = "PASS" if best_busbw >= min_bw else "WARN"
|
||||
return {
|
||||
"status": status,
|
||||
"best_algbw_gbps": round(best_algbw, 1),
|
||||
"best_busbw_gbps": round(best_busbw, 1),
|
||||
"min_required_gbps": min_bw,
|
||||
"by_size": size_results[-5:] if size_results else [],
|
||||
}
|
||||
return self._parse_nccl_output(r.stdout, min_bw)
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return {"status": "FAIL", "error": "timeout"}
|
||||
except Exception as e:
|
||||
return {"status": "FAIL", "error": str(e)}
|
||||
|
||||
@staticmethod
def _parse_nccl_output(stdout: str, min_bw: float) -> dict:
    """Extract bandwidth figures from the tabular stdout of an nccl-tests run.

    A data row is any non-comment line with at least seven whitespace-
    separated fields whose first field is an integer size. The last four
    fields are read as ``time  algbw  busbw  #wrong`` — in the standard
    nccl-tests layout this is the trailing (in-place) column group. Lines
    starting with ``#`` and rows that do not parse are ignored.

    Args:
        stdout: Captured standard output of the nccl-tests binary.
        min_bw: Bus bandwidth the best row must reach for ``"PASS"``.

    Returns:
        A dict with ``status`` (``"PASS"`` if the best bus bandwidth is at
        least ``min_bw``, else ``"WARN"``), ``best_algbw_gbps`` and
        ``best_busbw_gbps`` (rounded to one decimal), ``min_required_gbps``
        and ``by_size`` (the last five parsed rows).
    """
    best_algbw = 0.0
    best_busbw = 0.0
    size_results = []

    for raw_line in stdout.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        fields = line.split()
        if len(fields) < 7:
            continue
        try:
            size = int(fields[0])
            # Index from the end: fields[2] is the data-type column
            # (e.g. "float"), not a time, so it must not be parsed as one.
            time_us = float(fields[-4])
            algbw = float(fields[-3])
            busbw = float(fields[-2])
        except ValueError:
            continue

        size_results.append({
            "size": size,
            "time_us": time_us,
            "algbw_gbps": algbw,
            "busbw_gbps": busbw,
        })
        best_busbw = max(best_busbw, busbw)
        best_algbw = max(best_algbw, algbw)

    status = "PASS" if best_busbw >= min_bw else "WARN"
    return {
        "status": status,
        "best_algbw_gbps": round(best_algbw, 1),
        "best_busbw_gbps": round(best_busbw, 1),
        "min_required_gbps": min_bw,
        "by_size": size_results[-5:] if size_results else [],
    }
|
||||
|
||||
def _run_torchrun_fallback(self, gpu_count: int) -> dict:
|
||||
self.console.print("[cyan]Using torchrun fallback for NCCL test[/cyan]")
|
||||
default_min_bw = self.specs.get("nvlink_bandwidth_gbps", 900) * 0.4
|
||||
min_bw = self.nccl_cfg.get("min_bandwidth_gbps", round(default_min_bw))
|
||||
size_mb = 64
|
||||
elements = size_mb * 1024 * 1024 // 4
|
||||
iters = 20
|
||||
"""Basic NCCL connectivity test via torchrun — verifies NCCL works but does not benchmark performance."""
|
||||
self.console.print("[yellow]nccl-tests not available, running basic NCCL connectivity check[/yellow]")
|
||||
|
||||
code = f"""
|
||||
import torch, torch.distributed as dist, time, os
|
||||
import torch, torch.distributed as dist, os
|
||||
os.environ.setdefault("MASTER_ADDR","127.0.0.1")
|
||||
os.environ.setdefault("MASTER_PORT","29500")
|
||||
os.environ.setdefault("NCCL_DEBUG","WARN")
|
||||
rank=int(os.environ.get("LOCAL_RANK",0))
|
||||
ws={gpu_count}
|
||||
dist.init_process_group("nccl",rank=rank,world_size=ws)
|
||||
torch.cuda.set_device(rank)
|
||||
x=torch.randn({elements},device=f"cuda:{{rank}}",dtype=torch.float32)
|
||||
for _ in range(5): dist.all_reduce(x)
|
||||
torch.cuda.synchronize()
|
||||
s=torch.cuda.Event(enable_timing=True); e=torch.cuda.Event(enable_timing=True)
|
||||
s.record()
|
||||
for _ in range({iters}): dist.all_reduce(x)
|
||||
e.record(); torch.cuda.synchronize()
|
||||
ms=s.elapsed_time(e); gb=({elements}*4*{iters})/1e9; bw=gb/(ms/1000)
|
||||
if rank==0: print(f"{{bw:.1f}}")
|
||||
|
||||
x=torch.randn(1024*1024,device=f"cuda:{{rank}}",dtype=torch.float32)
|
||||
|
||||
# Test AllReduce
|
||||
try:
|
||||
dist.all_reduce(x.clone())
|
||||
if rank==0: print("allreduce:ok")
|
||||
except Exception as e:
|
||||
if rank==0: print(f"allreduce:fail:{{e}}")
|
||||
|
||||
# Test Broadcast
|
||||
try:
|
||||
dist.broadcast(x.clone(),src=0)
|
||||
if rank==0: print("broadcast:ok")
|
||||
except Exception as e:
|
||||
if rank==0: print(f"broadcast:fail:{{e}}")
|
||||
|
||||
# Test AllGather
|
||||
try:
|
||||
tensor_list=[torch.empty_like(x) for _ in range(ws)]
|
||||
dist.all_gather(tensor_list,x.clone())
|
||||
if rank==0: print("allgather:ok")
|
||||
except Exception as e:
|
||||
if rank==0: print(f"allgather:fail:{{e}}")
|
||||
|
||||
# Test ReduceScatter
|
||||
try:
|
||||
chunks=list(x.chunk(ws))
|
||||
output=torch.empty_like(chunks[0])
|
||||
dist.reduce_scatter(output,chunks)
|
||||
if rank==0: print("reducescatter:ok")
|
||||
except Exception as e:
|
||||
if rank==0: print(f"reducescatter:fail:{{e}}")
|
||||
|
||||
# Test AllToAll
|
||||
try:
|
||||
chunks=list(x.chunk(ws))
|
||||
output_list=[torch.empty_like(c) for c in chunks]
|
||||
dist.all_to_all(output_list,chunks)
|
||||
if rank==0: print("alltoall:ok")
|
||||
except Exception as e:
|
||||
if rank==0: print(f"alltoall:fail:{{e}}")
|
||||
|
||||
dist.destroy_process_group()
|
||||
"""
|
||||
import tempfile
|
||||
@ -225,23 +322,44 @@ dist.destroy_process_group()
|
||||
tmp.close()
|
||||
|
||||
try:
|
||||
# Prefer torchrun from the same venv as the running Python
|
||||
import sys
|
||||
venv_torchrun = os.path.join(os.path.dirname(sys.executable), "torchrun")
|
||||
torchrun_cmd = venv_torchrun if os.path.isfile(venv_torchrun) else "torchrun"
|
||||
|
||||
r = subprocess.run(
|
||||
["torchrun", f"--nproc_per_node={gpu_count}", tmp.name],
|
||||
[torchrun_cmd, f"--nproc_per_node={gpu_count}", tmp.name],
|
||||
capture_output=True, text=True, timeout=120,
|
||||
env={**os.environ, "NCCL_DEBUG": "WARN"},
|
||||
)
|
||||
os.unlink(tmp.name)
|
||||
lines = [l.strip() for l in r.stdout.split("\n") if l.strip()]
|
||||
bw = float(lines[-1]) if lines else 0
|
||||
status = "PASS" if bw >= min_bw else "WARN"
|
||||
return {
|
||||
"passed": status == "PASS",
|
||||
"source": "torchrun_fallback",
|
||||
"tests": {"allreduce": {
|
||||
|
||||
# Parse connectivity results — format: op_name:ok or op_name:fail:error
|
||||
tests = {}
|
||||
all_passed = True
|
||||
for line in r.stdout.split("\n"):
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
parts = line.split(":")
|
||||
op_name = parts[0]
|
||||
result = parts[1] if len(parts) > 1 else "unknown"
|
||||
|
||||
if result == "ok":
|
||||
status = "PASS"
|
||||
else:
|
||||
status = "FAIL"
|
||||
all_passed = False
|
||||
|
||||
tests[op_name] = {
|
||||
"status": status,
|
||||
"best_busbw_gbps": round(bw, 1),
|
||||
"min_required_gbps": min_bw,
|
||||
}},
|
||||
"error": ":".join(parts[2:]) if len(parts) > 2 and result == "fail" else None,
|
||||
}
|
||||
|
||||
return {
|
||||
"passed": all_passed,
|
||||
"source": "torchrun_fallback",
|
||||
"tests": tests,
|
||||
"gpu_count": gpu_count,
|
||||
}
|
||||
except Exception as e:
|
||||
@ -256,30 +374,53 @@ dist.destroy_process_group()
|
||||
|
||||
passed = results.get("passed", False)
|
||||
source = results.get("source", "unknown")
|
||||
verdict = "[bold green]✓ NCCL tests PASSED[/bold green]" if passed else "[bold yellow]⚠ NCCL tests WARNING[/bold yellow]"
|
||||
c.print(f"{verdict} [dim](via {source})[/dim]")
|
||||
|
||||
tests = results.get("tests", {})
|
||||
for op_name, result in tests.items():
|
||||
if not isinstance(result, dict):
|
||||
continue
|
||||
c.print(f"\n[bold cyan]{op_name.upper()}[/bold cyan]")
|
||||
status = result.get("status", "FAIL")
|
||||
s_color = "green" if status == "PASS" else ("yellow" if status == "WARN" else "red")
|
||||
c.print(f" Status: [{s_color}]{status}[/{s_color}] "
|
||||
f"Best bus BW: {result.get('best_busbw_gbps', 'N/A')} GB/s "
|
||||
f"(min: {result.get('min_required_gbps', 'N/A')} GB/s)")
|
||||
if source == "torchrun_fallback":
|
||||
# Connectivity check mode
|
||||
verdict = "[bold green]✓ NCCL Connectivity OK[/bold green]" if passed else "[bold red]✗ NCCL Connectivity FAILED[/bold red]"
|
||||
c.print(f"{verdict} [dim](basic check via torchrun)[/dim]")
|
||||
|
||||
by_size = result.get("by_size", [])
|
||||
if by_size:
|
||||
t = Table(box=None, padding=(0, 1))
|
||||
t.add_column("Size", style="bold", justify="right")
|
||||
t.add_column("Time (us)", justify="right")
|
||||
t.add_column("Alg BW (GB/s)", justify="right")
|
||||
t.add_column("Bus BW (GB/s)", justify="right")
|
||||
for r in by_size:
|
||||
sz = r.get("size", 0)
|
||||
sz_str = f"{sz/1024:.0f}K" if sz < 1048576 else f"{sz/1048576:.0f}M"
|
||||
t.add_row(sz_str, f"{r.get('time_us',0):.1f}",
|
||||
f"{r.get('algbw_gbps',0):.1f}", f"{r.get('busbw_gbps',0):.1f}")
|
||||
c.print(t)
|
||||
tests = results.get("tests", {})
|
||||
if tests:
|
||||
c.print("\n[dim]Operations tested:[/dim]")
|
||||
for op_name, result in tests.items():
|
||||
if not isinstance(result, dict):
|
||||
continue
|
||||
status = result.get("status", "FAIL")
|
||||
s_color = "green" if status == "PASS" else "red"
|
||||
error = result.get("error")
|
||||
if error:
|
||||
c.print(f" [{s_color}]{op_name}[/{s_color}] — {error}")
|
||||
else:
|
||||
c.print(f" [{s_color}]{op_name}[/{s_color}]")
|
||||
|
||||
c.print("\n[yellow]Note: functional connectivity test only (no performance data)[/yellow]")
|
||||
else:
|
||||
# nccl-tests mode
|
||||
verdict = "[bold green]✓ NCCL tests PASSED[/bold green]" if passed else "[bold yellow]⚠ NCCL tests WARNING[/bold yellow]"
|
||||
c.print(f"{verdict} [dim](via {source})[/dim]")
|
||||
|
||||
tests = results.get("tests", {})
|
||||
for op_name, result in tests.items():
|
||||
if not isinstance(result, dict):
|
||||
continue
|
||||
c.print(f"\n[bold cyan]{op_name.upper()}[/bold cyan]")
|
||||
status = result.get("status", "FAIL")
|
||||
s_color = "green" if status == "PASS" else ("yellow" if status == "WARN" else "red")
|
||||
c.print(f" Status: [{s_color}]{status}[/{s_color}] "
|
||||
f"Best bus BW: {result.get('best_busbw_gbps', 'N/A')} GB/s "
|
||||
f"(min: {result.get('min_required_gbps', 'N/A')} GB/s)")
|
||||
|
||||
by_size = result.get("by_size", [])
|
||||
if by_size:
|
||||
t = Table(box=None, padding=(0, 1))
|
||||
t.add_column("Size", style="bold", justify="right")
|
||||
t.add_column("Time (us)", justify="right")
|
||||
t.add_column("Alg BW (GB/s)", justify="right")
|
||||
t.add_column("Bus BW (GB/s)", justify="right")
|
||||
for r in by_size:
|
||||
sz = r.get("size", 0)
|
||||
sz_str = f"{sz/1024:.0f}K" if sz < 1048576 else f"{sz/1048576:.0f}M"
|
||||
t.add_row(sz_str, f"{r.get('time_us',0):.1f}",
|
||||
f"{r.get('algbw_gbps',0):.1f}", f"{r.get('busbw_gbps',0):.1f}")
|
||||
c.print(t)
|
||||
|
||||
@ -253,14 +253,23 @@ class ReportGenerator:
|
||||
d2d = mem_data.get("d2d_bandwidth_gbps", 0)
|
||||
h2d = mem_data.get("h2d_bandwidth_gbps", 0)
|
||||
d2h = mem_data.get("d2h_bandwidth_gbps", 0)
|
||||
peak = mem_data.get("peak_bandwidth_gbps", 0)
|
||||
eff = mem_data.get("efficiency_pct", 0)
|
||||
lines.append(f"| D2D (HBM) | {d2d:.1f} GB/s | {peak:.0f} GB/s | {eff:.1f}% |")
|
||||
lines.append(f"| H2D | {h2d:.1f} GB/s | - | - |")
|
||||
lines.append(f"| D2H | {d2h:.1f} GB/s | - | - |")
|
||||
# New format with per-metric peaks
|
||||
h2d_peak = mem_data.get("h2d_peak_gbps", 0)
|
||||
d2h_peak = mem_data.get("d2h_peak_gbps", 0)
|
||||
d2d_peak = mem_data.get("d2d_peak_gbps", 0)
|
||||
h2d_eff = mem_data.get("h2d_efficiency_pct", 0)
|
||||
d2h_eff = mem_data.get("d2h_efficiency_pct", 0)
|
||||
d2d_eff = mem_data.get("d2d_efficiency_pct", 0)
|
||||
# Fallback for old format
|
||||
if not d2d_peak:
|
||||
d2d_peak = mem_data.get("peak_bandwidth_gbps", 0)
|
||||
d2d_eff = mem_data.get("efficiency_pct", 0)
|
||||
lines.append(f"| H2D (PCIe) | {h2d:.1f} GB/s | {h2d_peak:.0f} GB/s | {h2d_eff:.1f}% |")
|
||||
lines.append(f"| D2H (PCIe) | {d2h:.1f} GB/s | {d2h_peak:.0f} GB/s | {d2h_eff:.1f}% |")
|
||||
lines.append(f"| D2D (NVLink) | {d2d:.1f} GB/s | {d2d_peak:.0f} GB/s | {d2d_eff:.1f}% |")
|
||||
lines.append("")
|
||||
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 50 else "FAIL")
|
||||
lines.append(f"**Verdict: {verdict}** (D2D efficiency {eff:.1f}%)\n")
|
||||
verdict = "PASS" if d2d_eff >= 50 else ("WARN" if d2d_eff >= 30 else "FAIL")
|
||||
lines.append(f"**Verdict: {verdict}** (D2D efficiency {d2d_eff:.1f}%)\n")
|
||||
|
||||
# --- Compute Throughput ---
|
||||
comp_data = self._extract_compute_results(results)
|
||||
|
||||
@ -49,10 +49,19 @@ class StressTest:
|
||||
gpu_burn = self._find_gpu_burn()
|
||||
|
||||
if gpu_burn:
|
||||
return self._run_gpu_burn(gpu_burn, duration_sec, use_doubles, use_tensor_cores, target_gpus)
|
||||
# 尝试使用 gpu-burn
|
||||
result = self._run_gpu_burn(gpu_burn, duration_sec, use_doubles, use_tensor_cores, target_gpus)
|
||||
|
||||
self.console.print("[yellow]gpu_burn not found, falling back to PyTorch stress test[/yellow]")
|
||||
return self._run_pytorch_stress(duration_sec)
|
||||
# 如果 gpu-burn 失败(例如显存不足),自动 fallback 到 PyTorch
|
||||
if not result.get("passed") and result.get("elapsed_sec", 0) < duration_sec * 0.5:
|
||||
self.console.print("\n[yellow]gpu-burn 提前退出(可能显存不足),自动切换到 PyTorch 压力测试[/yellow]")
|
||||
self.console.print("[dim]PyTorch 模式会根据实际可用显存动态调整,更稳定[/dim]\n")
|
||||
return self._run_pytorch_stress(duration_sec, memory_pct)
|
||||
|
||||
return result
|
||||
|
||||
self.console.print("[yellow]gpu_burn not found, using PyTorch stress test[/yellow]")
|
||||
return self._run_pytorch_stress(duration_sec, memory_pct)
|
||||
|
||||
def _run_gpu_burn(self, gpu_burn: str, duration: int,
|
||||
doubles: bool, tensor_cores: bool, target_gpus: str) -> dict:
|
||||
@ -107,7 +116,7 @@ class StressTest:
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
def _run_pytorch_stress(self, duration: int) -> dict:
|
||||
def _run_pytorch_stress(self, duration: int, memory_pct: int = 90) -> dict:
|
||||
try:
|
||||
import torch
|
||||
if not torch.cuda.is_available():
|
||||
@ -116,7 +125,7 @@ class StressTest:
|
||||
return {"error": "pytorch_not_available"}
|
||||
|
||||
gpu_count = torch.cuda.device_count()
|
||||
self.console.print(f"[cyan]PyTorch Stress Test ({duration}s, {gpu_count} GPUs)[/cyan]")
|
||||
self.console.print(f"[cyan]PyTorch Stress Test ({duration}s, {gpu_count} GPUs, target {memory_pct}% memory)[/cyan]")
|
||||
|
||||
gpu_status = {}
|
||||
t0 = time.time()
|
||||
@ -125,22 +134,53 @@ class StressTest:
|
||||
tensors = {}
|
||||
for i in range(gpu_count):
|
||||
with torch.cuda.device(i):
|
||||
props = torch.cuda.get_device_properties(i)
|
||||
total_mem = getattr(props, "total_memory", None) or getattr(props, "total_mem", 0)
|
||||
alloc_size = int(total_mem * 0.9) // 4
|
||||
tensors[i] = torch.randn(alloc_size, device=f"cuda:{i}", dtype=torch.float32)
|
||||
# 获取实际可用显存(考虑其他进程已占用的部分)
|
||||
free_mem, total_mem = torch.cuda.mem_get_info(i)
|
||||
|
||||
# 根据配置的 memory_pct 计算分配大小
|
||||
# 例如:memory_pct=90 表示使用总显存的 90%
|
||||
target_mem = int(total_mem * memory_pct / 100)
|
||||
|
||||
# 但不能超过实际可用显存(留出 5% 安全余量)
|
||||
alloc_bytes = min(target_mem, int(free_mem * 0.95))
|
||||
|
||||
# matmul(A, A.T) 需要 2x 输入显存(输入 + 输出)
|
||||
# 所以分配 sqrt(alloc_bytes/4/2) 大小的方阵
|
||||
side = int((alloc_bytes / 4 / 2) ** 0.5) # float32 = 4 bytes
|
||||
|
||||
actual_mem_mb = side * side * 4 / 1024 / 1024
|
||||
total_mem_mb = total_mem / 1024 / 1024
|
||||
free_mem_mb = free_mem / 1024 / 1024
|
||||
|
||||
self.console.print(
|
||||
f" [dim]GPU {i}: 总显存 {total_mem_mb:.0f}MB, 可用 {free_mem_mb:.0f}MB, "
|
||||
f"分配 {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - "
|
||||
f"矩阵 {side}x{side}[/dim]"
|
||||
)
|
||||
tensors[i] = torch.randn(side, side, device=f"cuda:{i}", dtype=torch.float32)
|
||||
|
||||
self.console.print(f"\n[cyan]开始压力测试,持续 {duration} 秒...[/cyan]")
|
||||
|
||||
elapsed_check = 0
|
||||
while time.time() - t0 < duration:
|
||||
for i in range(gpu_count):
|
||||
with torch.cuda.device(i):
|
||||
tensors[i] = torch.matmul(tensors[i][:2048, :2048], tensors[i][:2048, :2048].T)
|
||||
tensors[i] = torch.matmul(tensors[i], tensors[i].T)
|
||||
torch.cuda.synchronize()
|
||||
time.sleep(0.1)
|
||||
|
||||
# 每 10 秒显示一次进度
|
||||
current_elapsed = time.time() - t0
|
||||
if int(current_elapsed) != int(elapsed_check) and int(current_elapsed) % 10 == 0:
|
||||
self.console.print(f" [dim]已运行 {int(current_elapsed)}s / {duration}s[/dim]")
|
||||
elapsed_check = current_elapsed
|
||||
|
||||
for i in range(gpu_count):
|
||||
gpu_status[i] = "PASS"
|
||||
|
||||
except RuntimeError as e:
|
||||
error_msg = str(e)
|
||||
self.console.print(f"\n[red]压力测试出错: {error_msg}[/red]")
|
||||
for i in range(gpu_count):
|
||||
if i not in gpu_status:
|
||||
gpu_status[i] = "FAIL"
|
||||
@ -148,7 +188,7 @@ class StressTest:
|
||||
"source": "pytorch",
|
||||
"passed": False,
|
||||
"duration_sec": duration,
|
||||
"error": str(e),
|
||||
"error": error_msg,
|
||||
"gpu_status": gpu_status,
|
||||
}
|
||||
finally:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user