test_gpu_scripts/install_deps.sh
qinyusen f2158f6cd3 fix: resolve stress OOM, D2D efficiency calculation, NCCL execution failures
Key changes:
- stress_test: use torch.cuda.mem_get_info() for free memory instead of total,
  allocate 40% to avoid OOM when other processes occupy GPU memory
- benchmark: fix D2D efficiency by comparing to NVLink per-direction bandwidth
  (not HBM), add H2D/D2H efficiency against PCIe peak
- nccl_test: implement direct binary → mpirun → torchrun fallback chain,
  fix min_bw None bug when YAML value is empty
- report: update memory section to use per-metric peak fields
- install_deps.sh: add NCCL compatibility detection, enhance CUDA version
  detection with CUDA_HOME/standard paths, improve _map_cuda_tag logging
- gpu_info: parse CUDA version from nvidia-smi header (query field removed
  in newer drivers)
- health_check: parse throttle_reasons bitmask properly, ignore gpu_idle bit
- gpu_tester: fix suite summary to exclude metadata keys from pass count

🤖 Generated with [Qoder](https://qoder.com)
2026-05-07 18:09:22 +08:00

854 lines
30 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# =============================================================================
# GPU Test Suite — 一键安装脚本(环境隔离版)
# 支持: A100 / A800 / H100 / H200 / B200 / B300
#
# 功能:
# 1. 环境校验(GPU、CUDA、Python、编译器等)
# 2. 自动安装 uv 并创建隔离 Python 虚拟环境
# 3. 自动检测 CUDA 版本并安装对应 PyTorch
# 4. 编译 nvbandwidth / nccl-tests / gpu-burn
# 5. 生成 env.sh 激活脚本和 run-gpu-tests 运行器
#
# 用法:
# sudo bash install_deps.sh # 标准安装
# sudo bash install_deps.sh --install-system-deps # 同时安装系统包
# sudo bash install_deps.sh --skip-pytorch # 跳过 PyTorch
# sudo bash install_deps.sh --rebuild # 强制重新编译
# sudo bash install_deps.sh -y # 非交互模式
# =============================================================================
# Treat unset variables as errors and let a pipeline fail when any stage fails.
# (-e is not enabled here.)
set -u
set -o pipefail

# ─── Global configuration ────────────────────────────────────────────────────
# Install prefix (override with GPU_TOOLS_DIR) and the directory of this script.
INSTALL_DIR=${GPU_TOOLS_DIR:-/opt/gpu-test-tools}
PROJECT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# Build parallelism (override with MAKE_JOBS) and verbosity switch.
JOBS=${MAKE_JOBS:-$(nproc)}
VERBOSE=${VERBOSE:-0}

# uv: default to copy mode so cross-filesystem installs avoid hard-link warnings.
: "${UV_LINK_MODE:=copy}"
export UV_LINK_MODE

# Command-line flags (set by parse_args).
FLAG_INSTALL_SYS_DEPS=0 FLAG_SKIP_PYTORCH=0 FLAG_REBUILD=0 FLAG_YES=0

# Detection results shared between the stages.
DETECTED_GPU="" DETECTED_DRIVER=""
CUDA_VERSION="" CUDA_TAG=""
PYTHON_BIN=""
HAS_MPI=0 HAS_NCCL_DEV=0
# ─── 颜色和日志 ──────────────────────────────────────────────────────────────
# ANSI colour escapes; they are expanded by `echo -e` at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Shared emitter: prints "<tag> <message...>" with escape sequences interpreted.
_emit() {
  local tag=$1
  shift
  echo -e "${tag} $*"
}

# Levelled log helpers. die additionally terminates the script with status 1.
log() { _emit "${CYAN}[INFO]${NC}" "$@"; }
ok() { _emit "${GREEN}[ OK ]${NC}" "$@"; }
warn() { _emit "${YELLOW}[WARN]${NC}" "$@"; }
fail() { _emit "${RED}[FAIL]${NC}" "$@"; }
die() {
  _emit "${RED}[FATAL]${NC}" "$@"
  exit 1
}

# Section banner surrounded by blank lines.
banner() {
  echo -e "\n${BOLD}${CYAN}══════ $* ══════${NC}\n"
}

# Error trap: report the line number of a failing command.
trap 'fail "脚本在第 $LINENO 行出错。设置 VERBOSE=1 查看详情。"' ERR
# ─── 参数解析 ─────────────────────────────────────────────────────────────────
# Translate command-line options into the FLAG_* globals.
# -h/--help prints the usage text and exits 0; unknown options only produce a
# warning and are otherwise ignored.
parse_args() {
  local opt
  for opt in "$@"; do
    case "$opt" in
      --install-system-deps)
        FLAG_INSTALL_SYS_DEPS=1
        ;;
      --skip-pytorch)
        FLAG_SKIP_PYTORCH=1
        ;;
      --rebuild)
        FLAG_REBUILD=1
        ;;
      -y | --yes)
        FLAG_YES=1
        ;;
      -h | --help)
        printf '%s\n' \
          "用法: $0 [选项]" \
          "" \
          "选项:" \
          " --install-system-deps 自动安装缺失的系统包" \
          " --skip-pytorch 跳过 PyTorch 安装" \
          " --rebuild 强制重新编译原生工具" \
          " -y, --yes 非交互模式" \
          " -h, --help 显示此帮助" \
          "" \
          "环境变量:" \
          " GPU_TOOLS_DIR 安装目录 (默认: /opt/gpu-test-tools)" \
          " MAKE_JOBS 编译并行数 (默认: nproc)" \
          " CUDA_HOME CUDA 安装路径 (默认: /usr/local/cuda)"
        exit 0
        ;;
      *)
        warn "未知参数: $opt"
        ;;
    esac
  done
}
# ─── 阶段 0: 权限检查 ─────────────────────────────────────────────────────────
# Make sure the install directory exists and is writable.
# Globals: INSTALL_DIR (read)
# Exits (via die) when the directory cannot be created or written to.
#
# The directory is created first and the result is tested directly, so a
# target whose parents do not exist yet is accepted, while an existing but
# unwritable target is rejected up front.
check_permissions() {
  if ! mkdir -p "$INSTALL_DIR" 2>/dev/null || [[ ! -w "$INSTALL_DIR" ]]; then
    die "无法写入 ${INSTALL_DIR}(请使用 sudo 或设置 GPU_TOOLS_DIR 到可写路径)"
  fi
}
# ─── 阶段 1: 环境校验 ─────────────────────────────────────────────────────────
# Minimum driver major version per GPU model (consulted by detect_gpu_and_driver).
declare -A MIN_DRIVERS
MIN_DRIVERS["A100"]="470"
MIN_DRIVERS["A800"]="470"
MIN_DRIVERS["H100"]="535"
MIN_DRIVERS["H200"]="535"
MIN_DRIVERS["B200"]="550"
MIN_DRIVERS["B300"]="550"
# Verify that nvidia-smi can be resolved as a command.
# Returns: 0 when available, 1 otherwise (after printing a hint).
check_nvidia_smi() {
  if command -v nvidia-smi &>/dev/null; then
    ok "nvidia-smi 可用"
    return 0
  fi

  fail "nvidia-smi 未找到"
  echo " → 请先安装 NVIDIA 驱动"
  return 1
}
# Query the first GPU's name and driver version and compare the driver's major
# version with the per-model minimum in MIN_DRIVERS.
# Globals: MIN_DRIVERS (read); DETECTED_GPU, DETECTED_DRIVER (written)
# Returns: 1 when nvidia-smi yields no data, 0 otherwise. A driver below the
#          minimum only produces a warning.
detect_gpu_and_driver() {
  local smi_out
  smi_out=$(nvidia-smi --query-gpu=name,driver_version --format=csv,noheader 2>/dev/null | head -1)
  if [[ -z "$smi_out" ]]; then
    warn "无法查询 GPU 信息"
    return 1
  fi

  # CSV fields: "<name>, <driver>"; xargs trims the surrounding whitespace.
  DETECTED_GPU=$(echo "$smi_out" | cut -d',' -f1 | xargs)
  DETECTED_DRIVER=$(echo "$smi_out" | cut -d',' -f2 | xargs)

  # Find the table entry whose model name occurs in the GPU name
  # (case-insensitive). The loop variable is local so it does not leak.
  local key gpu_key=""
  for key in "${!MIN_DRIVERS[@]}"; do
    if [[ "${DETECTED_GPU^^}" == *"${key^^}"* ]]; then
      gpu_key="$key"
      break
    fi
  done

  if [[ -z "$gpu_key" ]]; then
    ok "GPU: $DETECTED_GPU | 驱动: $DETECTED_DRIVER"
    return 0
  fi

  local min_drv="${MIN_DRIVERS[$gpu_key]}"
  local drv_major="${DETECTED_DRIVER%%.*}"

  # Guard the integer comparison against non-numeric driver strings.
  if [[ ! "$drv_major" =~ ^[0-9]+$ ]]; then
    warn "无法解析驱动版本: $DETECTED_DRIVER"
    return 0
  fi

  if [[ "$drv_major" -lt "$min_drv" ]]; then
    warn "驱动 $DETECTED_DRIVER < 最低要求 ${min_drv}(${gpu_key} 需要)"
  else
    ok "GPU: $DETECTED_GPU | 驱动: $DETECTED_DRIVER (>= $min_drv)"
  fi
  return 0
}
# Print the "<major>.<minor>" release reported by the nvcc binary given as $1.
# Prints nothing when nvcc fails or its output has no release string.
_nvcc_release() {
  "$1" --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+'
}

# Locate the CUDA Toolkit, record its version and pick the PyTorch wheel tag.
# Search order: nvcc on PATH, then $CUDA_HOME/bin/nvcc, then
# /usr/local/cuda/bin/nvcc.
# Globals: CUDA_VERSION (written); CUDA_HOME and PATH (exported when derived)
# Returns: 0 on success, 1 after printing troubleshooting hints.
detect_cuda_version() {
  local nvcc_path

  # Priority 1: nvcc is already on PATH.
  if nvcc_path=$(command -v nvcc 2>/dev/null); then
    CUDA_VERSION=$(_nvcc_release nvcc)
    if [[ -n "$CUDA_VERSION" ]]; then
      # The build steps fall back to /usr/local/cuda when CUDA_HOME is unset.
      # Derive it from the nvcc location so they use the toolkit found here.
      # Only absolute ".../bin/nvcc" paths qualify.
      if [[ -z "${CUDA_HOME:-}" && "$nvcc_path" == /*/bin/nvcc ]]; then
        export CUDA_HOME="${nvcc_path%/bin/nvcc}"
      fi
      ok "CUDA: $CUDA_VERSION (via nvcc in PATH)"
      _map_cuda_tag
      return 0
    fi
  fi

  # Priority 2: CUDA_HOME is set and contains an executable nvcc.
  if [[ -n "${CUDA_HOME:-}" ]] && [[ -x "${CUDA_HOME}/bin/nvcc" ]]; then
    CUDA_VERSION=$(_nvcc_release "${CUDA_HOME}/bin/nvcc")
    if [[ -n "$CUDA_VERSION" ]]; then
      ok "CUDA: $CUDA_VERSION (via CUDA_HOME=${CUDA_HOME})"
      # Put CUDA_HOME/bin on PATH for the later build steps.
      export PATH="${CUDA_HOME}/bin:$PATH"
      _map_cuda_tag
      return 0
    fi
  fi

  # Priority 3: the conventional /usr/local/cuda location.
  if [[ -x "/usr/local/cuda/bin/nvcc" ]]; then
    CUDA_VERSION=$(_nvcc_release "/usr/local/cuda/bin/nvcc")
    if [[ -n "$CUDA_VERSION" ]]; then
      export CUDA_HOME="/usr/local/cuda"
      export PATH="$CUDA_HOME/bin:$PATH"
      ok "CUDA: $CUDA_VERSION (via /usr/local/cuda)"
      _map_cuda_tag
      return 0
    fi
  fi

  # Nothing worked: explain what was tried and how to fix it.
  fail "CUDA Toolkit 未找到!"
  echo ""
  echo " 当前环境状态:"
  echo " • nvcc 不在 PATH 中"
  if [[ -z "${CUDA_HOME:-}" ]]; then
    echo " • CUDA_HOME 环境变量未设置"
  else
    echo " • CUDA_HOME=${CUDA_HOME} (但 nvcc 不存在或不可执行)"
  fi
  echo " • /usr/local/cuda/bin/nvcc 不存在或不可执行"
  echo ""
  echo " 解决方案(选择其一):"
  echo " 1. 安装 CUDA Toolkit: https://developer.nvidia.com/cuda-downloads"
  echo " 2. 如果已安装,请设置环境变量:"
  echo " export CUDA_HOME=/path/to/cuda"
  echo " export PATH=\$CUDA_HOME/bin:\$PATH"
  echo " 3. 创建符号链接: sudo ln -s /path/to/cuda /usr/local/cuda"
  echo ""
  return 1
}
# Map CUDA_VERSION ("<major>.<minor>[.<patch>]") to a PyTorch wheel index tag.
# Globals: CUDA_VERSION (read), CUDA_TAG (written)
#
# Wheel tags considered: cu118, cu121, cu124, cu128.
# Rule: pick the highest tag that does not exceed the detected CUDA version.
# CUDA 12.5-12.7 therefore maps to cu124, and only 12.8+ selects cu128.
# CUDA 12.0 and 11.x below 11.8 have no lower tag in the list and take the
# nearest one.
_map_cuda_tag() {
  local major minor
  major="${CUDA_VERSION%%.*}"
  minor="${CUDA_VERSION#*.}"
  minor="${minor%%.*}"

  if [[ ! "$major" =~ ^[0-9]+$ ]] || [[ ! "$minor" =~ ^[0-9]+$ ]]; then
    # Empty or malformed version string: use the default index.
    CUDA_TAG="cu124"
    warn "未知 CUDA $CUDA_VERSION,默认使用 cu124 索引"
  elif [[ "$major" -eq 11 ]]; then
    CUDA_TAG="cu118"
  elif [[ "$major" -eq 12 ]]; then
    if [[ "$minor" -le 1 ]]; then
      CUDA_TAG="cu121"
    elif [[ "$minor" -le 7 ]]; then
      CUDA_TAG="cu124"
    else
      CUDA_TAG="cu128"
    fi
  elif [[ "$major" -ge 13 ]]; then
    # CUDA 13+: cu128 is the highest tag in the list above.
    CUDA_TAG="cu128"
  else
    CUDA_TAG="cu124"
    warn "未知 CUDA $CUDA_VERSION,默认使用 cu124 索引"
  fi

  # CUDA_VERSION is parsed from nvcc, so it is the toolkit version rather than
  # the driver's maximum supported CUDA version.
  log "版本选择决策:"
  log " 检测到的 CUDA Toolkit: ${CUDA_VERSION}"
  log " PyTorch 可用 wheel: cu118 / cu121 / cu124 / cu128"
  log " → 选择: ${CUDA_TAG}(不超过 CUDA ${CUDA_VERSION} 的最高兼容版本)"
}
# Find a Python interpreter whose major version is >= 3 and minor version is
# >= 10, trying python3.12, python3.11, python3.10 and python3 in that order.
# Globals: PYTHON_BIN (written with the `command -v` result)
# Returns: 0 when one is found, 1 otherwise.
check_python() {
  local chosen=""
  local reported py_major py_minor
  for cmd in python3.12 python3.11 python3.10 python3; do
    command -v "$cmd" &>/dev/null || continue
    reported=$("$cmd" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null)
    py_major="${reported%%.*}"
    py_minor="${reported#*.}"
    if [[ "$py_major" -ge 3 ]] && [[ "$py_minor" -ge 10 ]]; then
      chosen="$cmd"
      break
    fi
  done

  if [[ -n "$chosen" ]]; then
    PYTHON_BIN="$(command -v "$chosen")"
    ok "Python: $("$chosen" --version 2>&1) ($PYTHON_BIN)"
    return 0
  fi

  fail "Python >= 3.10 未找到"
  echo " → apt install python3.11 python3.11-venv"
  return 1
}
# Require cmake >= 3.18. The version is taken from the first
# "<digits>.<digits>" found on the first line of `cmake --version`.
# Returns: 0 when cmake is new enough, 1 when missing or too old.
check_cmake() {
  command -v cmake &>/dev/null || {
    fail "cmake 未找到(编译 nvbandwidth 需要 >= 3.18"
    echo " → apt install cmake"
    return 1
  }

  local found_ver ver_major ver_minor
  found_ver=$(cmake --version | head -1 | grep -oP '[0-9]+\.[0-9]+')
  ver_major="${found_ver%%.*}"
  ver_minor="${found_ver#*.}"

  # Too old when major < 3, or major == 3 with minor < 18.
  local too_old=0
  if [[ "$ver_major" -lt 3 ]]; then
    too_old=1
  elif [[ "$ver_major" -eq 3 ]] && [[ "$ver_minor" -lt 18 ]]; then
    too_old=1
  fi

  if [[ $too_old -eq 1 ]]; then
    fail "cmake $found_ver < 3.18nvbandwidth 需要 >= 3.18"
    echo " → 升级 cmake: pip install cmake 或从源码安装"
    return 1
  fi

  ok "cmake: $found_ver"
  return 0
}
# Require both gcc and g++ on PATH and report `gcc -dumpversion`.
# Returns: 0 when both exist, 1 otherwise.
check_compiler() {
  local tool missing=0
  for tool in gcc g++; do
    if ! command -v "$tool" &>/dev/null; then
      missing=1
      break
    fi
  done

  if [[ $missing -eq 1 ]]; then
    fail "gcc/g++ 未找到"
    echo " → apt install build-essential"
    return 1
  fi

  local compiler_ver
  compiler_ver=$(gcc -dumpversion 2>/dev/null)
  ok "gcc/g++: $compiler_ver"
  return 0
}
# Detect an MPI launcher (mpirun preferred, mpiexec accepted).
# Globals: HAS_MPI (written: 1 when a launcher exists, else 0)
# Returns: always 0; MPI is optional.
#
# The version banner is read from whichever launcher was actually found, so a
# host that only ships mpiexec no longer prints "mpirun: command not found".
check_mpi() {
  local candidate launcher=""
  for candidate in mpirun mpiexec; do
    if command -v "$candidate" &>/dev/null; then
      launcher="$candidate"
      break
    fi
  done

  if [[ -z "$launcher" ]]; then
    HAS_MPI=0
    warn "mpirun 未找到(nccl-tests 将不使用 MPI 模式)"
    echo " → apt install openmpi-bin libopenmpi-dev"
    return 0
  fi

  HAS_MPI=1
  local version_line
  version_line=$("$launcher" --version 2>&1)
  # Keep only the first line of the banner.
  version_line="${version_line%%$'\n'*}"
  ok "MPI: $version_line"
  return 0
}
# Detect an NCCL installation: first via the dynamic linker cache, then via
# /usr/include/nccl.h or the libnccl-dev package record.
# Globals: HAS_NCCL_DEV (written)
# Returns: always 0; NCCL is optional.
#
# The ldconfig listing is captured before matching. Piping it into `grep -q`
# lets grep exit on the first hit while ldconfig is still writing; under
# `set -o pipefail` the resulting SIGPIPE status (141) fails the pipeline and
# yields a false "not found".
check_nccl_dev() {
  local ld_cache
  ld_cache=$(ldconfig -p 2>/dev/null)
  if [[ "$ld_cache" == *libnccl* ]]; then
    HAS_NCCL_DEV=1
    ok "libnccl: 已找到 (via ldconfig)"
    _check_nccl_compatibility
    return 0
  fi

  if [[ -f /usr/include/nccl.h ]] || dpkg -l libnccl-dev &>/dev/null; then
    HAS_NCCL_DEV=1
    ok "libnccl-dev: 已安装"
    _check_nccl_compatibility
    return 0
  fi

  HAS_NCCL_DEV=0
  warn "libnccl-dev 未找到(将跳过 nccl-tests 编译)"
  echo " → apt install libnccl-dev libnccl2"
  return 0
}
# Compare the CUDA version encoded in the libnccl2 package version ("+cudaX.Y")
# with the "CUDA Version" shown in the nvidia-smi header.
# Globals: NCCL_COMPATIBLE (written: 0 when the package's CUDA is newer than the
#          driver's, otherwise 1; stays 1 when either version is unavailable)
NCCL_COMPATIBLE=1
_check_nccl_compatibility() {
  NCCL_COMPATIBLE=1

  # CUDA suffix of the installed libnccl2 package, e.g. "+cuda12.4".
  local pkg_suffix=""
  pkg_suffix=$(dpkg -l libnccl2 2>/dev/null | grep -oP '\+cuda[0-9.]+' | head -1)
  if [[ -z "$pkg_suffix" ]]; then
    return 0 # cannot tell; assume compatible
  fi
  local nccl_cuda="${pkg_suffix#+cuda}"
  local nccl_major="${nccl_cuda%%.*}"
  local nccl_minor="${nccl_cuda#*.}"
  nccl_minor="${nccl_minor%%.*}"

  # CUDA version reported in the nvidia-smi header.
  local smi_cuda=""
  smi_cuda=$(nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+')
  if [[ -z "$smi_cuda" ]]; then
    return 0
  fi
  local smi_major="${smi_cuda%%.*}"
  local smi_minor="${smi_cuda#*.}"
  smi_minor="${smi_minor%%.*}"

  # NCCL built for a newer CUDA than the driver reports → incompatible.
  local too_new=0
  if [[ "$nccl_major" -gt "$smi_major" ]]; then
    too_new=1
  elif [[ "$nccl_major" -eq "$smi_major" ]] && [[ "$nccl_minor" -gt "$smi_minor" ]]; then
    too_new=1
  fi

  if [[ $too_new -eq 1 ]]; then
    NCCL_COMPATIBLE=0
    warn "系统 NCCL 版本不兼容!"
    echo -e " ${YELLOW}NCCL 包要求: CUDA ${nccl_cuda}${NC}"
    echo -e " ${YELLOW}驱动支持最高: CUDA ${smi_cuda}${NC}"
    echo ""
    echo " 这会导致 nccl-tests 运行时报错:"
    echo " 'CUDA driver version is insufficient for CUDA runtime version'"
    echo ""
    echo " 解决方案(任选其一):"
    echo " A) 降级 NCCL: sudo apt install libnccl2=<版本>+cuda${smi_cuda}"
    echo " B) 升级驱动至支持 CUDA ${nccl_cuda} 的版本"
    echo " C) 使用 PyTorch 内置 NCCL测试套件会自动 fallback"
    echo ""
  else
    ok "NCCL 兼容性: NCCL(cuda${nccl_cuda}) <= 驱动(cuda${smi_cuda})"
  fi
}
# Install build and runtime packages with the first available package manager
# (apt-get, then dnf, then yum). Install failures only produce warnings.
install_system_deps() {
  log "安装系统依赖包..."

  # Pick the package manager in priority order.
  local candidate pkg_mgr=""
  for candidate in apt-get dnf yum; do
    if command -v "$candidate" &>/dev/null; then
      pkg_mgr="$candidate"
      break
    fi
  done

  case "$pkg_mgr" in
    apt-get)
      local -a deb_pkgs=(
        build-essential git cmake wget curl
        openmpi-bin libopenmpi-dev openssh-client
        infiniband-diags ibverbs-utils perftest
        python3 python3-pip python3-venv
        libnccl-dev libnccl2
      )
      apt-get update -qq
      apt-get install -y -qq "${deb_pkgs[@]}" 2>/dev/null \
        || warn "部分包安装失败(可能已安装)"
      ;;
    dnf | yum)
      local -a rpm_pkgs=(
        git cmake wget curl
        openmpi openmpi-devel openssh-clients
        infiniband-diags libibverbs-utils perftest
        python3 python3-pip
      )
      "$pkg_mgr" groupinstall -y "Development Tools" 2>/dev/null || true
      "$pkg_mgr" install -y "${rpm_pkgs[@]}" 2>/dev/null \
        || warn "部分包安装失败"
      ;;
    *)
      warn "未识别的包管理器,请手动安装依赖"
      ;;
  esac

  ok "系统依赖安装完成"
}
# Stage 1: verify the prerequisites and optionally install missing packages.
# Globals: FLAG_INSTALL_SYS_DEPS (read)
# Exits (via die) when required components are still missing.
#
# Failures are tracked in two groups:
#   * driver_errors - nvidia-smi / CUDA Toolkit. install_system_deps does not
#     install these, so they stay counted after the install attempt.
#   * pkg_errors    - python / cmake / compiler, re-checked after the attempt.
# Previously the counter was reset before the re-check, so a missing driver or
# CUDA Toolkit was forgotten and installation continued with an empty CUDA tag.
validate_environment() {
  banner "阶段 1/6: 环境校验"
  local driver_errors=0 pkg_errors=0 errors=0

  check_nvidia_smi || driver_errors=$((driver_errors + 1))
  detect_gpu_and_driver || true
  detect_cuda_version || driver_errors=$((driver_errors + 1))
  check_python || pkg_errors=$((pkg_errors + 1))
  check_cmake || pkg_errors=$((pkg_errors + 1))
  check_compiler || pkg_errors=$((pkg_errors + 1))
  check_mpi || true
  check_nccl_dev || true
  errors=$((driver_errors + pkg_errors))

  echo ""
  if [[ $errors -gt 0 ]]; then
    fail "环境校验发现 $errors 个必要组件缺失"
    if [[ $FLAG_INSTALL_SYS_DEPS -eq 1 ]]; then
      log "检测到 --install-system-deps尝试安装..."
      install_system_deps

      # Re-check only what the package install could have fixed.
      pkg_errors=0
      check_python || pkg_errors=$((pkg_errors + 1))
      check_cmake || pkg_errors=$((pkg_errors + 1))
      check_compiler || pkg_errors=$((pkg_errors + 1))
      check_mpi || true
      check_nccl_dev || true
      errors=$((driver_errors + pkg_errors))

      if [[ $errors -gt 0 ]]; then
        die "安装系统包后仍有 $errors 个组件缺失,请手动解决"
      fi
    else
      echo ""
      echo " 提示: 加 --install-system-deps 参数可自动安装缺失的系统包"
      echo " 或手动运行上面提示的 apt install 命令后重试"
      die "环境校验未通过"
    fi
  fi
  ok "环境校验通过"
}
# ─── 阶段 2: 安装 uv ──────────────────────────────────────────────────────────
# Stage 2: make the `uv` command available. Uses an existing install when there
# is one (on PATH or in a few well-known directories); otherwise runs the
# installer script from astral.sh.
# Exits (via die) when installation fails or uv is still not found afterwards.
ensure_uv() {
  banner "阶段 2/6: 确保 uv 可用"

  # Already on PATH.
  if command -v uv &>/dev/null; then
    ok "uv 已安装: $(uv --version 2>&1)"
    return 0
  fi

  # Known install locations that may not be on PATH yet.
  local candidate
  for candidate in "$HOME/.local/bin/uv" "$HOME/.cargo/bin/uv" /usr/local/bin/uv; do
    [[ -x "$candidate" ]] || continue
    PATH="${candidate%/*}:$PATH"
    export PATH
    ok "uv 已找到: $candidate"
    return 0
  done

  log "正在安装 uv..."
  curl -LsSf https://astral.sh/uv/install.sh | sh 2>/dev/null \
    || die "uv 安装失败。请手动安装: https://docs.astral.sh/uv/getting-started/installation/"

  # Add the directories this script expects the installer to use.
  PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
  export PATH
  command -v uv &>/dev/null || die "uv 安装后仍无法找到。请检查 PATH。"
  ok "uv 安装成功: $(uv --version 2>&1)"
}
# ─── 阶段 3: Python 虚拟环境 ──────────────────────────────────────────────────
# Stage 3: create (or reuse) the isolated virtual environment under
# $INSTALL_DIR/.venv, install the project in editable mode, then install PyTorch
# from the wheel index matching CUDA_TAG.
# Globals: INSTALL_DIR, PROJECT_DIR, PYTHON_BIN, CUDA_TAG, FLAG_REBUILD,
#          FLAG_SKIP_PYTORCH (all read)
# Exits (via die) only when the virtual environment cannot be created.
# Dependency and PyTorch failures are reported as warnings and stay non-fatal.
setup_python_venv() {
  banner "阶段 3/6: 创建 Python 虚拟环境"
  local venv_dir="$INSTALL_DIR/.venv"
  local venv_py="$venv_dir/bin/python"

  # Reuse an existing venv unless a rebuild is requested. One whose Python is
  # older than 3.10 is removed and recreated.
  if [[ -x "$venv_py" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    local existing_ver ev_major ev_minor
    existing_ver=$("$venv_py" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "0.0")
    ev_major="${existing_ver%%.*}"
    ev_minor="${existing_ver#*.}"
    if [[ "$ev_major" -ge 3 ]] && [[ "$ev_minor" -ge 10 ]]; then
      ok "虚拟环境已存在: $venv_dir (Python $existing_ver)"
    else
      log "已有 venv 的 Python 版本过低 ($existing_ver),重建中..."
      rm -rf "${venv_dir:?}"
    fi
  fi

  # Create the venv. Nothing below can work without it, so failure is fatal
  # instead of being reported as success.
  if [[ ! -x "$venv_py" ]]; then
    log "创建虚拟环境: $venv_dir"
    if ! uv venv "$venv_dir" --python "$PYTHON_BIN"; then
      die "虚拟环境创建失败: $venv_dir"
    fi
    ok "虚拟环境创建成功"
  fi

  # Project dependencies: best effort, but a failure is no longer reported as OK.
  log "安装 Python 依赖rich、pyyaml、numpy..."
  if uv pip install --python "$venv_py" -e "$PROJECT_DIR" 2>&1; then
    ok "项目依赖安装完成"
  else
    warn "项目依赖安装失败,可稍后手动安装:"
    echo " source $INSTALL_DIR/env.sh"
    echo " uv pip install -e $PROJECT_DIR"
  fi

  # PyTorch
  if [[ $FLAG_SKIP_PYTORCH -eq 1 ]]; then
    warn "跳过 PyTorch 安装(--skip-pytorch)"
    return 0
  fi

  local torch_ver
  if "$venv_py" -c "import torch" &>/dev/null && [[ $FLAG_REBUILD -eq 0 ]]; then
    torch_ver=$("$venv_py" -c "import torch; print(torch.__version__)" 2>/dev/null)
    ok "PyTorch 已安装: $torch_ver"
    return 0
  fi

  # Without a CUDA tag the index URL would be malformed; skip with a clear hint.
  if [[ -z "${CUDA_TAG:-}" ]]; then
    warn "未检测到 CUDA 版本,跳过 PyTorch 安装(请确认 CUDA Toolkit 后重试)"
    return 0
  fi

  local index_url="https://download.pytorch.org/whl/${CUDA_TAG}"
  log "安装 PyTorch (CUDA $CUDA_TAG): $index_url"
  log "(下载较大,请耐心等待...)"
  if ! uv pip install --python "$venv_py" "torch>=2.1.0" --index-url "$index_url" 2>&1; then
    warn "PyTorch 安装失败,可稍后手动安装:"
    echo " source $INSTALL_DIR/env.sh"
    echo " uv pip install torch --index-url $index_url"
  fi
  if "$venv_py" -c "import torch" &>/dev/null; then
    torch_ver=$("$venv_py" -c "import torch; print(torch.__version__)" 2>/dev/null)
    ok "PyTorch 安装成功: $torch_ver"
  fi
  return 0
}
# ─── 阶段 4: 编译原生工具 ─────────────────────────────────────────────────────
# Fetch and build NVIDIA nvbandwidth under $INSTALL_DIR/nvbandwidth and copy the
# resulting binary next to the sources. A failed build only warns.
# Globals: INSTALL_DIR, FLAG_REBUILD, VERBOSE, JOBS, CUDA_HOME (all read)
build_nvbandwidth() {
  local repo_dir="$INSTALL_DIR/nvbandwidth"
  local binary="$repo_dir/nvbandwidth"

  # Idempotency: keep an existing binary unless a rebuild was requested.
  if [[ -x "$binary" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    ok "nvbandwidth: 已编译 ($binary)"
    return 0
  fi

  log "编译 nvbandwidth..."
  # Build in a subshell so `set -e` and the directory changes stay contained.
  (
    set -e

    # Obtain sources: wipe on rebuild, update an existing checkout, otherwise
    # (re)clone from scratch.
    if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$repo_dir" ]]; then
      rm -rf "$repo_dir"
    fi
    if [[ -d "$repo_dir/.git" ]]; then
      cd "$repo_dir" && git pull --ff-only 2>/dev/null || true
    else
      if [[ -d "$repo_dir" ]]; then
        rm -rf "$repo_dir"
      fi
      git clone --depth 1 https://github.com/NVIDIA/nvbandwidth.git "$repo_dir"
    fi

    cd "$repo_dir"
    mkdir -p build && cd build

    # Use nvcc from CUDA_HOME, falling back to /usr/local/cuda when unset.
    local nvcc_bin="${CUDA_HOME:-/usr/local/cuda}/bin/nvcc"

    # Show the full tool output only in verbose mode, else the last 3 lines.
    cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_COMPILER="$nvcc_bin" 2>&1 \
      | { if [[ $VERBOSE -eq 1 ]]; then cat; else tail -3; fi; }
    make -j"$JOBS" 2>&1 \
      | { if [[ $VERBOSE -eq 1 ]]; then cat; else tail -3; fi; }

    if [[ -x ./nvbandwidth ]]; then
      cp ./nvbandwidth "$binary"
    fi
  )

  if [[ -x "$binary" ]]; then
    ok "nvbandwidth: 编译成功"
  else
    warn "nvbandwidth: 编译失败(非致命,可手动编译)"
  fi
}
# Print the MPI installation prefix to hand to the nccl-tests Makefile.
# An explicit MPI_HOME from the environment always wins. Otherwise the first
# candidate containing include/mpi.h is used: /usr, then the Debian/Ubuntu
# multiarch OpenMPI directory, then /usr/lib64/openmpi. Falls back to /usr (the
# previous hard-coded value) when none matches.
_nccl_tests_mpi_home() {
  if [[ -n "${MPI_HOME:-}" ]]; then
    printf '%s\n' "$MPI_HOME"
    return 0
  fi
  local dir
  for dir in /usr /usr/lib/*-linux-gnu/openmpi /usr/lib64/openmpi; do
    if [[ -f "$dir/include/mpi.h" ]]; then
      printf '%s\n' "$dir"
      return 0
    fi
  done
  printf '%s\n' "/usr"
}

# Fetch and build NVIDIA nccl-tests under $INSTALL_DIR/nccl-tests.
# Globals: INSTALL_DIR, FLAG_REBUILD, NCCL_COMPATIBLE, HAS_NCCL_DEV, HAS_MPI,
#          VERBOSE, JOBS, CUDA_HOME, MPI_HOME (all read)
# Returns 0 on every early-exit path; a failed build only warns.
build_nccl_tests() {
  local src="$INSTALL_DIR/nccl-tests"

  # Idempotency: keep an existing build unless a rebuild was requested.
  if [[ -x "$src/build/all_reduce_perf" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    ok "nccl-tests: 已编译 ($src/build/)"
    if [[ $NCCL_COMPATIBLE -eq 0 ]]; then
      warn "nccl-tests: 已编译但系统 NCCL 与驱动不兼容,运行时将 fallback 到 torchrun"
    fi
    return 0
  fi

  if [[ $HAS_NCCL_DEV -eq 0 ]]; then
    warn "nccl-tests: 跳过(libnccl-dev 未安装)"
    return 0
  fi

  # An incompatible NCCL does not stop the build, but the user is warned.
  if [[ $NCCL_COMPATIBLE -eq 0 ]]; then
    warn "nccl-tests: 系统 NCCL 与驱动不兼容"
    warn " 编译会成功但运行时会报错 'CUDA driver version is insufficient'"
    warn " 测试套件会自动 fallback 到 torchrun 方式测试 NCCL"
    log " 如需原生 nccl-tests 性能数据,请先解决 NCCL 版本问题(见上方提示)"
    echo ""
  fi

  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  if [[ ! -d "$cuda_home/include" ]]; then
    warn "nccl-tests: 跳过(CUDA_HOME=$cuda_home 无效)"
    return 0
  fi

  # Resolve the MPI prefix only when an MPI launcher was detected.
  local mpi_home=""
  if [[ $HAS_MPI -eq 1 ]]; then
    mpi_home=$(_nccl_tests_mpi_home)
  fi

  log "编译 nccl-tests..."
  # Build in a subshell so `set -e` and the directory changes stay contained.
  (
    set -e
    if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$src" ]]; then
      rm -rf "$src"
    fi
    if [[ -d "$src/.git" ]]; then
      cd "$src" && git pull --ff-only 2>/dev/null || true
    elif [[ -d "$src" ]]; then
      rm -rf "$src"
      git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$src"
    else
      git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$src"
    fi
    cd "$src"

    # Full tool output only in verbose mode, else the last 3 lines.
    if [[ $HAS_MPI -eq 1 ]]; then
      make MPI=1 MPI_HOME="$mpi_home" CUDA_HOME="$cuda_home" -j"$JOBS" \
        2>&1 | { if [[ $VERBOSE -eq 1 ]]; then cat; else tail -3; fi; }
    else
      make CUDA_HOME="$cuda_home" -j"$JOBS" \
        2>&1 | { if [[ $VERBOSE -eq 1 ]]; then cat; else tail -3; fi; }
    fi
  )

  if [[ -x "$src/build/all_reduce_perf" ]]; then
    ok "nccl-tests: 编译成功"
  else
    warn "nccl-tests: 编译失败(非致命)"
  fi
}
# Fetch and build wilicc/gpu-burn under $INSTALL_DIR/gpu-burn.
# Globals: INSTALL_DIR, FLAG_REBUILD, VERBOSE, JOBS, CUDA_HOME (all read)
# Returns 0 on every early-exit path; a failed build only warns.
#
# The gpu-burn Makefile takes the toolkit location from CUDAPATH (see its
# README). The previously passed CUDA_PATH was ignored, so builds silently used
# the Makefile default instead of the detected CUDA_HOME. CUDA_PATH is still
# passed alongside for forks that read it.
build_gpu_burn() {
  local src="$INSTALL_DIR/gpu-burn"

  # Idempotency: keep an existing build unless a rebuild was requested.
  if [[ -x "$src/gpu_burn" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    ok "gpu-burn: 已编译 ($src/gpu_burn)"
    return 0
  fi

  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  if [[ ! -d "$cuda_home" ]]; then
    warn "gpu-burn: 跳过(CUDA_HOME=$cuda_home 不存在)"
    return 0
  fi

  log "编译 gpu-burn..."
  # Build in a subshell so `set -e` and the directory changes stay contained.
  (
    set -e
    if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$src" ]]; then
      rm -rf "$src"
    fi
    if [[ -d "$src/.git" ]]; then
      cd "$src" && git pull --ff-only 2>/dev/null || true
    elif [[ -d "$src" ]]; then
      rm -rf "$src"
      git clone --depth 1 https://github.com/wilicc/gpu-burn.git "$src"
    else
      git clone --depth 1 https://github.com/wilicc/gpu-burn.git "$src"
    fi
    cd "$src"

    # Full tool output only in verbose mode, else the last 3 lines.
    make CUDAPATH="$cuda_home" CUDA_PATH="$cuda_home" -j"$JOBS" \
      2>&1 | { if [[ $VERBOSE -eq 1 ]]; then cat; else tail -3; fi; }
  )

  if [[ -x "$src/gpu_burn" ]]; then
    ok "gpu-burn: 编译成功"
  else
    warn "gpu-burn: 编译失败(非致命)"
  fi
}
# Stage 4: build the native tools one after another.
build_native_tools() {
  banner "阶段 4/6: 编译原生工具"
  local builder
  for builder in build_nvbandwidth build_nccl_tests build_gpu_burn; do
    "$builder"
  done
}
# ─── 阶段 5: 生成激活脚本 ─────────────────────────────────────────────────────
# Stage 5: write $INSTALL_DIR/env.sh (sourced to activate the environment) and
# the $INSTALL_DIR/run-gpu-tests wrapper.
# Globals: INSTALL_DIR, PROJECT_DIR, CUDA_HOME (all read)
#
# env.sh records the CUDA location in effect at install time as its default.
# A CUDA_HOME exported by the user at run time still takes precedence.
# Previously the default was always /usr/local/cuda, which could differ from
# the toolkit the native tools were built against.
# Both files come from unquoted here-documents: "\$" defers expansion to the
# generated script, while plain "$VAR" is filled in now. No sed pass is needed.
generate_env_sh() {
  banner "阶段 5/6: 生成环境脚本"
  local env_file="$INSTALL_DIR/env.sh"
  local cuda_default="${CUDA_HOME:-/usr/local/cuda}"

  cat > "$env_file" << ENVEOF
#!/usr/bin/env bash
# GPU Test Suite 环境激活脚本
# 用法: source $INSTALL_DIR/env.sh
export GPU_TOOLS_DIR="$INSTALL_DIR"
export CUDA_HOME="\${CUDA_HOME:-$cuda_default}"
# 激活 Python 虚拟环境
if [[ -f "\$GPU_TOOLS_DIR/.venv/bin/activate" ]]; then
source "\$GPU_TOOLS_DIR/.venv/bin/activate"
fi
# 编译工具加入 PATH
export PATH="\$GPU_TOOLS_DIR/nvbandwidth:\$PATH"
export PATH="\$GPU_TOOLS_DIR/nccl-tests/build:\$PATH"
export PATH="\$GPU_TOOLS_DIR/gpu-burn:\$PATH"
export PATH="\$CUDA_HOME/bin:\$PATH"
# 库路径
export LD_LIBRARY_PATH="\$CUDA_HOME/lib64:\${LD_LIBRARY_PATH:-}"
ENVEOF
  chmod +x "$env_file"
  ok "env.sh 已生成: $env_file"

  # One-shot runner: activates the environment, then starts the test driver.
  local wrapper="$INSTALL_DIR/run-gpu-tests"
  cat > "$wrapper" << WRAPEOF
#!/usr/bin/env bash
# GPU Test Suite 一键运行器
# 用法: $INSTALL_DIR/run-gpu-tests --test all
SCRIPT_DIR="\$(cd "\$(dirname "\${BASH_SOURCE[0]}")" && pwd)"
source "\$SCRIPT_DIR/env.sh"
exec python3 "$PROJECT_DIR/gpu_tester.py" "\$@"
WRAPEOF
  chmod +x "$wrapper"
  ok "run-gpu-tests 已生成: $wrapper"
}
# ─── 阶段 6: 打印总结 ─────────────────────────────────────────────────────────
# Stage 6: print the install location, the state of every component and short
# usage instructions.
# Globals: INSTALL_DIR, PROJECT_DIR, NCCL_COMPATIBLE and the colour variables
#          (all read)
print_summary() {
  banner "阶段 6/6: 安装总结"
  local venv_python="$INSTALL_DIR/.venv/bin/python"

  echo -e "${BOLD}安装目录:${NC} $INSTALL_DIR"
  echo ""
  echo -e "${BOLD}组件状态:${NC}"

  # Python virtual environment
  if [[ ! -x "$venv_python" ]]; then
    echo -e " ${RED}${NC} Python venv: 未创建"
  else
    local interp_ver
    interp_ver=$("$venv_python" --version 2>&1)
    echo -e " ${GREEN}${NC} Python venv: $interp_ver"
  fi

  # PyTorch
  if ! "$venv_python" -c "import torch" &>/dev/null; then
    echo -e " ${YELLOW}${NC} PyTorch: 未安装"
  else
    local torch_desc
    torch_desc=$("$venv_python" -c "import torch; print(f'{torch.__version__} (CUDA {torch.version.cuda})')" 2>/dev/null)
    echo -e " ${GREEN}${NC} PyTorch: $torch_desc"
  fi

  # Native tools, encoded as "<binary path>:<display name>".
  local entry bin_path label
  for entry in \
    "$INSTALL_DIR/nvbandwidth/nvbandwidth:nvbandwidth" \
    "$INSTALL_DIR/nccl-tests/build/all_reduce_perf:nccl-tests" \
    "$INSTALL_DIR/gpu-burn/gpu_burn:gpu-burn"; do
    bin_path="${entry%%:*}"
    label="${entry##*:}"
    if [[ ! -x "$bin_path" ]]; then
      echo -e " ${YELLOW}${NC} $label (未编译)"
    elif [[ "$label" == "nccl-tests" ]] && [[ $NCCL_COMPATIBLE -eq 0 ]]; then
      echo -e " ${YELLOW}${NC} $label (已编译,但系统 NCCL 与驱动不兼容)"
    else
      echo -e " ${GREEN}${NC} $label"
    fi
  done

  # RDMA tools (system-wide): count how many of the three are on PATH.
  local rdma_tool rdma_count=0
  for rdma_tool in ib_write_bw ib_read_bw ibstat; do
    if command -v "$rdma_tool" &>/dev/null; then
      rdma_count=$((rdma_count + 1))
    fi
  done
  if [[ $rdma_count -gt 0 ]]; then
    echo -e " ${GREEN}${NC} RDMA 工具: $rdma_count/3 可用"
  else
    echo -e " ${YELLOW}${NC} RDMA 工具: 未安装 (apt install perftest infiniband-diags)"
  fi

  # Usage instructions
  echo ""
  echo -e "${BOLD}使用方法:${NC}"
  echo ""
  echo " # 方式一: source 激活后使用"
  echo " source $INSTALL_DIR/env.sh"
  echo " python3 $PROJECT_DIR/gpu_tester.py --test all"
  echo ""
  echo " # 方式二: 一键运行"
  echo " $INSTALL_DIR/run-gpu-tests --test all"
  echo " $INSTALL_DIR/run-gpu-tests --test health"
  echo " $INSTALL_DIR/run-gpu-tests # 交互式菜单"
  echo ""
}
# ─── 主函数 ───────────────────────────────────────────────────────────────────
# Entry point: parse the options, print the title box and run the install
# stages in order.
main() {
  parse_args "$@"

  # Title box.
  local -a title_box=(
    "╔══════════════════════════════════════════════════╗"
    "║ GPU Test Suite — 一键安装 ║"
    "║ 环境隔离 · 自动检测 · 完整部署 ║"
    "╚══════════════════════════════════════════════════╝"
  )
  local row
  echo ""
  for row in "${title_box[@]}"; do
    echo -e "${BOLD}${CYAN}${row}${NC}"
  done
  echo ""
  log "安装目录: $INSTALL_DIR"
  log "项目目录: $PROJECT_DIR"
  echo ""

  # Stages, in execution order.
  local stage
  for stage in \
    check_permissions \
    validate_environment \
    ensure_uv \
    setup_python_venv \
    build_native_tools \
    generate_env_sh \
    print_summary; do
    "$stage"
  done

  echo -e "${GREEN}${BOLD}安装完成!${NC}"
}
# Script entry: forward all command-line arguments to main.
main "$@"