test_gpu_scripts/install_deps.sh
qinyusen 24934bc182 feat: rewrite install_deps.sh with env isolation and add numpy to requirements
- Complete rewrite of install_deps.sh (6-phase architecture):
  environment validation, uv-based venv isolation, CUDA auto-detection,
  idempotent native tool compilation, env.sh/run-gpu-tests generation
- Add numpy>=1.24 to requirements.txt to align with pyproject.toml
- Support --install-system-deps, --skip-pytorch, --rebuild, -y flags
- Use subshells for compilation to prevent CWD pollution
- Generate env.sh activation script and run-gpu-tests wrapper

🤖 Generated with [Qoder](https://qoder.com)
2026-05-07 01:32:13 +08:00

750 lines
26 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# =============================================================================
# GPU Test Suite — 一键安装脚本(环境隔离版)
# 支持: A100 / A800 / H100 / H200 / B200 / B300
#
# 功能:
# 1. 环境校验(GPU、CUDA、Python、编译器等)
# 2. 自动安装 uv 并创建隔离 Python 虚拟环境
# 3. 自动检测 CUDA 版本并安装对应 PyTorch
# 4. 编译 nvbandwidth / nccl-tests / gpu-burn
# 5. 生成 env.sh 激活脚本和 run-gpu-tests 运行器
#
# 用法:
# sudo bash install_deps.sh # 标准安装
# sudo bash install_deps.sh --install-system-deps # 同时安装系统包
# sudo bash install_deps.sh --skip-pytorch # 跳过 PyTorch
# sudo bash install_deps.sh --rebuild # 强制重新编译
# sudo bash install_deps.sh -y # 非交互模式
# =============================================================================
# Strict-ish mode: unset variables are errors, pipelines fail if any stage fails.
# (No `-e`: individual steps decide for themselves whether a failure is fatal.)
set -u
set -o pipefail

# --- Global configuration (overridable through the environment) --------------
INSTALL_DIR=${GPU_TOOLS_DIR:-/opt/gpu-test-tools}
PROJECT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
JOBS=${MAKE_JOBS:-$(nproc)}
VERBOSE=${VERBOSE:-0}

# --- Command-line flags (set by parse_args) ----------------------------------
FLAG_INSTALL_SYS_DEPS=0
FLAG_SKIP_PYTORCH=0
FLAG_REBUILD=0
FLAG_YES=0

# --- Detection results shared between phases ---------------------------------
DETECTED_GPU=
DETECTED_DRIVER=
CUDA_VERSION=
CUDA_TAG=
PYTHON_BIN=
HAS_MPI=0
HAS_NCCL_DEV=0
# --- Colors and logging --------------------------------------------------------
# ANSI escape sequences; they are interpreted by `echo -e` at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Print "<color>[<tag>]<reset> <message...>" on stdout.
#   $1 - color escape, $2 - tag text, remaining args - message words
_log_tagged() {
  local color=$1 tag=$2
  shift 2
  echo -e "${color}[${tag}]${NC} $*"
}

log()  { _log_tagged "$CYAN"   "INFO" "$@"; }
ok()   { _log_tagged "$GREEN"  " OK " "$@"; }
warn() { _log_tagged "$YELLOW" "WARN" "$@"; }
fail() { _log_tagged "$RED"    "FAIL" "$@"; }

# Print a fatal message and terminate the script with status 1.
die() {
  _log_tagged "$RED" "FATAL" "$@"
  exit 1
}

# Print a highlighted section header surrounded by blank lines.
banner() {
  echo -e "\n${BOLD}${CYAN}══════ $* ══════${NC}\n"
}

# Error trap: report the line of a failing top-level command.
trap 'fail "脚本在第 $LINENO 行出错。设置 VERBOSE=1 查看详情。"' ERR
# ─── 参数解析 ─────────────────────────────────────────────────────────────────
#######################################
# Parse command-line options into the FLAG_* globals.
# Globals:   FLAG_INSTALL_SYS_DEPS, FLAG_SKIP_PYTORCH, FLAG_REBUILD,
#            FLAG_YES (written)
# Arguments: the script's command-line arguments
# Outputs:   help text for -h/--help (then exits 0); a warning for each
#            unrecognized argument
#######################################
parse_args() {
  local arg
  for arg in "$@"; do
    case "$arg" in
      --install-system-deps)
        FLAG_INSTALL_SYS_DEPS=1
        ;;
      --skip-pytorch)
        FLAG_SKIP_PYTORCH=1
        ;;
      --rebuild)
        FLAG_REBUILD=1
        ;;
      -y | --yes)
        FLAG_YES=1
        ;;
      -h | --help)
        printf '%s\n' \
          "用法: $0 [选项]" \
          "" \
          "选项:" \
          " --install-system-deps 自动安装缺失的系统包" \
          " --skip-pytorch 跳过 PyTorch 安装" \
          " --rebuild 强制重新编译原生工具" \
          " -y, --yes 非交互模式" \
          " -h, --help 显示此帮助" \
          "" \
          "环境变量:" \
          " GPU_TOOLS_DIR 安装目录 (默认: /opt/gpu-test-tools)" \
          " MAKE_JOBS 编译并行数 (默认: nproc)" \
          " CUDA_HOME CUDA 安装路径 (默认: /usr/local/cuda)"
        exit 0
        ;;
      *)
        # Unknown arguments are reported but do not abort the run.
        warn "未知参数: $arg"
        ;;
    esac
  done
}
# ─── 阶段 0: 权限检查 ─────────────────────────────────────────────────────────
#######################################
# Make sure the installation directory exists and is writable.
#
# If INSTALL_DIR already exists it must be a writable directory. Otherwise
# it is created with `mkdir -p`, so missing intermediate directories are
# fine as long as some ancestor is writable. Any failure is fatal.
# Globals:   INSTALL_DIR (read)
# Returns:   0 on success; calls die (exit 1) otherwise
#######################################
check_permissions() {
  local hint="无法写入 ${INSTALL_DIR}(请使用 sudo 或设置 GPU_TOOLS_DIR 到可写路径)"

  if [[ -e "$INSTALL_DIR" ]]; then
    # An existing path is only usable if it is a directory we can write to.
    if [[ ! -d "$INSTALL_DIR" || ! -w "$INSTALL_DIR" ]]; then
      die "$hint"
    fi
    return 0
  fi

  # Let mkdir decide whether the path can be created instead of guessing
  # from the immediate parent's permissions (the parent may not exist yet).
  if ! mkdir -p -- "$INSTALL_DIR" 2>/dev/null; then
    die "$hint"
  fi
  return 0
}
# ─── 阶段 1: 环境校验 ─────────────────────────────────────────────────────────
# Minimum driver major version per GPU model (key is matched against the
# GPU name reported by nvidia-smi).
declare -A MIN_DRIVERS
MIN_DRIVERS["A100"]="470"
MIN_DRIVERS["A800"]="470"
MIN_DRIVERS["H100"]="535"
MIN_DRIVERS["H200"]="535"
MIN_DRIVERS["B200"]="550"
MIN_DRIVERS["B300"]="550"
#######################################
# Check that the nvidia-smi command is available on PATH.
# Outputs:   status line (and an install hint on failure)
# Returns:   0 if found, 1 otherwise
#######################################
check_nvidia_smi() {
  if command -v nvidia-smi &>/dev/null; then
    ok "nvidia-smi 可用"
    return 0
  fi

  fail "nvidia-smi 未找到"
  echo " → 请先安装 NVIDIA 驱动"
  return 1
}
#######################################
# Query the first GPU's name and driver version through nvidia-smi and
# compare the driver's major version with the MIN_DRIVERS table.
# Globals:   MIN_DRIVERS (read); DETECTED_GPU, DETECTED_DRIVER (written)
# Outputs:   one status line (ok or warn)
# Returns:   1 if nvidia-smi yields no data, 0 otherwise (an outdated
#            driver only produces a warning)
#######################################
detect_gpu_and_driver() {
  local smi_out
  smi_out=$(nvidia-smi --query-gpu=name,driver_version --format=csv,noheader 2>/dev/null | head -n 1)
  if [[ -z "$smi_out" ]]; then
    warn "无法查询 GPU 信息"
    return 1
  fi

  # CSV line looks like "<name>, <driver>"; a plain `read` trims the
  # whitespace around each field.
  local raw_name raw_driver _rest
  IFS=',' read -r raw_name raw_driver _rest <<<"$smi_out"
  read -r DETECTED_GPU <<<"$raw_name"
  read -r DETECTED_DRIVER <<<"$raw_driver"

  # Find the table key contained (case-insensitively) in the GPU name.
  local key gpu_key=""
  local gpu_upper=${DETECTED_GPU^^}
  for key in "${!MIN_DRIVERS[@]}"; do
    if [[ "$gpu_upper" == *"${key^^}"* ]]; then
      gpu_key=$key
      break
    fi
  done

  if [[ -z "$gpu_key" ]]; then
    ok "GPU: $DETECTED_GPU | 驱动: $DETECTED_DRIVER"
    return 0
  fi

  local min_drv=${MIN_DRIVERS[$gpu_key]}
  local drv_major=${DETECTED_DRIVER%%.*}

  # Guard the numeric comparison: a value such as "[N/A]" must not reach
  # an arithmetic test.
  if [[ ! "$drv_major" =~ ^[0-9]+$ ]]; then
    warn "无法解析驱动版本: ${DETECTED_DRIVER}"
    return 0
  fi

  if ((drv_major < min_drv)); then
    # Braces keep the two values apart from each other and from the
    # following full-width characters.
    warn "驱动 ${DETECTED_DRIVER} < 最低要求 ${min_drv}(${gpu_key} 需要)"
  else
    ok "GPU: $DETECTED_GPU | 驱动: $DETECTED_DRIVER (>= $min_drv)"
  fi
  return 0
}
#######################################
# Determine the CUDA version by probing, in order: nvcc, the nvidia-smi
# banner, and /usr/local/cuda/version.txt. The first source that yields a
# version wins; _map_cuda_tag is then called to derive CUDA_TAG.
# Globals:   CUDA_VERSION (written)
# Outputs:   status lines describing which source was used
# Returns:   0 when a version was found, 1 otherwise
#######################################
detect_cuda_version() {
  local origin="" driver_cuda=""
  local version_file=/usr/local/cuda/version.txt

  # Probe 1: nvcc — means a toolkit is actually installed.
  if command -v nvcc &>/dev/null; then
    CUDA_VERSION=$(nvcc --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+')
    [[ -n "$CUDA_VERSION" ]] && origin="nvcc"
  fi

  # Probe 2: nvidia-smi — the highest CUDA version the driver supports,
  # which is not the same as an installed toolkit.
  if [[ -z "$origin" ]]; then
    driver_cuda=$(nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+')
    if [[ -n "$driver_cuda" ]]; then
      CUDA_VERSION="$driver_cuda"
      origin="nvidia-smi"
    fi
  fi

  # Probe 3: version file of the default toolkit location.
  if [[ -z "$origin" && -f "$version_file" ]]; then
    CUDA_VERSION=$(grep -oP '[0-9]+\.[0-9]+' "$version_file" | head -1)
    [[ -n "$CUDA_VERSION" ]] && origin="version.txt"
  fi

  case "$origin" in
    nvcc)
      ok "CUDA: $CUDA_VERSION (via nvcc)"
      ;;
    nvidia-smi)
      warn "CUDA: $CUDA_VERSION (via nvidia-smi — 仅代表驱动能力,非已安装 toolkit)"
      warn " → 若编译失败,请安装 CUDA Toolkit: apt install cuda-toolkit-${CUDA_VERSION/./-}"
      ;;
    version.txt)
      ok "CUDA: $CUDA_VERSION (via /usr/local/cuda/version.txt)"
      ;;
    *)
      fail "无法检测 CUDA 版本"
      echo " → 请安装 CUDA Toolkit: https://developer.nvidia.com/cuda-downloads"
      return 1
      ;;
  esac

  _map_cuda_tag
  return 0
}
#######################################
# Translate CUDA_VERSION ("major.minor") into the PyTorch wheel index tag.
#   11.x        -> cu118
#   12.0 - 12.1 -> cu121
#   12.2 - 12.4 -> cu124
#   12.5+       -> cu128
#   other major -> cu128 (with a warning)
# Globals:   CUDA_VERSION (read); CUDA_TAG (written)
# Outputs:   the selected index tag (and a warning for unknown majors)
#######################################
_map_cuda_tag() {
  local major=${CUDA_VERSION%%.*}
  local minor=${CUDA_VERSION#*.}
  minor=${minor%%.*}

  local tag="cu128"
  local known=1

  if ((major == 11)); then
    tag="cu118"
  elif ((major == 12)); then
    if ((minor <= 1)); then
      tag="cu121"
    elif ((minor <= 4)); then
      tag="cu124"
    fi
  else
    known=0
  fi

  CUDA_TAG="$tag"
  if ((known == 0)); then
    warn "未知 CUDA ${CUDA_VERSION},默认使用 cu128 索引"
  fi
  log "PyTorch wheel 索引: $CUDA_TAG"
}
#######################################
# Locate a Python interpreter with version >= 3.10.
#
# Candidates are tried in order (python3.12, python3.11, python3.10,
# python3); the first one that runs and reports a new enough version wins.
# Globals:   PYTHON_BIN (written: absolute path of the chosen interpreter)
# Outputs:   status line (and an install hint on failure)
# Returns:   0 if a suitable interpreter was found, 1 otherwise
#######################################
check_python() {
  local cmd ver py_major py_minor
  local py_cmd=""

  for cmd in python3.12 python3.11 python3.10 python3; do
    command -v "$cmd" &>/dev/null || continue

    # A broken interpreter (non-zero exit or garbage output) is skipped.
    ver=$("$cmd" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null) || continue
    [[ "$ver" =~ ^([0-9]+)\.([0-9]+)$ ]] || continue
    py_major=${BASH_REMATCH[1]}
    py_minor=${BASH_REMATCH[2]}

    # Proper ordering: any major above 3 qualifies regardless of its minor.
    if ((py_major > 3 || (py_major == 3 && py_minor >= 10))); then
      py_cmd=$cmd
      break
    fi
  done

  if [[ -z "$py_cmd" ]]; then
    fail "Python >= 3.10 未找到"
    echo " → apt install python3.11 python3.11-venv"
    return 1
  fi

  PYTHON_BIN=$(command -v "$py_cmd")
  ok "Python: $("$py_cmd" --version 2>&1) ($PYTHON_BIN)"
  return 0
}
#######################################
# Check that cmake is installed and at least version 3.18.
# Outputs:   status line (and a hint on failure)
# Returns:   0 if cmake >= 3.18 is available, 1 otherwise
#######################################
check_cmake() {
  if ! command -v cmake &>/dev/null; then
    fail "cmake 未找到(编译 nvbandwidth 需要 >= 3.18"
    echo " → apt install cmake"
    return 1
  fi

  # First line of `cmake --version` carries "cmake version X.Y.Z".
  local cmake_ver
  cmake_ver=$(cmake --version | head -1 | grep -oP '[0-9]+\.[0-9]+')
  local cmake_major=${cmake_ver%%.*}
  local cmake_minor=${cmake_ver#*.}

  if ((cmake_major < 3 || (cmake_major == 3 && cmake_minor < 18))); then
    fail "cmake $cmake_ver < 3.18nvbandwidth 需要 >= 3.18"
    echo " → 升级 cmake: pip install cmake 或从源码安装"
    return 1
  fi

  ok "cmake: $cmake_ver"
  return 0
}
#######################################
# Check that both gcc and g++ are available on PATH.
# Outputs:   status line with the gcc version (or a hint on failure)
# Returns:   0 if both compilers exist, 1 otherwise
#######################################
check_compiler() {
  local compiler
  for compiler in gcc g++; do
    if ! command -v "$compiler" &>/dev/null; then
      fail "gcc/g++ 未找到"
      echo " → apt install build-essential"
      return 1
    fi
  done

  local gcc_ver
  gcc_ver=$(gcc -dumpversion 2>/dev/null)
  ok "gcc/g++: $gcc_ver"
  return 0
}
#######################################
# Detect an MPI launcher (mpirun or mpiexec). MPI is optional, so this
# check never fails; it only records the result.
# Globals:   HAS_MPI (written: 1 if a launcher exists, else 0)
# Outputs:   status line with the launcher's version banner, or a warning
#            with an install hint
# Returns:   always 0
#######################################
check_mpi() {
  local candidate launcher=""
  for candidate in mpirun mpiexec; do
    if command -v "$candidate" &>/dev/null; then
      launcher=$candidate
      break
    fi
  done

  if [[ -z "$launcher" ]]; then
    HAS_MPI=0
    warn "mpirun 未找到(nccl-tests 将不使用 MPI 模式)"
    echo " → apt install openmpi-bin libopenmpi-dev"
    return 0
  fi

  HAS_MPI=1
  # Ask the launcher that was actually found for its version (only its
  # first output line is shown); `read` keeps what it got even on EOF.
  local version_line=""
  IFS= read -r version_line < <("$launcher" --version 2>&1) || true
  ok "MPI: $version_line"
  return 0
}
#######################################
# Detect whether NCCL is installed. NCCL is optional, so this check never
# fails; it only records the result.
#
# Detection order: the dynamic linker cache (ldconfig -p), then the
# /usr/include/nccl.h header, then the dpkg database.
# Globals:   HAS_NCCL_DEV (written: 1 if found, else 0)
# Outputs:   status line, or a warning with an install hint
# Returns:   always 0
#######################################
check_nccl_dev() {
  # Capture the linker cache first instead of piping into `grep -q`:
  # with `set -o pipefail`, grep exiting at the first match can kill the
  # producer with SIGPIPE and make the whole pipeline report failure.
  local linker_cache=""
  linker_cache=$(ldconfig -p 2>/dev/null) || true
  if [[ "$linker_cache" == *libnccl* ]]; then
    HAS_NCCL_DEV=1
    ok "libnccl: 已找到 (via ldconfig)"
    return 0
  fi

  if [[ -f /usr/include/nccl.h ]] || dpkg -l libnccl-dev &>/dev/null; then
    HAS_NCCL_DEV=1
    ok "libnccl-dev: 已安装"
    return 0
  fi

  HAS_NCCL_DEV=0
  warn "libnccl-dev 未找到(将跳过 nccl-tests 编译)"
  echo " → apt install libnccl-dev libnccl2"
  return 0
}
#######################################
# Install build and runtime packages with the first package manager found
# (apt-get, then dnf, then yum). Package failures only produce a warning.
# Outputs:   progress and status lines
#######################################
install_system_deps() {
  log "安装系统依赖包..."

  # Pick the first available package manager.
  local pkg_mgr="" candidate
  for candidate in apt-get dnf yum; do
    if command -v "$candidate" &>/dev/null; then
      pkg_mgr=$candidate
      break
    fi
  done

  case "$pkg_mgr" in
    apt-get)
      apt-get update -qq
      apt-get install -y -qq build-essential git cmake wget curl \
        openmpi-bin libopenmpi-dev openssh-client \
        infiniband-diags ibverbs-utils perftest \
        python3 python3-pip python3-venv \
        libnccl-dev libnccl2 \
        2>/dev/null || warn "部分包安装失败(可能已安装)"
      ;;
    dnf | yum)
      # dnf and yum take the same sub-commands and package names.
      "$pkg_mgr" groupinstall -y "Development Tools" 2>/dev/null || true
      "$pkg_mgr" install -y git cmake wget curl \
        openmpi openmpi-devel openssh-clients \
        infiniband-diags libibverbs-utils perftest \
        python3 python3-pip \
        2>/dev/null || warn "部分包安装失败"
      ;;
    *)
      warn "未识别的包管理器,请手动安装依赖"
      ;;
  esac

  ok "系统依赖安装完成"
}
#######################################
# Phase 1: validate the host environment.
#
# Required checks: nvidia-smi, CUDA version, Python, cmake, compiler.
# Optional checks (never counted as errors): GPU/driver details, MPI, NCCL.
# If required checks fail and --install-system-deps was given, system
# packages are installed and ALL required checks are run again, so a
# component that package installation cannot provide (driver, CUDA) is
# still reported instead of being silently forgiven.
# Globals:   FLAG_INSTALL_SYS_DEPS (read); detection globals are written
#            by the individual checks
# Returns:   0 when validation passes; calls die (exit 1) otherwise
#######################################
validate_environment() {
  banner "阶段 1/6: 环境校验"

  local errors=0
  check_nvidia_smi || errors=$((errors + 1))
  detect_gpu_and_driver || true
  detect_cuda_version || errors=$((errors + 1))
  check_python || errors=$((errors + 1))
  check_cmake || errors=$((errors + 1))
  check_compiler || errors=$((errors + 1))
  check_mpi || true
  check_nccl_dev || true
  echo ""

  if ((errors == 0)); then
    ok "环境校验通过"
    return 0
  fi

  fail "环境校验发现 $errors 个必要组件缺失"

  if [[ $FLAG_INSTALL_SYS_DEPS -ne 1 ]]; then
    echo ""
    echo " 提示: 加 --install-system-deps 参数可自动安装缺失的系统包"
    echo " 或手动运行上面提示的 apt install 命令后重试"
    die "环境校验未通过"
  fi

  log "检测到 --install-system-deps,尝试安装..."
  install_system_deps

  # Re-validate every required component, not only the ones the package
  # manager can install.
  errors=0
  check_nvidia_smi || errors=$((errors + 1))
  detect_cuda_version || errors=$((errors + 1))
  check_python || errors=$((errors + 1))
  check_cmake || errors=$((errors + 1))
  check_compiler || errors=$((errors + 1))
  check_mpi || true
  check_nccl_dev || true

  if ((errors > 0)); then
    die "安装系统包后仍有 $errors 个组件缺失,请手动解决"
  fi

  ok "环境校验通过"
  return 0
}
# ─── 阶段 2: 安装 uv ──────────────────────────────────────────────────────────
#######################################
# Phase 2: make the `uv` command available.
#
# Uses uv from PATH if present, otherwise looks in a few well-known
# locations, otherwise downloads and runs the official installer.
# Globals:   PATH (may be extended and exported), HOME (read)
# Returns:   0 when uv is usable; calls die (exit 1) otherwise
#######################################
ensure_uv() {
  banner "阶段 2/6: 确保 uv 可用"

  # Already on PATH?
  if command -v uv &>/dev/null; then
    ok "uv 已安装: $(uv --version 2>&1)"
    return 0
  fi

  # Installed in a common location that is not on PATH?
  local candidate
  for candidate in "$HOME/.local/bin/uv" "$HOME/.cargo/bin/uv" /usr/local/bin/uv; do
    [[ -x "$candidate" ]] || continue
    PATH="${candidate%/*}:$PATH"
    export PATH
    ok "uv 已找到: $candidate"
    return 0
  done

  # Fall back to the upstream installer script.
  log "正在安装 uv..."
  if ! curl -LsSf https://astral.sh/uv/install.sh | sh 2>/dev/null; then
    die "uv 安装失败。请手动安装: https://docs.astral.sh/uv/getting-started/installation/"
  fi

  # Put the install locations searched above onto PATH.
  PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
  export PATH
  command -v uv &>/dev/null || die "uv 安装后仍无法找到。请检查 PATH。"

  ok "uv 安装成功: $(uv --version 2>&1)"
}
# ─── 阶段 3: Python 虚拟环境 ──────────────────────────────────────────────────
# Pass stdin through when VERBOSE=1, otherwise keep only its last $1 lines.
_venv_tail_unless_verbose() {
  if [[ ${VERBOSE:-0} -eq 1 ]]; then
    cat
  else
    tail -n "$1"
  fi
}

#######################################
# Phase 3: create the isolated Python environment and install packages.
#
# Steps: reuse or (re)create $INSTALL_DIR/.venv, install the project in
# editable mode, then install PyTorch from the wheel index matching
# CUDA_TAG unless --skip-pytorch was given.
# Globals:   INSTALL_DIR, PROJECT_DIR, PYTHON_BIN, CUDA_TAG, VERBOSE,
#            FLAG_REBUILD, FLAG_SKIP_PYTORCH (read)
# Returns:   0 on completion (package install problems are warnings);
#            calls die (exit 1) if the virtual environment cannot be
#            created
#######################################
setup_python_venv() {
  banner "阶段 3/6: 创建 Python 虚拟环境"

  local venv_dir="$INSTALL_DIR/.venv"
  local venv_python="$venv_dir/bin/python"
  local existing_ver torch_ver index_url

  # Reuse an existing venv only if its interpreter is new enough.
  if [[ -x "$venv_python" && $FLAG_REBUILD -eq 0 ]]; then
    existing_ver=$("$venv_python" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "0.0")
    if [[ "$existing_ver" =~ ^([0-9]+)\.([0-9]+)$ ]] \
      && ((BASH_REMATCH[1] > 3 || (BASH_REMATCH[1] == 3 && BASH_REMATCH[2] >= 10))); then
      ok "虚拟环境已存在: $venv_dir (Python $existing_ver)"
    else
      log "已有 venv 的 Python 版本过低 ($existing_ver),重建中..."
      rm -rf -- "${venv_dir:?}"
    fi
  fi

  # Create the venv; nothing below can work without it, so failure is fatal.
  if [[ ! -x "$venv_python" ]]; then
    log "创建虚拟环境: $venv_dir"
    if ! uv venv "$venv_dir" --python "$PYTHON_BIN"; then
      die "虚拟环境创建失败: ${venv_dir}"
    fi
    ok "虚拟环境创建成功"
  fi

  # Project dependencies. PIPESTATUS is used so that the installer's own
  # exit status is seen regardless of the output filter or pipefail.
  log "安装 Python 依赖(rich、pyyaml、numpy)..."
  uv pip install --python "$venv_python" -e "$PROJECT_DIR" 2>&1 \
    | _venv_tail_unless_verbose 1
  if [[ ${PIPESTATUS[0]} -eq 0 ]]; then
    ok "项目依赖安装完成"
  else
    warn "项目依赖安装失败(设置 VERBOSE=1 查看详情)"
  fi

  # PyTorch.
  if [[ $FLAG_SKIP_PYTORCH -eq 1 ]]; then
    warn "跳过 PyTorch 安装(--skip-pytorch)"
    return 0
  fi

  if [[ $FLAG_REBUILD -eq 0 ]] && "$venv_python" -c "import torch" &>/dev/null; then
    torch_ver=$("$venv_python" -c "import torch; print(torch.__version__)" 2>/dev/null)
    ok "PyTorch 已安装: $torch_ver"
    return 0
  fi

  # Without a CUDA tag the wheel index URL would be meaningless.
  if [[ -z "${CUDA_TAG:-}" ]]; then
    warn "未检测到 CUDA 版本,跳过 PyTorch 安装"
    return 0
  fi

  index_url="https://download.pytorch.org/whl/${CUDA_TAG}"
  log "安装 PyTorch (CUDA $CUDA_TAG): $index_url"
  log "(下载较大,请耐心等待...)"
  uv pip install --python "$venv_python" \
    "torch>=2.1.0" --index-url "$index_url" 2>&1 \
    | _venv_tail_unless_verbose 3
  if [[ ${PIPESTATUS[0]} -ne 0 ]]; then
    warn "PyTorch 安装失败,可稍后手动安装:"
    echo " source $INSTALL_DIR/env.sh"
    echo " uv pip install torch --index-url $index_url"
  fi

  if "$venv_python" -c "import torch" &>/dev/null; then
    torch_ver=$("$venv_python" -c "import torch; print(torch.__version__)" 2>/dev/null)
    ok "PyTorch 安装成功: $torch_ver"
  fi
  return 0
}
# ─── 阶段 4: 编译原生工具 ─────────────────────────────────────────────────────
#######################################
# Fetch and build NVIDIA nvbandwidth under $INSTALL_DIR/nvbandwidth.
# Skipped when the binary already exists, unless --rebuild was given.
# The build runs in a subshell so the caller's working directory is
# untouched; a failed build only produces a warning.
# Globals:   INSTALL_DIR, FLAG_REBUILD, VERBOSE, JOBS (read)
#######################################
build_nvbandwidth() {
  local src="$INSTALL_DIR/nvbandwidth"
  local repo_url="https://github.com/NVIDIA/nvbandwidth.git"

  # Idempotence: keep an existing binary unless a rebuild was requested.
  if [[ -x "$src/nvbandwidth" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    ok "nvbandwidth: 已编译 ($src/nvbandwidth)"
    return 0
  fi

  log "编译 nvbandwidth..."
  (
    set -e

    # Full tool output with VERBOSE=1, otherwise only the last 3 lines.
    _quiet() { [[ $VERBOSE -eq 1 ]] && cat || tail -3; }

    # A rebuild starts from a clean tree.
    if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$src" ]]; then
      rm -rf "$src"
    fi

    if [[ ! -d "$src/.git" ]]; then
      # Missing or not a git checkout: replace it with a fresh clone.
      if [[ -d "$src" ]]; then
        rm -rf "$src"
      fi
      git clone --depth 1 "$repo_url" "$src"
    else
      # Existing checkout: update on a best-effort basis.
      cd "$src" && git pull --ff-only 2>/dev/null || true
    fi

    cd "$src"
    mkdir -p build && cd build
    cmake .. -DCMAKE_BUILD_TYPE=Release 2>&1 | _quiet
    make -j"$JOBS" 2>&1 | _quiet

    # Publish the binary next to the sources, where the check above looks.
    if [[ -x ./nvbandwidth ]]; then
      cp ./nvbandwidth "$src/nvbandwidth"
    fi
  )

  if [[ -x "$src/nvbandwidth" ]]; then
    ok "nvbandwidth: 编译成功"
  else
    warn "nvbandwidth: 编译失败(非致命,可手动编译)"
  fi
}
#######################################
# Fetch and build NVIDIA nccl-tests under $INSTALL_DIR/nccl-tests.
# Skipped when already built (unless --rebuild), when NCCL was not
# detected, or when the CUDA directory has no include/ subdirectory.
# The build runs in a subshell so the caller's working directory is
# untouched; a failed build only produces a warning.
# Globals:   INSTALL_DIR, FLAG_REBUILD, HAS_NCCL_DEV, HAS_MPI, VERBOSE,
#            JOBS (read)
# Environment:
#   CUDA_HOME  CUDA toolkit root (default: /usr/local/cuda)
#   MPI_HOME   MPI installation prefix passed to make when MPI is
#              available (default: /usr)
#######################################
build_nccl_tests() {
  local src="$INSTALL_DIR/nccl-tests"
  local repo_url="https://github.com/NVIDIA/nccl-tests.git"

  if [[ -x "$src/build/all_reduce_perf" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    ok "nccl-tests: 已编译 ($src/build/)"
    return 0
  fi

  if [[ $HAS_NCCL_DEV -eq 0 ]]; then
    warn "nccl-tests: 跳过(libnccl-dev 未安装)"
    return 0
  fi

  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  if [[ ! -d "$cuda_home/include" ]]; then
    warn "nccl-tests: 跳过(CUDA_HOME=${cuda_home} 无效)"
    return 0
  fi

  # Assemble the make arguments once. The MPI prefix differs between
  # distributions, so it can be overridden through MPI_HOME.
  local -a make_args=()
  if [[ $HAS_MPI -eq 1 ]]; then
    make_args+=("MPI=1" "MPI_HOME=${MPI_HOME:-/usr}")
  fi
  make_args+=("CUDA_HOME=$cuda_home" "-j$JOBS")

  log "编译 nccl-tests..."
  (
    set -e

    # A rebuild starts from a clean tree.
    if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$src" ]]; then
      rm -rf "$src"
    fi

    if [[ -d "$src/.git" ]]; then
      # Existing checkout: update on a best-effort basis.
      cd "$src" && git pull --ff-only 2>/dev/null || true
    else
      # Missing or not a git checkout: replace it with a fresh clone.
      if [[ -d "$src" ]]; then
        rm -rf "$src"
      fi
      git clone --depth 1 "$repo_url" "$src"
    fi

    cd "$src"
    make "${make_args[@]}" 2>&1 | {
      # Full output with VERBOSE=1, otherwise only the last 3 lines.
      if [[ $VERBOSE -eq 1 ]]; then cat; else tail -n 3; fi
    }
  )

  if [[ -x "$src/build/all_reduce_perf" ]]; then
    ok "nccl-tests: 编译成功"
  else
    warn "nccl-tests: 编译失败(非致命)"
  fi
}
#######################################
# Fetch and build gpu-burn under $INSTALL_DIR/gpu-burn.
# Skipped when the binary already exists (unless --rebuild) or when the
# CUDA directory does not exist. The build runs in a subshell so the
# caller's working directory is untouched; a failed build only produces
# a warning.
# Globals:   INSTALL_DIR, FLAG_REBUILD, VERBOSE, JOBS (read)
# Environment: CUDA_HOME (default: /usr/local/cuda)
#######################################
build_gpu_burn() {
  local src="$INSTALL_DIR/gpu-burn"
  local repo_url="https://github.com/wilicc/gpu-burn.git"

  if [[ -x "$src/gpu_burn" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    ok "gpu-burn: 已编译 ($src/gpu_burn)"
    return 0
  fi

  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  if [[ ! -d "$cuda_home" ]]; then
    warn "gpu-burn: 跳过CUDA_HOME=$cuda_home 不存在)"
    return 0
  fi

  log "编译 gpu-burn..."
  (
    set -e

    # A rebuild starts from a clean tree.
    if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$src" ]]; then
      rm -rf "$src"
    fi

    if [[ ! -d "$src/.git" ]]; then
      # Missing or not a git checkout: replace it with a fresh clone.
      if [[ -d "$src" ]]; then
        rm -rf "$src"
      fi
      git clone --depth 1 "$repo_url" "$src"
    else
      # Existing checkout: update on a best-effort basis.
      cd "$src" && git pull --ff-only 2>/dev/null || true
    fi

    cd "$src"
    # Full output with VERBOSE=1, otherwise only the last 3 lines.
    make CUDA_PATH="$cuda_home" -j"$JOBS" 2>&1 \
      | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; }
  )

  if [[ -x "$src/gpu_burn" ]]; then
    ok "gpu-burn: 编译成功"
  else
    warn "gpu-burn: 编译失败(非致命)"
  fi
}
#######################################
# Phase 4: build the native benchmark tools, one after another.
#######################################
build_native_tools() {
  banner "阶段 4/6: 编译原生工具"

  local builder
  for builder in build_nvbandwidth build_nccl_tests build_gpu_burn; do
    "$builder"
  done
}
# ─── 阶段 5: 生成激活脚本 ─────────────────────────────────────────────────────
#######################################
# Phase 5: generate the helper scripts in INSTALL_DIR.
#   env.sh         - to be sourced; exports GPU_TOOLS_DIR and CUDA_HOME,
#                    activates the venv if present, extends PATH and
#                    LD_LIBRARY_PATH
#   run-gpu-tests  - sources env.sh, then runs gpu_tester.py from
#                    PROJECT_DIR with the given arguments
# Paths are embedded with printf %q, so directories containing spaces or
# shell/sed metacharacters are written correctly.
# Globals:   INSTALL_DIR, PROJECT_DIR (read)
# Returns:   0 on success; calls die (exit 1) if a file cannot be written
#######################################
generate_env_sh() {
  banner "阶段 5/6: 生成环境脚本"

  local env_file="$INSTALL_DIR/env.sh"
  local wrapper="$INSTALL_DIR/run-gpu-tests"

  # --- env.sh ---
  {
    printf '%s\n' '#!/usr/bin/env bash' '# GPU Test Suite 环境激活脚本'
    printf '# 用法: source %s\n' "$env_file"
    printf 'export GPU_TOOLS_DIR=%q\n' "$INSTALL_DIR"
    # Quoted delimiter: everything below is written literally. An unset or
    # empty LD_LIBRARY_PATH must not leave a trailing ':' behind, because
    # the loader treats an empty entry as the current directory.
    cat << 'ENVEOF'
export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
# 激活 Python 虚拟环境
if [[ -f "$GPU_TOOLS_DIR/.venv/bin/activate" ]]; then
source "$GPU_TOOLS_DIR/.venv/bin/activate"
fi
# 编译工具加入 PATH
export PATH="$GPU_TOOLS_DIR/nvbandwidth:$PATH"
export PATH="$GPU_TOOLS_DIR/nccl-tests/build:$PATH"
export PATH="$GPU_TOOLS_DIR/gpu-burn:$PATH"
export PATH="$CUDA_HOME/bin:$PATH"
# 库路径
export LD_LIBRARY_PATH="$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
ENVEOF
  } > "$env_file" || die "无法写入 ${env_file}"
  chmod +x "$env_file"
  ok "env.sh 已生成: $env_file"

  # --- run-gpu-tests ---
  {
    printf '%s\n' '#!/usr/bin/env bash' '# GPU Test Suite 一键运行器'
    printf '# 用法: %s --test all\n' "$wrapper"
    cat << 'WRAPEOF'
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/env.sh"
WRAPEOF
    printf 'exec python3 %q "$@"\n' "$PROJECT_DIR/gpu_tester.py"
  } > "$wrapper" || die "无法写入 ${wrapper}"
  chmod +x "$wrapper"
  ok "run-gpu-tests 已生成: $wrapper"
}
# ─── 阶段 6: 打印总结 ─────────────────────────────────────────────────────────
#######################################
# Phase 6: print the installation summary — install directory, status of
# the venv, PyTorch, native tools and RDMA utilities, and usage examples.
# Globals:   INSTALL_DIR, PROJECT_DIR, color variables (read)
#######################################
print_summary() {
  banner "阶段 6/6: 安装总结"

  local venv_python="$INSTALL_DIR/.venv/bin/python"
  # Colored status markers shared by all component lines.
  local mark_good=" ${GREEN}${NC}"
  local mark_soft=" ${YELLOW}${NC}"
  local mark_bad=" ${RED}${NC}"

  echo -e "${BOLD}安装目录:${NC} $INSTALL_DIR"
  echo ""
  echo -e "${BOLD}组件状态:${NC}"

  # Python virtual environment
  local py_ver
  if [[ ! -x "$venv_python" ]]; then
    echo -e "${mark_bad} Python venv: 未创建"
  else
    py_ver=$("$venv_python" --version 2>&1)
    echo -e "${mark_good} Python venv: $py_ver"
  fi

  # PyTorch
  local torch_info
  if "$venv_python" -c "import torch" &>/dev/null; then
    torch_info=$("$venv_python" -c "import torch; print(f'{torch.__version__} (CUDA {torch.version.cuda})')" 2>/dev/null)
    echo -e "${mark_good} PyTorch: $torch_info"
  else
    echo -e "${mark_soft} PyTorch: 未安装"
  fi

  # Native tools: each entry is "<binary path>:<display name>".
  local entry binary label
  for entry in \
    "$INSTALL_DIR/nvbandwidth/nvbandwidth:nvbandwidth" \
    "$INSTALL_DIR/nccl-tests/build/all_reduce_perf:nccl-tests" \
    "$INSTALL_DIR/gpu-burn/gpu_burn:gpu-burn"; do
    binary="${entry%%:*}"
    label="${entry##*:}"
    if [[ -x "$binary" ]]; then
      echo -e "${mark_good} $label"
    else
      echo -e "${mark_soft} $label (未编译)"
    fi
  done

  # RDMA utilities (system-wide): count how many of the three are on PATH.
  local rdma_tool rdma_found=0
  for rdma_tool in ib_write_bw ib_read_bw ibstat; do
    if command -v "$rdma_tool" &>/dev/null; then
      rdma_found=$((rdma_found + 1))
    fi
  done
  if [[ $rdma_found -gt 0 ]]; then
    echo -e "${mark_good} RDMA 工具: $rdma_found/3 可用"
  else
    echo -e "${mark_soft} RDMA 工具: 未安装 (apt install perftest infiniband-diags)"
  fi

  echo ""
  echo -e "${BOLD}使用方法:${NC}"
  printf '%s\n' \
    "" \
    " # 方式一: source 激活后使用" \
    " source $INSTALL_DIR/env.sh" \
    " python3 $PROJECT_DIR/gpu_tester.py --test all" \
    "" \
    " # 方式二: 一键运行" \
    " $INSTALL_DIR/run-gpu-tests --test all" \
    " $INSTALL_DIR/run-gpu-tests --test health" \
    " $INSTALL_DIR/run-gpu-tests # 交互式菜单" \
    ""
}
# ─── 主函数 ───────────────────────────────────────────────────────────────────
#######################################
# Script driver: parse arguments, print the title box and the effective
# directories, then run the installation phases in order.
# Arguments: the script's command-line arguments
#######################################
main() {
  parse_args "$@"

  # Title box
  echo ""
  local box_line
  for box_line in \
    "╔══════════════════════════════════════════════════╗" \
    "║ GPU Test Suite — 一键安装 ║" \
    "║ 环境隔离 · 自动检测 · 完整部署 ║" \
    "╚══════════════════════════════════════════════════╝"; do
    echo -e "${BOLD}${CYAN}${box_line}${NC}"
  done
  echo ""

  log "安装目录: $INSTALL_DIR"
  log "项目目录: $PROJECT_DIR"
  echo ""

  # Phases run strictly in this order; each one handles its own failures.
  local phase
  for phase in \
    check_permissions \
    validate_environment \
    ensure_uv \
    setup_python_venv \
    build_native_tools \
    generate_env_sh \
    print_summary; do
    "$phase"
  done

  echo -e "${GREEN}${BOLD}安装完成!${NC}"
}
# Entry point: hand all command-line arguments to main.
main "$@"