feat: rewrite install_deps.sh with env isolation and add numpy to requirements

- Complete rewrite of install_deps.sh (6-phase architecture):
  environment validation, uv-based venv isolation, CUDA auto-detection,
  idempotent native tool compilation, env.sh/run-gpu-tests generation
- Add numpy>=1.24 to requirements.txt to align with pyproject.toml
- Support --install-system-deps, --skip-pytorch, --rebuild, -y flags
- Use subshells for compilation to prevent CWD pollution
- Generate env.sh activation script and run-gpu-tests wrapper

🤖 Generated with [Qoder](https://qoder.com)
This commit is contained in:
qinyusen 2026-05-07 01:32:13 +08:00
parent 3e967dd34a
commit 24934bc182
2 changed files with 666 additions and 150 deletions

View File

@ -1,234 +1,749 @@
#!/usr/bin/env bash #!/usr/bin/env bash
set -euo pipefail # =============================================================================
# GPU Test Suite — 一键安装脚本(环境隔离版)
# 支持: A100 / A800 / H100 / H200 / B200 / B300
#
# 功能:
# 1. 环境校验(GPU、CUDA、Python、编译器等)
# 2. 自动安装 uv 并创建隔离 Python 虚拟环境
# 3. 自动检测 CUDA 版本并安装对应 PyTorch
# 4. 编译 nvbandwidth / nccl-tests / gpu-burn
# 5. 生成 env.sh 激活脚本和 run-gpu-tests 运行器
#
# 用法:
# sudo bash install_deps.sh # 标准安装
# sudo bash install_deps.sh --install-system-deps # 同时安装系统包
# sudo bash install_deps.sh --skip-pytorch # 跳过 PyTorch
# sudo bash install_deps.sh --rebuild # 强制重新编译
# sudo bash install_deps.sh -y # 非交互模式
# =============================================================================
set -uo pipefail
# ─── 全局变量 ─────────────────────────────────────────────────────────────────
INSTALL_DIR="${GPU_TOOLS_DIR:-/opt/gpu-test-tools}" INSTALL_DIR="${GPU_TOOLS_DIR:-/opt/gpu-test-tools}"
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
JOBS="${MAKE_JOBS:-$(nproc)}" JOBS="${MAKE_JOBS:-$(nproc)}"
VERBOSE="${VERBOSE:-0}" VERBOSE="${VERBOSE:-0}"
# 参数标志
FLAG_INSTALL_SYS_DEPS=0
FLAG_SKIP_PYTORCH=0
FLAG_REBUILD=0
FLAG_YES=0
# 检测结果(全局)
DETECTED_GPU=""
DETECTED_DRIVER=""
CUDA_VERSION=""
CUDA_TAG=""
PYTHON_BIN=""
HAS_MPI=0
HAS_NCCL_DEV=0
# ─── 颜色和日志 ──────────────────────────────────────────────────────────────
RED='\033[0;31m' RED='\033[0;31m'
GREEN='\033[0;32m' GREEN='\033[0;32m'
YELLOW='\033[1;33m' YELLOW='\033[1;33m'
CYAN='\033[0;36m' CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m' NC='\033[0m'
log() { echo -e "${CYAN}[INFO]${NC} $*"; } log() { echo -e "${CYAN}[INFO]${NC} $*"; }
ok() { echo -e "${GREEN}[ OK ]${NC} $*"; } ok() { echo -e "${GREEN}[ OK ]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
fail() { echo -e "${RED}[FAIL]${NC} $*"; } fail() { echo -e "${RED}[FAIL]${NC} $*"; }
die() { echo -e "${RED}[FATAL]${NC} $*"; exit 1; }
banner() { echo -e "\n${BOLD}${CYAN}══════ $* ══════${NC}\n"; }
check_root() { # 错误陷阱
if [[ $EUID -ne 0 ]]; then trap 'fail "脚本在第 $LINENO 行出错。设置 VERBOSE=1 查看详情。"' ERR
warn "Not running as root. Some installations may fail."
warn "Re-run with: sudo $0" # ─── 参数解析 ─────────────────────────────────────────────────────────────────
fi parse_args() {
while [[ $# -gt 0 ]]; do
case "$1" in
--install-system-deps) FLAG_INSTALL_SYS_DEPS=1 ;;
--skip-pytorch) FLAG_SKIP_PYTORCH=1 ;;
--rebuild) FLAG_REBUILD=1 ;;
-y|--yes) FLAG_YES=1 ;;
-h|--help)
echo "用法: $0 [选项]"
echo ""
echo "选项:"
echo " --install-system-deps 自动安装缺失的系统包"
echo " --skip-pytorch 跳过 PyTorch 安装"
echo " --rebuild 强制重新编译原生工具"
echo " -y, --yes 非交互模式"
echo " -h, --help 显示此帮助"
echo ""
echo "环境变量:"
echo " GPU_TOOLS_DIR 安装目录 (默认: /opt/gpu-test-tools)"
echo " MAKE_JOBS 编译并行数 (默认: nproc)"
echo " CUDA_HOME CUDA 安装路径 (默认: /usr/local/cuda)"
exit 0
;;
*) warn "未知参数: $1" ;;
esac
shift
done
} }
detect_gpu() { # ─── 阶段 0: 权限检查 ─────────────────────────────────────────────────────────
if ! command -v nvidia-smi &>/dev/null; then check_permissions() {
fail "nvidia-smi not found. Install NVIDIA drivers first." local parent_dir
exit 1 parent_dir="$(dirname "$INSTALL_DIR")"
if [[ ! -w "$parent_dir" ]] && [[ ! -d "$INSTALL_DIR" || ! -w "$INSTALL_DIR" ]]; then
die "无法写入 $INSTALL_DIR(请使用 sudo 或设置 GPU_TOOLS_DIR 到可写路径)"
fi fi
local gpu_name mkdir -p "$INSTALL_DIR"
gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1) }
log "Detected GPU: $gpu_name"
# ─── 阶段 1: 环境校验 ─────────────────────────────────────────────────────────
# Minimum driver major version per supported GPU family.
# Keys are matched case-insensitively against the GPU name reported by
# nvidia-smi; values are compared against the driver's major version.
# NOTE(review): the Blackwell entries (B200/B300) may need a newer driver
# branch than 550 — confirm against NVIDIA's release notes.
declare -A MIN_DRIVERS
MIN_DRIVERS["A100"]="470"
MIN_DRIVERS["A800"]="470"
MIN_DRIVERS["H100"]="535"
MIN_DRIVERS["H200"]="535"
MIN_DRIVERS["B200"]="550"
MIN_DRIVERS["B300"]="550"
#######################################
# Verify that the nvidia-smi command can be resolved.
# Outputs:  status line via ok/fail, plus an install hint on failure
# Returns:  0 when nvidia-smi is available, 1 otherwise
#######################################
check_nvidia_smi() {
  if command -v nvidia-smi &>/dev/null; then
    ok "nvidia-smi 可用"
    return 0
  fi

  fail "nvidia-smi 未找到"
  echo " → 请先安装 NVIDIA 驱动"
  return 1
}
#######################################
# Query the first GPU's name and driver version and compare the driver's
# major version against the MIN_DRIVERS table.
# Globals:  MIN_DRIVERS (read), DETECTED_GPU / DETECTED_DRIVER (written)
# Outputs:  status line via ok/warn
# Returns:  0 when GPU info was obtained (even if the driver is too old),
#           1 when nvidia-smi returned nothing
#######################################
detect_gpu_and_driver() {
  local smi_out raw_name raw_driver
  smi_out=$(nvidia-smi --query-gpu=name,driver_version --format=csv,noheader 2>/dev/null | head -1)
  if [[ -z "$smi_out" ]]; then
    warn "无法查询 GPU 信息"
    return 1
  fi

  # CSV fields: name, driver_version. `read` strips surrounding whitespace.
  raw_name="${smi_out%%,*}"
  raw_driver="${smi_out#*,}"
  raw_driver="${raw_driver%%,*}"
  read -r DETECTED_GPU <<<"$raw_name"
  read -r DETECTED_DRIVER <<<"$raw_driver"

  # Find the GPU family whose key occurs (case-insensitively) in the name.
  # `key` is local so the loop does not leak a global variable.
  local key gpu_key=""
  for key in "${!MIN_DRIVERS[@]}"; do
    if [[ "${DETECTED_GPU,,}" == *"${key,,}"* ]]; then
      gpu_key="$key"
      break
    fi
  done

  local drv_major="${DETECTED_DRIVER%%.*}"
  # Only compare when a family matched and the driver major is numeric;
  # otherwise an arithmetic test would error out on values like "N/A".
  if [[ -n "$gpu_key" && "$drv_major" =~ ^[0-9]+$ ]]; then
    local min_drv="${MIN_DRIVERS[$gpu_key]}"
    if [[ "$drv_major" -lt "$min_drv" ]]; then
      warn "驱动 $DETECTED_DRIVER < 最低要求 $min_drv ($gpu_key 需要)"
    else
      ok "GPU: $DETECTED_GPU | 驱动: $DETECTED_DRIVER (>= $min_drv)"
    fi
  else
    ok "GPU: $DETECTED_GPU | 驱动: $DETECTED_DRIVER"
  fi
  return 0
}
#######################################
# Detect the CUDA version and derive the PyTorch wheel tag from it.
# Probes, in order:
#   1. nvcc (on PATH, or under $CUDA_HOME/bin — sudo often strips the CUDA
#      bin directory from PATH)
#   2. nvidia-smi banner (driver capability only, not an installed toolkit)
#   3. version.txt / version.json under $CUDA_HOME
# Globals:  CUDA_HOME (read, default /usr/local/cuda), CUDA_VERSION (written)
# Outputs:  status lines via ok/warn/fail
# Returns:  0 when a version was found (after calling _map_cuda_tag),
#           1 otherwise
#######################################
detect_cuda_version() {
  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"

  # Method 1: nvcc — most reliable, proves a toolkit is installed.
  local nvcc_bin=""
  if command -v nvcc &>/dev/null; then
    nvcc_bin="nvcc"
  elif [[ -x "$cuda_home/bin/nvcc" ]]; then
    nvcc_bin="$cuda_home/bin/nvcc"
  fi
  if [[ -n "$nvcc_bin" ]]; then
    CUDA_VERSION=$("$nvcc_bin" --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+' | head -1)
    if [[ -n "$CUDA_VERSION" ]]; then
      ok "CUDA: $CUDA_VERSION (via $nvcc_bin)"
      _map_cuda_tag
      return 0
    fi
  fi

  # Method 2: nvidia-smi — highest CUDA version the driver supports.
  local smi_cuda
  smi_cuda=$(nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+' | head -1)
  if [[ -n "$smi_cuda" ]]; then
    CUDA_VERSION="$smi_cuda"
    warn "CUDA: $CUDA_VERSION (via nvidia-smi — 仅代表驱动能力,非已安装 toolkit)"
    warn " → 若编译失败,请安装 CUDA Toolkit: apt install cuda-toolkit-${CUDA_VERSION/./-}"
    _map_cuda_tag
    return 0
  fi

  # Method 3: version file shipped inside the toolkit directory.
  local ver_file=""
  if [[ -f "$cuda_home/version.txt" ]]; then
    ver_file="$cuda_home/version.txt"
    CUDA_VERSION=$(grep -oP '[0-9]+\.[0-9]+' "$ver_file" | head -1)
  elif [[ -f "$cuda_home/version.json" ]]; then
    ver_file="$cuda_home/version.json"
    # The first "version" entry in the file is taken as the toolkit version.
    CUDA_VERSION=$(grep -oP '"version"\s*:\s*"\K[0-9]+\.[0-9]+' "$ver_file" | head -1)
  fi
  if [[ -n "$ver_file" && -n "$CUDA_VERSION" ]]; then
    ok "CUDA: $CUDA_VERSION (via $ver_file)"
    _map_cuda_tag
    return 0
  fi

  fail "无法检测 CUDA 版本"
  echo " → 请安装 CUDA Toolkit: https://developer.nvidia.com/cuda-downloads"
  return 1
}
#######################################
# Translate CUDA_VERSION ("major.minor") into a PyTorch wheel index tag.
#   11.x        -> cu118
#   12.0 - 12.1 -> cu121
#   12.2 - 12.4 -> cu124
#   12.5+       -> cu128
#   other major -> cu128, with a warning
# Globals:  CUDA_VERSION (read), CUDA_TAG (written)
# Outputs:  chosen tag via log; warning for unrecognised major versions
#######################################
_map_cuda_tag() {
  local ver_major ver_minor
  ver_major="${CUDA_VERSION%%.*}"
  ver_minor="${CUDA_VERSION#*.}"
  ver_minor="${ver_minor%%.*}"

  local tag="cu128"
  local recognised=1
  if [[ "$ver_major" -eq 11 ]]; then
    tag="cu118"
  elif [[ "$ver_major" -eq 12 ]]; then
    if [[ "$ver_minor" -le 1 ]]; then
      tag="cu121"
    elif [[ "$ver_minor" -le 4 ]]; then
      tag="cu124"
    fi
  else
    recognised=0
  fi

  CUDA_TAG="$tag"
  if [[ "$recognised" -eq 0 ]]; then
    warn "未知 CUDA $CUDA_VERSION,默认使用 cu128 索引"
  fi
  log "PyTorch wheel 索引: $CUDA_TAG"
}
#######################################
# Locate a Python interpreter with version >= 3.10.
# Candidates are tried in order: python3.12, python3.11, python3.10, python3.
# Globals:  PYTHON_BIN (written with the resolved command path)
# Outputs:  status line via ok/fail, plus an install hint on failure
# Returns:  0 when a suitable interpreter was found, 1 otherwise
#######################################
check_python() {
  local candidate ver py_major py_minor
  local py_cmd=""

  for candidate in python3.12 python3.11 python3.10 python3; do
    command -v "$candidate" &>/dev/null || continue
    ver=$("$candidate" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null) || continue
    # Skip interpreters whose output is not a plain "major.minor" pair.
    [[ "$ver" =~ ^([0-9]+)\.([0-9]+)$ ]] || continue
    py_major=$((10#${BASH_REMATCH[1]}))
    py_minor=$((10#${BASH_REMATCH[2]}))
    # Compare as a (major, minor) pair: any major above 3 qualifies
    # regardless of its minor number.
    if (( py_major > 3 || (py_major == 3 && py_minor >= 10) )); then
      py_cmd="$candidate"
      break
    fi
  done

  if [[ -z "$py_cmd" ]]; then
    fail "Python >= 3.10 未找到"
    echo " → apt install python3.11 python3.11-venv"
    return 1
  fi

  PYTHON_BIN="$(command -v "$py_cmd")"
  ok "Python: $("$py_cmd" --version 2>&1) ($PYTHON_BIN)"
  return 0
}
#######################################
# Verify that cmake is installed and is at least version 3.18.
# Outputs:  status line via ok/fail, plus a remediation hint on failure
# Returns:  0 when cmake >= 3.18 is available, 1 otherwise
#######################################
check_cmake() {
  command -v cmake &>/dev/null || {
    fail "cmake 未找到(编译 nvbandwidth 需要 >= 3.18"
    echo " → apt install cmake"
    return 1
  }

  # Take "major.minor" from the first line of `cmake --version`.
  local detected ver_major ver_minor
  detected=$(cmake --version | head -1 | grep -oP '[0-9]+\.[0-9]+')
  ver_major="${detected%%.*}"
  ver_minor="${detected#*.}"

  local too_old=0
  if [[ "$ver_major" -lt 3 ]]; then
    too_old=1
  elif [[ "$ver_major" -eq 3 ]] && [[ "$ver_minor" -lt 18 ]]; then
    too_old=1
  fi

  if [[ "$too_old" -eq 1 ]]; then
    fail "cmake $detected < 3.18nvbandwidth 需要 >= 3.18"
    echo " → 升级 cmake: pip install cmake 或从源码安装"
    return 1
  fi

  ok "cmake: $detected"
  return 0
}
#######################################
# Verify that both gcc and g++ can be resolved, and report gcc's version.
# Outputs:  status line via ok/fail, plus an install hint on failure
# Returns:  0 when both compilers are present, 1 otherwise
#######################################
check_compiler() {
  local tool
  for tool in gcc g++; do
    if ! command -v "$tool" &>/dev/null; then
      fail "gcc/g++ 未找到"
      echo " → apt install build-essential"
      return 1
    fi
  done

  local detected
  detected=$(gcc -dumpversion 2>/dev/null)
  ok "gcc/g++: $detected"
  return 0
}
#######################################
# Detect an MPI launcher (mpirun or mpiexec). MPI is optional: without it
# nccl-tests is built in non-MPI mode.
# Globals:  HAS_MPI (written: 1 when a launcher exists, else 0)
# Outputs:  status line via ok/warn, plus an install hint when missing
# Returns:  always 0
#######################################
check_mpi() {
  local launcher
  HAS_MPI=0

  for launcher in mpirun mpiexec; do
    if command -v "$launcher" &>/dev/null; then
      HAS_MPI=1
      # Query the launcher that was actually found, so a system with only
      # mpiexec does not report "mpirun: command not found" as its version.
      ok "MPI: $("$launcher" --version 2>&1 | head -1)"
      return 0
    fi
  done

  warn "mpirun 未找到nccl-tests 将不使用 MPI 模式)"
  echo " → apt install openmpi-bin libopenmpi-dev"
  return 0
}
#######################################
# Detect whether the NCCL development files needed to compile nccl-tests
# are present. A runtime-only libnccl in the ldconfig cache is not enough,
# because the build needs nccl.h.
# Globals:  CUDA_HOME (read, default /usr/local/cuda),
#           HAS_NCCL_DEV (written: 1 when dev files were found, else 0)
# Outputs:  status line via ok/warn, plus an install hint when missing
# Returns:  always 0 (NCCL is optional)
#######################################
check_nccl_dev() {
  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  local header
  HAS_NCCL_DEV=0

  # Header locations checked: the two system include directories and the
  # CUDA toolkit's include directory.
  for header in /usr/include/nccl.h /usr/local/include/nccl.h "$cuda_home/include/nccl.h"; do
    if [[ -f "$header" ]]; then
      HAS_NCCL_DEV=1
      ok "libnccl-dev: 已安装 ($header)"
      return 0
    fi
  done

  # Fall back to the package database. The Status field is checked so that
  # a removed-but-configured package does not count as installed.
  if command -v dpkg-query &>/dev/null \
    && dpkg-query -W -f='${Status}' libnccl-dev 2>/dev/null | grep -q 'install ok installed'; then
    HAS_NCCL_DEV=1
    ok "libnccl-dev: 已安装"
    return 0
  fi

  if ldconfig -p 2>/dev/null | grep -q libnccl; then
    warn "libnccl 运行库已找到,但缺少 nccl.h(将跳过 nccl-tests 编译)"
  else
    warn "libnccl-dev 未找到(将跳过 nccl-tests 编译)"
  fi
  echo " → apt install libnccl-dev libnccl2"
  return 0
}
install_system_deps() { install_system_deps() {
log "Installing system dependencies..." log "安装系统依赖包..."
if command -v apt-get &>/dev/null; then if command -v apt-get &>/dev/null; then
apt-get update -qq apt-get update -qq
apt-get install -y -qq build-essential git cmake wget curl \ apt-get install -y -qq build-essential git cmake wget curl \
openmpi-bin libopenmpi-dev openssh-client \ openmpi-bin libopenmpi-dev openssh-client \
infiniband-diags ibverbs-utils perftest \ infiniband-diags ibverbs-utils perftest \
python3 python3-pip python3-venv \ python3 python3-pip python3-venv \
2>/dev/null || warn "Some apt packages failed (may already be installed)" libnccl-dev libnccl2 \
elif command -v yum &>/dev/null; then 2>/dev/null || warn "部分包安装失败(可能已安装)"
yum groupinstall -y "Development Tools" 2>/dev/null || true
yum install -y git cmake wget curl \
openmpi openmpi-devel openssh-clients \
infiniband-diags libibverbs-utils perftest \
python3 python3-pip \
2>/dev/null || warn "Some yum packages failed"
elif command -v dnf &>/dev/null; then elif command -v dnf &>/dev/null; then
dnf groupinstall -y "Development Tools" 2>/dev/null || true dnf groupinstall -y "Development Tools" 2>/dev/null || true
dnf install -y git cmake wget curl \ dnf install -y git cmake wget curl \
openmpi openmpi-devel openssh-clients \ openmpi openmpi-devel openssh-clients \
infiniband-diags libibverbs-utils perftest \ infiniband-diags libibverbs-utils perftest \
python3 python3-pip \ python3 python3-pip \
2>/dev/null || warn "Some dnf packages failed" 2>/dev/null || warn "部分包安装失败"
elif command -v yum &>/dev/null; then
yum groupinstall -y "Development Tools" 2>/dev/null || true
yum install -y git cmake wget curl \
openmpi openmpi-devel openssh-clients \
infiniband-diags libibverbs-utils perftest \
python3 python3-pip \
2>/dev/null || warn "部分包安装失败"
else else
warn "Unsupported package manager. Install deps manually." warn "未识别的包管理器,请手动安装依赖"
fi fi
ok "System dependencies" ok "系统依赖安装完成"
} }
install_python_deps() { validate_environment() {
log "Installing Python dependencies..." banner "阶段 1/6: 环境校验"
pip3 install --quiet rich pyyaml 2>/dev/null || pip install --quiet rich pyyaml
ok "Python dependencies (rich, pyyaml)" local errors=0
check_nvidia_smi || ((errors++))
detect_gpu_and_driver || true
detect_cuda_version || ((errors++))
check_python || ((errors++))
check_cmake || ((errors++))
check_compiler || ((errors++))
check_mpi || true
check_nccl_dev || true
echo ""
if [[ $errors -gt 0 ]]; then
fail "环境校验发现 $errors 个必要组件缺失"
if [[ $FLAG_INSTALL_SYS_DEPS -eq 1 ]]; then
log "检测到 --install-system-deps尝试安装..."
install_system_deps
# 重新校验
errors=0
check_python || ((errors++))
check_cmake || ((errors++))
check_compiler || ((errors++))
check_mpi || true
check_nccl_dev || true
if [[ $errors -gt 0 ]]; then
die "安装系统包后仍有 $errors 个组件缺失,请手动解决"
fi
else
echo ""
echo " 提示: 加 --install-system-deps 参数可自动安装缺失的系统包"
echo " 或手动运行上面提示的 apt install 命令后重试"
die "环境校验未通过"
fi
fi
ok "环境校验通过"
} }
install_nvbandwidth() { # ─── 阶段 2: 安装 uv ──────────────────────────────────────────────────────────
log "Installing nvbandwidth..." ensure_uv() {
banner "阶段 2/6: 确保 uv 可用"
# 检查已有的 uv
if command -v uv &>/dev/null; then
ok "uv 已安装: $(uv --version 2>&1)"
return 0
fi
# 检查常见位置
for p in "$HOME/.local/bin/uv" "$HOME/.cargo/bin/uv" /usr/local/bin/uv; do
if [[ -x "$p" ]]; then
export PATH="$(dirname "$p"):$PATH"
ok "uv 已找到: $p"
return 0
fi
done
log "正在安装 uv..."
if ! curl -LsSf https://astral.sh/uv/install.sh | sh 2>/dev/null; then
die "uv 安装失败。请手动安装: https://docs.astral.sh/uv/getting-started/installation/"
fi
# 将 uv 加入 PATH
export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
if ! command -v uv &>/dev/null; then
die "uv 安装后仍无法找到。请检查 PATH。"
fi
ok "uv 安装成功: $(uv --version 2>&1)"
}
# ─── 阶段 3: Python 虚拟环境 ──────────────────────────────────────────────────
#######################################
# Print stdin unchanged when VERBOSE=1, otherwise only its last N lines.
# Arguments: $1 - number of trailing lines to keep in quiet mode
#######################################
_tail_unless_verbose() {
  if [[ "${VERBOSE:-0}" -eq 1 ]]; then
    cat
  else
    tail -n "$1"
  fi
}

#######################################
# Create (or reuse) the isolated virtual environment under
# $INSTALL_DIR/.venv, install the project into it in editable mode, and
# optionally install PyTorch from the wheel index selected by CUDA_TAG.
# Globals:  INSTALL_DIR, PROJECT_DIR, PYTHON_BIN, CUDA_TAG, VERBOSE,
#           FLAG_REBUILD, FLAG_SKIP_PYTORCH (all read)
# Outputs:  progress via banner/log/ok/warn
# Returns:  exits through die when the venv cannot be created; dependency
#           and PyTorch install failures are reported as warnings only
#######################################
setup_python_venv() {
  banner "阶段 3/6: 创建 Python 虚拟环境"

  local venv_dir="$INSTALL_DIR/.venv"
  local venv_py="$venv_dir/bin/python"
  local rc

  # Reuse an existing venv only when its interpreter is >= 3.10.
  if [[ -x "$venv_py" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    local existing_ver ev_major=0 ev_minor=0
    existing_ver=$("$venv_py" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "0.0")
    if [[ "$existing_ver" =~ ^([0-9]+)\.([0-9]+)$ ]]; then
      ev_major=$((10#${BASH_REMATCH[1]}))
      ev_minor=$((10#${BASH_REMATCH[2]}))
    fi
    # Compare as a (major, minor) pair rather than each part independently.
    if (( ev_major > 3 || (ev_major == 3 && ev_minor >= 10) )); then
      ok "虚拟环境已存在: $venv_dir (Python $existing_ver)"
    else
      log "已有 venv 的 Python 版本过低 ($existing_ver),重建中..."
      rm -rf -- "${venv_dir:?}"
    fi
  fi

  # Create the venv. Every later step needs it, so failure here is fatal.
  if [[ ! -x "$venv_py" ]]; then
    log "创建虚拟环境: $venv_dir"
    uv venv "$venv_dir" --python "$PYTHON_BIN" \
      || die "虚拟环境创建失败: $venv_dir"
    ok "虚拟环境创建成功"
  fi

  # Install project dependencies. PIPESTATUS is read directly so a failure
  # is detected even without `set -o pipefail`, and success is only
  # reported when uv actually succeeded.
  log "安装 Python 依赖rich、pyyaml、numpy..."
  uv pip install --python "$venv_py" -e "$PROJECT_DIR" 2>&1 | _tail_unless_verbose 1
  rc=${PIPESTATUS[0]}
  if [[ $rc -eq 0 ]]; then
    ok "项目依赖安装完成"
  else
    warn "项目依赖安装失败 (uv 退出码 $rc),可稍后手动安装:"
    echo " uv pip install --python $venv_py -e $PROJECT_DIR"
  fi

  # Install PyTorch unless skipped or already importable.
  if [[ $FLAG_SKIP_PYTORCH -eq 1 ]]; then
    warn "跳过 PyTorch 安装(--skip-pytorch"
    return 0
  fi

  local torch_ver
  if "$venv_py" -c "import torch" &>/dev/null && [[ $FLAG_REBUILD -eq 0 ]]; then
    torch_ver=$("$venv_py" -c "import torch; print(torch.__version__)" 2>/dev/null)
    ok "PyTorch 已安装: $torch_ver"
    return 0
  fi

  local index_url="https://download.pytorch.org/whl/${CUDA_TAG}"
  log "安装 PyTorch (CUDA $CUDA_TAG): $index_url"
  log "(下载较大,请耐心等待..."
  uv pip install --python "$venv_py" "torch>=2.1.0" --index-url "$index_url" 2>&1 \
    | _tail_unless_verbose 3
  rc=${PIPESTATUS[0]}
  if [[ $rc -ne 0 ]]; then
    warn "PyTorch 安装失败,可稍后手动安装:"
    echo " source $INSTALL_DIR/env.sh"
    echo " uv pip install torch --index-url $index_url"
  fi

  if "$venv_py" -c "import torch" &>/dev/null; then
    torch_ver=$("$venv_py" -c "import torch; print(torch.__version__)" 2>/dev/null)
    ok "PyTorch 安装成功: $torch_ver"
  fi
}
# ─── 阶段 4: 编译原生工具 ─────────────────────────────────────────────────────
build_nvbandwidth() {
local src="$INSTALL_DIR/nvbandwidth" local src="$INSTALL_DIR/nvbandwidth"
if [[ -x "$src/nvbandwidth" ]]; then
ok "nvbandwidth already installed at $src/nvbandwidth" # 幂等检查
return if [[ -x "$src/nvbandwidth" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
ok "nvbandwidth: 已编译 ($src/nvbandwidth)"
return 0
fi
log "编译 nvbandwidth..."
(
set -e
# 清理 / 克隆
if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$src" ]]; then
rm -rf "$src"
fi
if [[ -d "$src/.git" ]]; then
cd "$src" && git pull --ff-only 2>/dev/null || true
elif [[ -d "$src" ]]; then
rm -rf "$src"
git clone --depth 1 https://github.com/NVIDIA/nvbandwidth.git "$src"
else
git clone --depth 1 https://github.com/NVIDIA/nvbandwidth.git "$src"
fi fi
mkdir -p "$INSTALL_DIR"
git clone --depth 1 https://github.com/NVIDIA/nvbandwidth.git "$src" 2>/dev/null
cd "$src" cd "$src"
mkdir -p build && cd build mkdir -p build && cd build
cmake .. -DCMAKE_BUILD_TYPE=Release 2>/dev/null cmake .. -DCMAKE_BUILD_TYPE=Release 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; }
make -j"$JOBS" 2>/dev/null make -j"$JOBS" 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; }
if [[ -x "$src/build/nvbandwidth" ]]; then
cp "$src/build/nvbandwidth" "$src/nvbandwidth" if [[ -x ./nvbandwidth ]]; then
ok "nvbandwidth installed at $src/nvbandwidth" cp ./nvbandwidth "$src/nvbandwidth"
fi
)
if [[ -x "$src/nvbandwidth" ]]; then
ok "nvbandwidth: 编译成功"
else else
warn "nvbandwidth build failed. Try building manually in $src" warn "nvbandwidth: 编译失败(非致命,可手动编译)"
fi fi
} }
install_nccl_tests() { build_nccl_tests() {
log "Installing nccl-tests..."
local src="$INSTALL_DIR/nccl-tests" local src="$INSTALL_DIR/nccl-tests"
if [[ -x "$src/build/all_reduce_perf" ]]; then
ok "nccl-tests already installed at $src/build/" if [[ -x "$src/build/all_reduce_perf" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
return ok "nccl-tests: 已编译 ($src/build/)"
return 0
fi
if [[ $HAS_NCCL_DEV -eq 0 ]]; then
warn "nccl-tests: 跳过libnccl-dev 未安装)"
return 0
fi
local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
if [[ ! -d "$cuda_home/include" ]]; then
warn "nccl-tests: 跳过CUDA_HOME=$cuda_home 无效)"
return 0
fi
log "编译 nccl-tests..."
(
set -e
if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$src" ]]; then
rm -rf "$src"
fi
if [[ -d "$src/.git" ]]; then
cd "$src" && git pull --ff-only 2>/dev/null || true
elif [[ -d "$src" ]]; then
rm -rf "$src"
git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$src"
else
git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$src"
fi fi
mkdir -p "$INSTALL_DIR"
git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$src" 2>/dev/null
cd "$src" cd "$src"
if [[ $HAS_MPI -eq 1 ]]; then
make MPI=1 MPI_HOME=/usr CUDA_HOME="$cuda_home" -j"$JOBS" \
2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; }
else
make CUDA_HOME="$cuda_home" -j"$JOBS" \
2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; }
fi
)
if [[ -x "$src/build/all_reduce_perf" ]]; then
ok "nccl-tests: 编译成功"
else
warn "nccl-tests: 编译失败(非致命)"
fi
}
build_gpu_burn() {
local src="$INSTALL_DIR/gpu-burn"
if [[ -x "$src/gpu_burn" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
ok "gpu-burn: 已编译 ($src/gpu_burn)"
return 0
fi
local cuda_home="${CUDA_HOME:-/usr/local/cuda}" local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
if [[ ! -d "$cuda_home" ]]; then if [[ ! -d "$cuda_home" ]]; then
warn "CUDA_HOME not found at $cuda_home. Set CUDA_HOME env var." warn "gpu-burn: 跳过CUDA_HOME=$cuda_home 不存在)"
return return 0
fi fi
make MPI=1 MPI_HOME=/usr -j"$JOBS" 2>/dev/null || \ log "编译 gpu-burn..."
make CUDA_HOME="$cuda_home" -j"$JOBS" 2>/dev/null || \ (
warn "nccl-tests build failed. Try: cd $src && make MPI=1" set -e
if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$src" ]]; then
if [[ -x "$src/build/all_reduce_perf" ]]; then rm -rf "$src"
ok "nccl-tests installed at $src/build/" fi
if [[ -d "$src/.git" ]]; then
cd "$src" && git pull --ff-only 2>/dev/null || true
elif [[ -d "$src" ]]; then
rm -rf "$src"
git clone --depth 1 https://github.com/wilicc/gpu-burn.git "$src"
else else
warn "nccl-tests build incomplete" git clone --depth 1 https://github.com/wilicc/gpu-burn.git "$src"
fi
}
install_gpu_burn() {
log "Installing gpu-burn..."
local src="$INSTALL_DIR/gpu-burn"
if [[ -x "$src/gpu_burn" ]]; then
ok "gpu-burn already installed at $src/gpu_burn"
return
fi fi
mkdir -p "$INSTALL_DIR"
git clone --depth 1 https://github.com/wilicc/gpu-burn.git "$src" 2>/dev/null
cd "$src" cd "$src"
make -j"$JOBS" 2>/dev/null || warn "gpu-burn build failed" make CUDA_PATH="$cuda_home" -j"$JOBS" \
2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; }
)
if [[ -x "$src/gpu_burn" ]]; then if [[ -x "$src/gpu_burn" ]]; then
ok "gpu-burn installed at $src/gpu_burn" ok "gpu-burn: 编译成功"
else else
warn "gpu-burn build incomplete" warn "gpu-burn: 编译失败(非致命)"
fi fi
} }
check_dcgm() { build_native_tools() {
log "Checking DCGM..." banner "阶段 4/6: 编译原生工具"
if command -v nv-hostengine &>/dev/null || command -v dcgmi &>/dev/null; then build_nvbandwidth
ok "DCGM already installed" build_nccl_tests
return build_gpu_burn
fi
if dpkg -l datacenter-gpu-manager &>/dev/null 2>&1; then
ok "DCGM package installed"
return
fi
warn "DCGM not found. Install from: https://docs.nvidia.com/datacenter/dcgm/latest/installation-guide.html"
warn " Ubuntu: sudo apt install datacenter-gpu-manager"
warn " Or: curl -fsSL https://deb.nvidia.com/datacenter-gpu-manager/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-dcgm.gpg"
} }
check_rdma_tools() { # ─── 阶段 5: 生成激活脚本 ─────────────────────────────────────────────────────
log "Checking RDMA tools..." generate_env_sh() {
local found=0 banner "阶段 5/6: 生成环境脚本"
for tool in ib_write_bw ib_read_bw ib_write_lat ib_read_lat ibstat ibv_devinfo; do
if command -v "$tool" &>/dev/null; then local env_file="$INSTALL_DIR/env.sh"
found=$((found + 1)) cat > "$env_file" << 'ENVEOF'
else #!/usr/bin/env bash
warn " $tool not found (install: perftest infiniband-diags)" # GPU Test Suite 环境激活脚本
fi # 用法: source /opt/gpu-test-tools/env.sh
done
if [[ $found -gt 0 ]]; then export GPU_TOOLS_DIR="__INSTALL_DIR__"
ok "$found/$RDMA_TOOL_COUNT RDMA tools found" 2>/dev/null || ok "Some RDMA tools found" export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
# 激活 Python 虚拟环境
if [[ -f "$GPU_TOOLS_DIR/.venv/bin/activate" ]]; then
source "$GPU_TOOLS_DIR/.venv/bin/activate"
fi fi
# 编译工具加入 PATH
export PATH="$GPU_TOOLS_DIR/nvbandwidth:$PATH"
export PATH="$GPU_TOOLS_DIR/nccl-tests/build:$PATH"
export PATH="$GPU_TOOLS_DIR/gpu-burn:$PATH"
export PATH="$CUDA_HOME/bin:$PATH"
# 库路径
export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
ENVEOF
# 替换占位符
sed -i "s|__INSTALL_DIR__|$INSTALL_DIR|g" "$env_file"
chmod +x "$env_file"
ok "env.sh 已生成: $env_file"
# 生成 run-gpu-tests 运行器
local wrapper="$INSTALL_DIR/run-gpu-tests"
cat > "$wrapper" << WRAPEOF
#!/usr/bin/env bash
# GPU Test Suite 一键运行器
# 用法: /opt/gpu-test-tools/run-gpu-tests --test all
SCRIPT_DIR="\$(cd "\$(dirname "\${BASH_SOURCE[0]}")" && pwd)"
source "\$SCRIPT_DIR/env.sh"
exec python3 "$PROJECT_DIR/gpu_tester.py" "\$@"
WRAPEOF
chmod +x "$wrapper"
ok "run-gpu-tests 已生成: $wrapper"
} }
# ─── 阶段 6: 打印总结 ─────────────────────────────────────────────────────────
print_summary() { print_summary() {
echo "" banner "阶段 6/6: 安装总结"
echo "=========================================="
echo " GPU Test Suite - Installation Summary"
echo "=========================================="
echo ""
echo " Install directory: $INSTALL_DIR"
echo ""
echo " Tools status:"
for tool_path in \ echo -e "${BOLD}安装目录:${NC} $INSTALL_DIR"
echo ""
echo -e "${BOLD}组件状态:${NC}"
# Python 虚拟环境
if [[ -x "$INSTALL_DIR/.venv/bin/python" ]]; then
local py_ver
py_ver=$("$INSTALL_DIR/.venv/bin/python" --version 2>&1)
echo -e " ${GREEN}${NC} Python venv: $py_ver"
else
echo -e " ${RED}${NC} Python venv: 未创建"
fi
# PyTorch
if "$INSTALL_DIR/.venv/bin/python" -c "import torch" &>/dev/null 2>&1; then
local tv
tv=$("$INSTALL_DIR/.venv/bin/python" -c "import torch; print(f'{torch.__version__} (CUDA {torch.version.cuda})')" 2>/dev/null)
echo -e " ${GREEN}${NC} PyTorch: $tv"
else
echo -e " ${YELLOW}${NC} PyTorch: 未安装"
fi
# 编译工具
for tool_info in \
"$INSTALL_DIR/nvbandwidth/nvbandwidth:nvbandwidth" \ "$INSTALL_DIR/nvbandwidth/nvbandwidth:nvbandwidth" \
"$INSTALL_DIR/nccl-tests/build/all_reduce_perf:nccl-tests" \ "$INSTALL_DIR/nccl-tests/build/all_reduce_perf:nccl-tests" \
"$INSTALL_DIR/gpu-burn/gpu_burn:gpu-burn"; do "$INSTALL_DIR/gpu-burn/gpu_burn:gpu-burn"; do
path="${tool_path%%:*}" local path="${tool_info%%:*}"
name="${tool_path##*:}" local name="${tool_info##*:}"
if [[ -x "$path" ]]; then if [[ -x "$path" ]]; then
echo -e " [${GREEN}${NC}] $name" echo -e " ${GREEN}${NC} $name"
else else
echo -e " [${YELLOW}?${NC}] $name (not built)" echo -e " ${YELLOW}${NC} $name (未编译)"
fi fi
done done
echo "" # RDMA 工具(系统级)
echo " System tools:" local rdma_found=0
for cmd in nvidia-smi mpirun nvbandwidth ib_write_bw dcgmi; do for tool in ib_write_bw ib_read_bw ibstat; do
if command -v "$cmd" &>/dev/null; then if command -v "$tool" &>/dev/null; then
echo -e " [${GREEN}${NC}] $cmd" ((rdma_found++))
else
echo -e " [${YELLOW}-${NC}] $cmd (not found)"
fi fi
done done
if [[ $rdma_found -gt 0 ]]; then
echo -e " ${GREEN}${NC} RDMA 工具: $rdma_found/3 可用"
else
echo -e " ${YELLOW}${NC} RDMA 工具: 未安装 (apt install perftest infiniband-diags)"
fi
echo "" echo ""
echo " Usage:" echo -e "${BOLD}使用方法:${NC}"
echo " python3 gpu_tester.py # Interactive menu" echo ""
echo " python3 gpu_tester.py --test all # Full suite" echo " # 方式一: source 激活后使用"
echo " source $INSTALL_DIR/env.sh"
echo " python3 $PROJECT_DIR/gpu_tester.py --test all"
echo ""
echo " # 方式二: 一键运行"
echo " $INSTALL_DIR/run-gpu-tests --test all"
echo " $INSTALL_DIR/run-gpu-tests --test health"
echo " $INSTALL_DIR/run-gpu-tests # 交互式菜单"
echo "" echo ""
} }
# ─── 主函数 ───────────────────────────────────────────────────────────────────
main() { main() {
parse_args "$@"
echo "" echo ""
echo "==========================================" echo -e "${BOLD}${CYAN}╔══════════════════════════════════════════════════╗${NC}"
echo " GPU Test Suite - Dependency Installer" echo -e "${BOLD}${CYAN}║ GPU Test Suite — 一键安装 ║${NC}"
echo "==========================================" echo -e "${BOLD}${CYAN}║ 环境隔离 · 自动检测 · 完整部署 ║${NC}"
echo -e "${BOLD}${CYAN}╚══════════════════════════════════════════════════╝${NC}"
echo ""
log "安装目录: $INSTALL_DIR"
log "项目目录: $PROJECT_DIR"
echo "" echo ""
check_root check_permissions
detect_gpu validate_environment
ensure_uv
mkdir -p "$INSTALL_DIR" setup_python_venv
build_native_tools
install_system_deps generate_env_sh
install_python_deps
install_nvbandwidth
install_nccl_tests
install_gpu_burn
check_dcgm
check_rdma_tools
print_summary print_summary
echo -e "${GREEN}${BOLD}安装完成!${NC}"
} }
main "$@" main "$@"

View File

@ -1,2 +1,3 @@
rich>=13.0 rich>=13.0
pyyaml>=6.0 pyyaml>=6.0
numpy>=1.24