#!/usr/bin/env bash
# =============================================================================
# GPU Test Suite — one-shot installer (isolated-environment edition)
# Supported GPUs: A100 / A800 / H100 / H200 / B200 / B300
#
# What it does:
#   1. Validates the environment (GPU, CUDA, Python, compilers, ...)
#   2. Installs uv if needed and creates an isolated Python virtual environment
#   3. Detects the CUDA version and installs the matching PyTorch build
#   4. Builds nvbandwidth / nccl-tests / gpu-burn
#   5. Generates the env.sh activation script and the run-gpu-tests runner
#
# Usage:
#   sudo bash install_deps.sh                        # standard install
#   sudo bash install_deps.sh --install-system-deps  # also install system packages
#   sudo bash install_deps.sh --skip-pytorch         # skip PyTorch
#   sudo bash install_deps.sh --rebuild              # force a rebuild
#   sudo bash install_deps.sh -y                     # non-interactive mode
#
# Environment variables:
#   GPU_TOOLS_DIR  install directory        (default: /opt/gpu-test-tools)
#   MAKE_JOBS      parallel build jobs      (default: nproc)
#   CUDA_HOME      CUDA install path        (default: /usr/local/cuda)
#   MPI_HOME       MPI prefix for nccl-tests (default: /usr)
#   VERBOSE        1 = show full tool output instead of the last few lines
# =============================================================================

# NOTE: -e is deliberately not set; most steps are best-effort and report
# their own failures. Build subshells enable -e locally.
set -uo pipefail

# ─── Globals ─────────────────────────────────────────────────────────────────
INSTALL_DIR="${GPU_TOOLS_DIR:-/opt/gpu-test-tools}"
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
JOBS="${MAKE_JOBS:-$(nproc 2>/dev/null || echo 1)}"
VERBOSE="${VERBOSE:-0}"

# Command-line flags
FLAG_INSTALL_SYS_DEPS=0
FLAG_SKIP_PYTORCH=0
FLAG_REBUILD=0
# NOTE(review): FLAG_YES is parsed but not consulted by any function visible
# in this file — confirm whether an interactive prompt was planned.
FLAG_YES=0

# Detection results (global, filled in by the check_* / detect_* functions)
DETECTED_GPU=""
DETECTED_DRIVER=""
CUDA_VERSION=""
CUDA_TAG=""
PYTHON_BIN=""
HAS_MPI=0
HAS_NCCL_DEV=0

# ─── Colors and logging ──────────────────────────────────────────────────────
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Log helpers. The color codes go through %b (escape interpretation) while the
# message goes through %s, so backslashes in paths are printed literally.
log()    { printf '%b[INFO]%b %s\n' "$CYAN" "$NC" "$*"; }
ok()     { printf '%b[ OK ]%b %s\n' "$GREEN" "$NC" "$*"; }
warn()   { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }
fail()   { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*"; }
die()    { printf '%b[FATAL]%b %s\n' "$RED" "$NC" "$*"; exit 1; }
banner() { printf '\n%b══════ %s ══════%b\n\n' "${BOLD}${CYAN}" "$*" "$NC"; }

# Error trap. errtrace (set -E) is not enabled, so bash does not propagate this
# trap into functions; it only reports failures of top-level commands.
trap 'fail "脚本在第 $LINENO 行出错。设置 VERBOSE=1 查看详情。"' ERR

# ─── Small shared helpers ────────────────────────────────────────────────────

# Pass stdin through unchanged when VERBOSE=1, otherwise keep the last $1 lines.
_filter_output() {
  local keep="$1"
  if [[ "$VERBOSE" == "1" ]]; then
    cat
  else
    tail -n "$keep"
  fi
}

# Return 0 when the "major.minor" version string in $1 is Python >= 3.10.
_python_version_ok() {
  local ver="$1"
  [[ "$ver" =~ ^([0-9]+)\.([0-9]+)$ ]] || return 1
  local major="${BASH_REMATCH[1]}" minor="${BASH_REMATCH[2]}"
  (( 10#$major > 3 || (10#$major == 3 && 10#$minor >= 10) ))
}

# Clone $1 into $2, or fast-forward an existing checkout.
# With --rebuild the directory is removed first. Intended to run inside a
# `set -e` subshell: a failed clone aborts the caller's build.
_sync_repo() {
  local url="$1" dest="$2"
  if [[ $FLAG_REBUILD -eq 1 && -d "$dest" ]]; then
    rm -rf -- "${dest:?}"
  fi
  if [[ -d "$dest/.git" ]]; then
    # Best effort: a failed pull just builds the tree that is already there.
    git -C "$dest" pull --ff-only 2>/dev/null || true
  else
    # Either missing or a directory without git metadata: start clean.
    rm -rf -- "${dest:?}"
    git clone --depth 1 "$url" "$dest"
  fi
}

# ─── Argument parsing ────────────────────────────────────────────────────────

# Set the FLAG_* globals from the command line. Unknown arguments only warn.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --install-system-deps) FLAG_INSTALL_SYS_DEPS=1 ;;
      --skip-pytorch)        FLAG_SKIP_PYTORCH=1 ;;
      --rebuild)             FLAG_REBUILD=1 ;;
      -y|--yes)              FLAG_YES=1 ;;
      -h|--help)
        echo "用法: $0 [选项]"
        echo ""
        echo "选项:"
        echo "  --install-system-deps  自动安装缺失的系统包"
        echo "  --skip-pytorch         跳过 PyTorch 安装"
        echo "  --rebuild              强制重新编译原生工具"
        echo "  -y, --yes              非交互模式"
        echo "  -h, --help             显示此帮助"
        echo ""
        echo "环境变量:"
        echo "  GPU_TOOLS_DIR  安装目录 (默认: /opt/gpu-test-tools)"
        echo "  MAKE_JOBS      编译并行数 (默认: nproc)"
        echo "  CUDA_HOME      CUDA 安装路径 (默认: /usr/local/cuda)"
        echo "  MPI_HOME       MPI 安装前缀 (默认: /usr)"
        exit 0
        ;;
      *)
        warn "未知参数: $1"
        ;;
    esac
    shift
  done
}

# ─── Stage 0: permission check ───────────────────────────────────────────────

# Make sure INSTALL_DIR exists and is writable; die otherwise.
# Creating the directory first (instead of probing the parent) also covers
# missing intermediate directories and an existing-but-read-only target.
check_permissions() {
  if ! mkdir -p -- "$INSTALL_DIR" 2>/dev/null || [[ ! -w "$INSTALL_DIR" ]]; then
    die "无法写入 $INSTALL_DIR(请使用 sudo 或设置 GPU_TOOLS_DIR 到可写路径)"
  fi
}

# ─── Stage 1: environment validation ─────────────────────────────────────────

# Minimum driver major version per GPU model (matched against the GPU name).
declare -A MIN_DRIVERS=(
  ["A100"]="470"
  ["A800"]="470"
  ["H100"]="535"
  ["H200"]="535"
  ["B200"]="550"
  ["B300"]="550"
)

# Return 0 when nvidia-smi is on PATH.
check_nvidia_smi() {
  if ! command -v nvidia-smi &>/dev/null; then
    fail "nvidia-smi 未找到"
    echo " → 请先安装 NVIDIA 驱动"
    return 1
  fi
  ok "nvidia-smi 可用"
  return 0
}

# Fill DETECTED_GPU / DETECTED_DRIVER from the first GPU reported by nvidia-smi
# and warn when the driver is older than the MIN_DRIVERS entry for that model.
# Returns 1 only when nvidia-smi yields no data.
detect_gpu_and_driver() {
  local smi_out
  smi_out=$(nvidia-smi --query-gpu=name,driver_version --format=csv,noheader 2>/dev/null | head -1)
  if [[ -z "$smi_out" ]]; then
    warn "无法查询 GPU 信息"
    return 1
  fi

  # xargs trims the whitespace around each CSV field.
  DETECTED_GPU=$(printf '%s\n' "$smi_out" | cut -d',' -f1 | xargs)
  DETECTED_DRIVER=$(printf '%s\n' "$smi_out" | cut -d',' -f2 | xargs)

  # Find the table entry whose key occurs in the GPU name (case-insensitive;
  # the table keys are upper-case).
  local key gpu_key=""
  for key in "${!MIN_DRIVERS[@]}"; do
    if [[ "${DETECTED_GPU^^}" == *"$key"* ]]; then
      gpu_key="$key"
      break
    fi
  done

  if [[ -n "$gpu_key" ]]; then
    local min_drv="${MIN_DRIVERS[$gpu_key]}"
    local drv_major="${DETECTED_DRIVER%%.*}"
    # Guard the numeric comparison against an unparsable driver string.
    if [[ "$drv_major" =~ ^[0-9]+$ ]] && (( 10#$drv_major < min_drv )); then
      warn "驱动 $DETECTED_DRIVER < 最低要求 $min_drv($gpu_key 需要)"
    else
      ok "GPU: $DETECTED_GPU | 驱动: $DETECTED_DRIVER (>= $min_drv)"
    fi
  else
    ok "GPU: $DETECTED_GPU | 驱动: $DETECTED_DRIVER"
  fi
  return 0
}

# Set CUDA_VERSION (and CUDA_TAG via _map_cuda_tag) from, in order:
# nvcc, nvidia-smi, /usr/local/cuda/version.txt. Returns 1 when all fail.
detect_cuda_version() {
  # Method 1: nvcc (most reliable — proves the toolkit is really installed)
  if command -v nvcc &>/dev/null; then
    CUDA_VERSION=$(nvcc --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+')
    if [[ -n "$CUDA_VERSION" ]]; then
      ok "CUDA: $CUDA_VERSION (via nvcc)"
      _map_cuda_tag
      return 0
    fi
  fi

  # Method 2: nvidia-smi (highest CUDA version the driver supports, not the toolkit)
  local smi_cuda
  smi_cuda=$(nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+')
  if [[ -n "$smi_cuda" ]]; then
    CUDA_VERSION="$smi_cuda"
    warn "CUDA: $CUDA_VERSION (via nvidia-smi — 仅代表驱动能力,非已安装 toolkit)"
    warn " → 若编译失败,请安装 CUDA Toolkit: apt install cuda-toolkit-${CUDA_VERSION/./-}"
    _map_cuda_tag
    return 0
  fi

  # Method 3: /usr/local/cuda
  if [[ -f /usr/local/cuda/version.txt ]]; then
    CUDA_VERSION=$(grep -oP '[0-9]+\.[0-9]+' /usr/local/cuda/version.txt | head -1)
    if [[ -n "$CUDA_VERSION" ]]; then
      ok "CUDA: $CUDA_VERSION (via /usr/local/cuda/version.txt)"
      _map_cuda_tag
      return 0
    fi
  fi

  fail "无法检测 CUDA 版本"
  echo " → 请安装 CUDA Toolkit: https://developer.nvidia.com/cuda-downloads"
  return 1
}

# Derive CUDA_TAG (the PyTorch wheel index suffix) from CUDA_VERSION:
# 11.x → cu118, 12.0-12.1 → cu121, 12.2-12.4 → cu124, anything else → cu128.
_map_cuda_tag() {
  local major minor
  major="${CUDA_VERSION%%.*}"
  minor="${CUDA_VERSION#*.}"
  minor="${minor%%.*}"

  if [[ "$major" -eq 11 ]]; then
    CUDA_TAG="cu118"
  elif [[ "$major" -eq 12 ]]; then
    if [[ "$minor" -le 1 ]]; then
      CUDA_TAG="cu121"
    elif [[ "$minor" -le 4 ]]; then
      CUDA_TAG="cu124"
    else
      CUDA_TAG="cu128"
    fi
  else
    CUDA_TAG="cu128"
    warn "未知 CUDA $CUDA_VERSION,默认使用 cu128 索引"
  fi
  log "PyTorch wheel 索引: $CUDA_TAG"
}

# Pick the first interpreter among python3.12/3.11/3.10/python3 that reports
# version >= 3.10 and store its absolute path in PYTHON_BIN.
check_python() {
  local cmd ver py_cmd=""
  for cmd in python3.12 python3.11 python3.10 python3; do
    command -v "$cmd" &>/dev/null || continue
    ver=$("$cmd" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null)
    if _python_version_ok "$ver"; then
      py_cmd="$cmd"
      break
    fi
  done

  if [[ -z "$py_cmd" ]]; then
    fail "Python >= 3.10 未找到"
    echo " → apt install python3.11 python3.11-venv"
    return 1
  fi

  PYTHON_BIN="$(command -v "$py_cmd")"
  ok "Python: $("$py_cmd" --version 2>&1) ($PYTHON_BIN)"
  return 0
}

# Require cmake >= 3.18 (needed to build nvbandwidth).
check_cmake() {
  if ! command -v cmake &>/dev/null; then
    fail "cmake 未找到(编译 nvbandwidth 需要 >= 3.18)"
    echo " → apt install cmake"
    return 1
  fi

  local cmake_ver cmake_major cmake_minor
  # Keep only the first major.minor pair from the first output line.
  cmake_ver=$(cmake --version | head -1 | grep -oP '[0-9]+\.[0-9]+' | head -1)
  cmake_major="${cmake_ver%%.*}"
  cmake_minor="${cmake_ver#*.}"

  if [[ "$cmake_major" -lt 3 ]] \
    || { [[ "$cmake_major" -eq 3 ]] && [[ "$cmake_minor" -lt 18 ]]; }; then
    fail "cmake $cmake_ver < 3.18(nvbandwidth 需要 >= 3.18)"
    echo " → 升级 cmake: pip install cmake 或从源码安装"
    return 1
  fi
  ok "cmake: $cmake_ver"
  return 0
}

# Require both gcc and g++ on PATH.
check_compiler() {
  if ! command -v gcc &>/dev/null || ! command -v g++ &>/dev/null; then
    fail "gcc/g++ 未找到"
    echo " → apt install build-essential"
    return 1
  fi
  local gcc_ver
  gcc_ver=$(gcc -dumpversion 2>/dev/null)
  ok "gcc/g++: $gcc_ver"
  return 0
}

# Set HAS_MPI according to whether mpirun or mpiexec is available.
# Never fails: MPI is optional.
check_mpi() {
  local launcher=""
  if command -v mpirun &>/dev/null; then
    launcher="mpirun"
  elif command -v mpiexec &>/dev/null; then
    launcher="mpiexec"
  fi

  if [[ -n "$launcher" ]]; then
    HAS_MPI=1
    # Query whichever launcher was actually found.
    ok "MPI: $("$launcher" --version 2>&1 | head -1)"
  else
    HAS_MPI=0
    warn "mpirun 未找到(nccl-tests 将不使用 MPI 模式)"
    echo " → apt install openmpi-bin libopenmpi-dev"
  fi
  return 0
}

# Set HAS_NCCL_DEV when NCCL is visible via ldconfig, /usr/include/nccl.h or
# an installed libnccl-dev package. Never fails: NCCL is optional.
check_nccl_dev() {
  # Capture the cache listing instead of piping into `grep -q`: under pipefail
  # an early grep exit can SIGPIPE ldconfig and turn a match into a failure.
  local ld_cache
  ld_cache="$(ldconfig -p 2>/dev/null)" || ld_cache=""
  if [[ "$ld_cache" == *libnccl* ]]; then
    HAS_NCCL_DEV=1
    ok "libnccl: 已找到 (via ldconfig)"
    return 0
  fi

  # dpkg-query reports the real install state; a removed package whose config
  # files remain is not counted as installed.
  local pkg_status
  pkg_status="$(dpkg-query -W -f='${Status}' libnccl-dev 2>/dev/null)" || pkg_status=""
  if [[ -f /usr/include/nccl.h || "$pkg_status" == *"install ok installed"* ]]; then
    HAS_NCCL_DEV=1
    ok "libnccl-dev: 已安装"
    return 0
  fi

  HAS_NCCL_DEV=0
  warn "libnccl-dev 未找到(将跳过 nccl-tests 编译)"
  echo " → apt install libnccl-dev libnccl2"
  return 0
}

# Install build and runtime packages with apt-get, dnf or yum (first found).
# Package-manager failures only warn; the caller re-validates afterwards.
install_system_deps() {
  log "安装系统依赖包..."
  if command -v apt-get &>/dev/null; then
    apt-get update -qq
    apt-get install -y -qq build-essential git cmake wget curl \
      openmpi-bin libopenmpi-dev openssh-client \
      infiniband-diags ibverbs-utils perftest \
      python3 python3-pip python3-venv \
      libnccl-dev libnccl2 \
      2>/dev/null || warn "部分包安装失败(可能已安装)"
  elif command -v dnf &>/dev/null; then
    dnf groupinstall -y "Development Tools" 2>/dev/null || true
    dnf install -y git cmake wget curl \
      openmpi openmpi-devel openssh-clients \
      infiniband-diags libibverbs-utils perftest \
      python3 python3-pip \
      2>/dev/null || warn "部分包安装失败"
  elif command -v yum &>/dev/null; then
    yum groupinstall -y "Development Tools" 2>/dev/null || true
    yum install -y git cmake wget curl \
      openmpi openmpi-devel openssh-clients \
      infiniband-diags libibverbs-utils perftest \
      python3 python3-pip \
      2>/dev/null || warn "部分包安装失败"
  else
    warn "未识别的包管理器,请手动安装依赖"
  fi
  ok "系统依赖安装完成"
}

# Run every environment check once. The exit status is the number of
# REQUIRED components that are missing (0-5); optional checks never count.
_run_environment_checks() {
  local missing=0
  check_nvidia_smi      || missing=$((missing + 1))
  detect_gpu_and_driver || true
  detect_cuda_version   || missing=$((missing + 1))
  check_python          || missing=$((missing + 1))
  check_cmake           || missing=$((missing + 1))
  check_compiler        || missing=$((missing + 1))
  check_mpi             || true
  check_nccl_dev        || true
  return "$missing"
}

# Stage 1 driver: validate the environment, optionally install system packages
# and validate again. Dies when required components are still missing.
validate_environment() {
  banner "阶段 1/6: 环境校验"

  local errors=0
  _run_environment_checks || errors=$?
  echo ""

  if [[ $errors -gt 0 ]]; then
    fail "环境校验发现 $errors 个必要组件缺失"

    if [[ $FLAG_INSTALL_SYS_DEPS -eq 1 ]]; then
      log "检测到 --install-system-deps,尝试安装..."
      install_system_deps

      # Re-validate everything, including GPU/CUDA detection: a failure there
      # must not be forgotten just because packages were installed.
      errors=0
      _run_environment_checks || errors=$?
      if [[ $errors -gt 0 ]]; then
        die "安装系统包后仍有 $errors 个组件缺失,请手动解决"
      fi
    else
      echo ""
      echo " 提示: 加 --install-system-deps 参数可自动安装缺失的系统包"
      echo " 或手动运行上面提示的 apt install 命令后重试"
      die "环境校验未通过"
    fi
  fi
  ok "环境校验通过"
}

# ─── Stage 2: install uv ─────────────────────────────────────────────────────

# Make `uv` callable: use the one on PATH, one of the usual install locations,
# or download it with the upstream installer. Dies when none of that works.
ensure_uv() {
  banner "阶段 2/6: 确保 uv 可用"

  # Already on PATH
  if command -v uv &>/dev/null; then
    ok "uv 已安装: $(uv --version 2>&1)"
    return 0
  fi

  # Common install locations
  local p
  for p in "$HOME/.local/bin/uv" "$HOME/.cargo/bin/uv" /usr/local/bin/uv; do
    if [[ -x "$p" ]]; then
      PATH="$(dirname "$p"):$PATH"
      export PATH
      ok "uv 已找到: $p"
      return 0
    fi
  done

  log "正在安装 uv..."
  if ! curl -LsSf https://astral.sh/uv/install.sh | sh 2>/dev/null; then
    die "uv 安装失败。请手动安装: https://docs.astral.sh/uv/getting-started/installation/"
  fi

  # Put the installer's target directories on PATH
  export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
  if ! command -v uv &>/dev/null; then
    die "uv 安装后仍无法找到。请检查 PATH。"
  fi
  ok "uv 安装成功: $(uv --version 2>&1)"
}

# ─── Stage 3: Python virtual environment ─────────────────────────────────────

# Create (or reuse) $INSTALL_DIR/.venv, install the project in editable mode
# and, unless --skip-pytorch was given, install PyTorch for CUDA_TAG.
setup_python_venv() {
  banner "阶段 3/6: 创建 Python 虚拟环境"

  local venv_dir="$INSTALL_DIR/.venv"
  local venv_py="$venv_dir/bin/python"

  # Reuse an existing venv only when its interpreter is new enough.
  if [[ -x "$venv_py" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    local existing_ver
    existing_ver=$("$venv_py" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "0.0")
    if _python_version_ok "$existing_ver"; then
      ok "虚拟环境已存在: $venv_dir (Python $existing_ver)"
    else
      log "已有 venv 的 Python 版本过低 ($existing_ver),重建中..."
      rm -rf -- "${venv_dir:?}"
    fi
  fi

  # Create the venv. Nothing below can work without it, so failure is fatal.
  if [[ ! -x "$venv_py" ]]; then
    log "创建虚拟环境: $venv_dir"
    uv venv "$venv_dir" --python "$PYTHON_BIN" \
      || die "虚拟环境创建失败: $venv_dir"
    ok "虚拟环境创建成功"
  fi

  # Install project dependencies. pipefail makes the pipeline status reflect
  # uv's result, so a failure is reported instead of being claimed as success.
  log "安装 Python 依赖(rich、pyyaml、numpy)..."
  if uv pip install --python "$venv_py" -e "$PROJECT_DIR" 2>&1 | _filter_output 1; then
    ok "项目依赖安装完成"
  else
    warn "项目依赖安装失败(设置 VERBOSE=1 查看详情)"
  fi

  # Install PyTorch
  if [[ $FLAG_SKIP_PYTORCH -eq 1 ]]; then
    warn "跳过 PyTorch 安装(--skip-pytorch)"
    return 0
  fi

  local torch_ver
  if "$venv_py" -c "import torch" &>/dev/null && [[ $FLAG_REBUILD -eq 0 ]]; then
    torch_ver=$("$venv_py" -c "import torch; print(torch.__version__)" 2>/dev/null)
    ok "PyTorch 已安装: $torch_ver"
    return 0
  fi

  local index_url="https://download.pytorch.org/whl/${CUDA_TAG}"
  log "安装 PyTorch (CUDA $CUDA_TAG): $index_url"
  log "(下载较大,请耐心等待...)"
  if ! uv pip install --python "$venv_py" \
    "torch>=2.1.0" --index-url "$index_url" 2>&1 | _filter_output 3; then
    warn "PyTorch 安装失败,可稍后手动安装:"
    echo " source $INSTALL_DIR/env.sh"
    echo " uv pip install torch --index-url $index_url"
  fi

  if "$venv_py" -c "import torch" &>/dev/null; then
    torch_ver=$("$venv_py" -c "import torch; print(torch.__version__)" 2>/dev/null)
    ok "PyTorch 安装成功: $torch_ver"
  fi
}

# ─── Stage 4: build native tools ─────────────────────────────────────────────

# Build nvbandwidth into $INSTALL_DIR/nvbandwidth. Failure only warns.
build_nvbandwidth() {
  local src="$INSTALL_DIR/nvbandwidth"

  # Idempotency: skip when the binary is already there
  if [[ -x "$src/nvbandwidth" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    ok "nvbandwidth: 已编译 ($src/nvbandwidth)"
    return 0
  fi

  log "编译 nvbandwidth..."
  (
    # Subshell: -e and the directory changes stay local to the build.
    set -e
    _sync_repo https://github.com/NVIDIA/nvbandwidth.git "$src"
    mkdir -p "$src/build"
    cd "$src/build"
    cmake .. -DCMAKE_BUILD_TYPE=Release 2>&1 | _filter_output 3
    make -j"$JOBS" 2>&1 | _filter_output 3
    # Copy the binary to the directory that env.sh adds to PATH.
    if [[ -x ./nvbandwidth ]]; then
      cp ./nvbandwidth "$src/nvbandwidth"
    fi
  )

  if [[ -x "$src/nvbandwidth" ]]; then
    ok "nvbandwidth: 编译成功"
  else
    warn "nvbandwidth: 编译失败(非致命,可手动编译)"
  fi
}

# Build nccl-tests into $INSTALL_DIR/nccl-tests/build. Skipped without NCCL or
# a usable CUDA_HOME; built with MPI=1 when check_mpi found a launcher.
build_nccl_tests() {
  local src="$INSTALL_DIR/nccl-tests"

  if [[ -x "$src/build/all_reduce_perf" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    ok "nccl-tests: 已编译 ($src/build/)"
    return 0
  fi

  if [[ $HAS_NCCL_DEV -eq 0 ]]; then
    warn "nccl-tests: 跳过(libnccl-dev 未安装)"
    return 0
  fi

  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  if [[ ! -d "$cuda_home/include" ]]; then
    warn "nccl-tests: 跳过(CUDA_HOME=$cuda_home 无效)"
    return 0
  fi

  # MPI prefix: /usr by default, overridable for distros that install the MPI
  # headers elsewhere.
  local mpi_home="${MPI_HOME:-/usr}"

  log "编译 nccl-tests..."
  (
    set -e
    _sync_repo https://github.com/NVIDIA/nccl-tests.git "$src"
    cd "$src"
    if [[ $HAS_MPI -eq 1 ]]; then
      make MPI=1 MPI_HOME="$mpi_home" CUDA_HOME="$cuda_home" -j"$JOBS" \
        2>&1 | _filter_output 3
    else
      make CUDA_HOME="$cuda_home" -j"$JOBS" \
        2>&1 | _filter_output 3
    fi
  )

  if [[ -x "$src/build/all_reduce_perf" ]]; then
    ok "nccl-tests: 编译成功"
  else
    warn "nccl-tests: 编译失败(非致命)"
  fi
}

# Build gpu-burn into $INSTALL_DIR/gpu-burn. Skipped when CUDA_HOME is missing.
build_gpu_burn() {
  local src="$INSTALL_DIR/gpu-burn"

  if [[ -x "$src/gpu_burn" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
    ok "gpu-burn: 已编译 ($src/gpu_burn)"
    return 0
  fi

  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  if [[ ! -d "$cuda_home" ]]; then
    warn "gpu-burn: 跳过(CUDA_HOME=$cuda_home 不存在)"
    return 0
  fi

  log "编译 gpu-burn..."
  (
    set -e
    _sync_repo https://github.com/wilicc/gpu-burn.git "$src"
    cd "$src"
    make CUDA_PATH="$cuda_home" -j"$JOBS" 2>&1 | _filter_output 3
  )

  if [[ -x "$src/gpu_burn" ]]; then
    ok "gpu-burn: 编译成功"
  else
    warn "gpu-burn: 编译失败(非致命)"
  fi
}

# Stage 4 driver: build all three tools; each one handles its own failure.
build_native_tools() {
  banner "阶段 4/6: 编译原生工具"
  build_nvbandwidth
  build_nccl_tests
  build_gpu_burn
}

# ─── Stage 5: generate the activation scripts ────────────────────────────────

# Write $INSTALL_DIR/env.sh (to be sourced) and $INSTALL_DIR/run-gpu-tests
# (wrapper that sources env.sh and runs gpu_tester.py from PROJECT_DIR).
generate_env_sh() {
  banner "阶段 5/6: 生成环境脚本"

  local env_file="$INSTALL_DIR/env.sh"
  # The install path is emitted with %q so that any character in it (spaces,
  # '|', '&', quotes, ...) survives being read back by the shell.
  {
    cat <<'ENVHEAD'
#!/usr/bin/env bash
# GPU Test Suite 环境激活脚本
# 用法: source /opt/gpu-test-tools/env.sh

ENVHEAD
    printf 'export GPU_TOOLS_DIR=%q\n' "$INSTALL_DIR"
    cat <<'ENVEOF'
export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"

# 激活 Python 虚拟环境
if [[ -f "$GPU_TOOLS_DIR/.venv/bin/activate" ]]; then
    source "$GPU_TOOLS_DIR/.venv/bin/activate"
fi

# 编译工具加入 PATH
export PATH="$GPU_TOOLS_DIR/nvbandwidth:$PATH"
export PATH="$GPU_TOOLS_DIR/nccl-tests/build:$PATH"
export PATH="$GPU_TOOLS_DIR/gpu-burn:$PATH"
export PATH="$CUDA_HOME/bin:$PATH"

# 库路径
export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
ENVEOF
  } > "$env_file" || die "无法写入 $env_file"
  chmod +x "$env_file"
  ok "env.sh 已生成: $env_file"

  # Generate the run-gpu-tests runner
  local wrapper="$INSTALL_DIR/run-gpu-tests"
  {
    cat <<'WRAPEOF'
#!/usr/bin/env bash
# GPU Test Suite 一键运行器
# 用法: /opt/gpu-test-tools/run-gpu-tests --test all
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/env.sh"
WRAPEOF
    # The format string is single-quoted, so "$@" is written literally.
    printf 'exec python3 %q "$@"\n' "$PROJECT_DIR/gpu_tester.py"
  } > "$wrapper" || die "无法写入 $wrapper"
  chmod +x "$wrapper"
  ok "run-gpu-tests 已生成: $wrapper"
}

# ─── Stage 6: print the summary ──────────────────────────────────────────────

# Report which components ended up installed and how to use them.
print_summary() {
  banner "阶段 6/6: 安装总结"

  local venv_py="$INSTALL_DIR/.venv/bin/python"

  echo -e "${BOLD}安装目录:${NC} $INSTALL_DIR"
  echo ""
  echo -e "${BOLD}组件状态:${NC}"

  # Python virtual environment
  if [[ -x "$venv_py" ]]; then
    local py_ver
    py_ver=$("$venv_py" --version 2>&1)
    echo -e " ${GREEN}✓${NC} Python venv: $py_ver"
  else
    echo -e " ${RED}✗${NC} Python venv: 未创建"
  fi

  # PyTorch
  if "$venv_py" -c "import torch" &>/dev/null; then
    local tv
    tv=$("$venv_py" -c "import torch; print(f'{torch.__version__} (CUDA {torch.version.cuda})')" 2>/dev/null)
    echo -e " ${GREEN}✓${NC} PyTorch: $tv"
  else
    echo -e " ${YELLOW}○${NC} PyTorch: 未安装"
  fi

  # Native tools, encoded as "path:label". The label never contains ':', so
  # splitting at the LAST colon keeps install paths with colons intact.
  local tool_info path name
  for tool_info in \
    "$INSTALL_DIR/nvbandwidth/nvbandwidth:nvbandwidth" \
    "$INSTALL_DIR/nccl-tests/build/all_reduce_perf:nccl-tests" \
    "$INSTALL_DIR/gpu-burn/gpu_burn:gpu-burn"; do
    path="${tool_info%:*}"
    name="${tool_info##*:}"
    if [[ -x "$path" ]]; then
      echo -e " ${GREEN}✓${NC} $name"
    else
      echo -e " ${YELLOW}○${NC} $name (未编译)"
    fi
  done

  # RDMA tools (system-wide)
  local tool rdma_found=0
  for tool in ib_write_bw ib_read_bw ibstat; do
    if command -v "$tool" &>/dev/null; then
      rdma_found=$((rdma_found + 1))
    fi
  done
  if [[ $rdma_found -gt 0 ]]; then
    echo -e " ${GREEN}✓${NC} RDMA 工具: $rdma_found/3 可用"
  else
    echo -e " ${YELLOW}○${NC} RDMA 工具: 未安装 (apt install perftest infiniband-diags)"
  fi

  echo ""
  echo -e "${BOLD}使用方法:${NC}"
  echo ""
  echo " # 方式一: source 激活后使用"
  echo " source $INSTALL_DIR/env.sh"
  echo " python3 $PROJECT_DIR/gpu_tester.py --test all"
  echo ""
  echo " # 方式二: 一键运行"
  echo " $INSTALL_DIR/run-gpu-tests --test all"
  echo " $INSTALL_DIR/run-gpu-tests --test health"
  echo " $INSTALL_DIR/run-gpu-tests # 交互式菜单"
  echo ""
}

# ─── Main ────────────────────────────────────────────────────────────────────

# Entry point: parse flags, then run stages 0-6 in order.
main() {
  parse_args "$@"

  echo ""
  echo -e "${BOLD}${CYAN}╔══════════════════════════════════════════════════╗${NC}"
  echo -e "${BOLD}${CYAN}║            GPU Test Suite — 一键安装             ║${NC}"
  echo -e "${BOLD}${CYAN}║          环境隔离 · 自动检测 · 完整部署          ║${NC}"
  echo -e "${BOLD}${CYAN}╚══════════════════════════════════════════════════╝${NC}"
  echo ""
  log "安装目录: $INSTALL_DIR"
  log "项目目录: $PROJECT_DIR"
  echo ""

  check_permissions
  validate_environment
  ensure_uv
  setup_python_venv
  build_native_tools
  generate_env_sh
  print_summary

  echo -e "${GREEN}${BOLD}安装完成!${NC}"
}

main "$@"