#!/usr/bin/env bash
# =============================================================================
# GPU Test Suite — one-shot installer (isolated-environment edition)
# Supported GPUs: A100 / A800 / H100 / H200 / B200 / B300
#
# What it does:
#   1. Validates the environment (GPU, CUDA, Python, compilers, ...)
#   2. Installs uv if needed and creates an isolated Python virtual environment
#   3. Detects the CUDA version and installs the matching PyTorch wheel
#   4. Builds nvbandwidth / nccl-tests / gpu-burn
#   5. Generates the env.sh activation script and the run-gpu-tests launcher
#
# Usage:
#   sudo bash install_deps.sh                        # standard install
#   sudo bash install_deps.sh --install-system-deps  # also install system packages
#   sudo bash install_deps.sh --skip-pytorch         # skip PyTorch
#   sudo bash install_deps.sh --rebuild              # force a rebuild
#   sudo bash install_deps.sh -y                     # non-interactive mode
#
# Environment: GPU_TOOLS_DIR, MAKE_JOBS, CUDA_HOME, VERBOSE (see --help).
#
# Companion change shipped with this revision: requirements.txt lists
#   rich>=13.0, pyyaml>=6.0, numpy>=1.24
# =============================================================================

# This revision runs without `-e`: probes are allowed to fail and every stage
# checks the results it cares about explicitly (builds use `set -e` locally).
set -uo pipefail

# ─── Globals ─────────────────────────────────────────────────────────────────
INSTALL_DIR="${GPU_TOOLS_DIR:-/opt/gpu-test-tools}"
PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Fall back to a single job when nproc is unavailable, so `make -j` never
# receives an empty (= unlimited) job count.
JOBS="${MAKE_JOBS:-$(nproc 2>/dev/null || echo 1)}"
VERBOSE="${VERBOSE:-0}"

# Command-line flags
FLAG_INSTALL_SYS_DEPS=0
FLAG_SKIP_PYTORCH=0
FLAG_REBUILD=0
FLAG_YES=0

# Detection results (written by the check_* / detect_* functions)
DETECTED_GPU=""
DETECTED_DRIVER=""
CUDA_VERSION=""
CUDA_TAG=""
PYTHON_BIN=""
HAS_MPI=0
HAS_NCCL_DEV=0

# ─── Colours and logging ─────────────────────────────────────────────────────
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# Only the colour codes go through %b; the message is printed with %s so that
# backslashes inside paths or tool output are never interpreted as escapes.
log()    { printf '%b[INFO]%b %s\n' "$CYAN" "$NC" "$*"; }
ok()     { printf '%b[ OK ]%b %s\n' "$GREEN" "$NC" "$*"; }
warn()   { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }
fail()   { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*"; }
die()    { printf '%b[FATAL]%b %s\n' "$RED" "$NC" "$*"; exit 1; }
banner() { printf '\n%b══════ %s ══════%b\n\n' "${BOLD}${CYAN}" "$*" "$NC"; }

# Error trap.
# NOTE(review): errtrace (`set -E`) is not enabled, so this trap is not
# inherited by functions and only covers top-level commands.
trap 'fail "脚本在第 $LINENO 行出错。设置 VERBOSE=1 查看详情。"' ERR

# ─── Small helpers ───────────────────────────────────────────────────────────

# Pass stdin through unchanged when VERBOSE=1, otherwise keep the last $1 lines.
_filter_output() {
    local keep="${1:-3}"
    if [[ "$VERBOSE" == "1" ]]; then
        cat
    else
        tail -n "$keep"
    fi
}

# Print $1 with leading and trailing whitespace removed.
_trim() {
    local s="$1"
    s="${s#"${s%%[![:space:]]*}"}"
    s="${s%"${s##*[![:space:]]}"}"
    printf '%s' "$s"
}

# Succeed when version string $1 ("MAJOR.MINOR[...]") is >= $2.$3.
# An empty or unparsable version fails the check.
_version_at_least() {
    local ver="$1" min_major="$2" min_minor="$3"
    [[ "$ver" =~ ^([0-9]+)\.([0-9]+) ]] || return 1
    local major="${BASH_REMATCH[1]}" minor="${BASH_REMATCH[2]}"
    # 10# forces base 10 so components with leading zeros are not read as octal.
    (( 10#$major > min_major || (10#$major == min_major && 10#$minor >= min_minor) ))
}

# ─── Argument parsing ────────────────────────────────────────────────────────
# Sets the FLAG_* globals. Unknown arguments are reported and ignored.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --install-system-deps) FLAG_INSTALL_SYS_DEPS=1 ;;
            --skip-pytorch)        FLAG_SKIP_PYTORCH=1 ;;
            --rebuild)             FLAG_REBUILD=1 ;;
            -y|--yes)              FLAG_YES=1 ;;
            -h|--help)
                echo "用法: $0 [选项]"
                echo ""
                echo "选项:"
                echo "  --install-system-deps  自动安装缺失的系统包"
                echo "  --skip-pytorch         跳过 PyTorch 安装"
                echo "  --rebuild              强制重新编译原生工具"
                echo "  -y, --yes              非交互模式"
                echo "  -h, --help             显示此帮助"
                echo ""
                echo "环境变量:"
                echo "  GPU_TOOLS_DIR   安装目录 (默认: /opt/gpu-test-tools)"
                echo "  MAKE_JOBS       编译并行数 (默认: nproc)"
                echo "  CUDA_HOME       CUDA 安装路径 (默认: /usr/local/cuda)"
                exit 0
                ;;
            *) warn "未知参数: $1" ;;
        esac
        shift
    done
}

# ─── Stage 0: permissions ────────────────────────────────────────────────────
# Create INSTALL_DIR (including missing parents) and abort unless it is
# writable. Attempting the mkdir first also covers the case where the parent
# directory does not exist yet.
check_permissions() {
    if ! mkdir -p -- "$INSTALL_DIR" 2>/dev/null || [[ ! -w "$INSTALL_DIR" ]]; then
        die "无法写入 $INSTALL_DIR(请使用 sudo 或设置 GPU_TOOLS_DIR 到可写路径)"
    fi
}

# ─── Stage 1: environment validation ─────────────────────────────────────────

# Minimum driver major version per GPU family.
declare -A MIN_DRIVERS=(
    ["A100"]="470" ["A800"]="470"
    ["H100"]="535" ["H200"]="535"
    ["B200"]="550" ["B300"]="550"
)

# Returns 0 when nvidia-smi is on PATH, 1 otherwise.
check_nvidia_smi() {
    if ! command -v nvidia-smi &>/dev/null; then
        fail "nvidia-smi 未找到"
        echo "  → 请先安装 NVIDIA 驱动"
        return 1
    fi
    ok "nvidia-smi 可用"
    return 0
}

# Fills DETECTED_GPU / DETECTED_DRIVER from the first GPU reported by
# nvidia-smi and compares the driver major version against MIN_DRIVERS.
# Returns 1 only when nvidia-smi yields no data.
detect_gpu_and_driver() {
    local smi_out
    smi_out=$(nvidia-smi --query-gpu=name,driver_version --format=csv,noheader 2>/dev/null | head -1)
    if [[ -z "$smi_out" ]]; then
        warn "无法查询 GPU 信息"
        return 1
    fi

    # CSV row: "<name>, <driver_version>"
    local rest="${smi_out#*,}"
    DETECTED_GPU="$(_trim "${smi_out%%,*}")"
    DETECTED_DRIVER="$(_trim "${rest%%,*}")"

    # Case-insensitive substring match of the GPU name against the table keys.
    local key gpu_key=""
    for key in "${!MIN_DRIVERS[@]}"; do
        if [[ "${DETECTED_GPU^^}" == *"$key"* ]]; then
            gpu_key="$key"
            break
        fi
    done

    local drv_major="${DETECTED_DRIVER%%.*}"
    if [[ -n "$gpu_key" && "$drv_major" =~ ^[0-9]+$ ]]; then
        local min_drv="${MIN_DRIVERS[$gpu_key]}"
        if (( 10#$drv_major < min_drv )); then
            warn "驱动 $DETECTED_DRIVER < 最低要求 $min_drv($gpu_key 需要)"
        else
            ok "GPU: $DETECTED_GPU | 驱动: $DETECTED_DRIVER (>= $min_drv)"
        fi
    else
        # Unknown GPU family, or a driver string that is not numeric.
        ok "GPU: $DETECTED_GPU | 驱动: $DETECTED_DRIVER"
    fi
    return 0
}

# Sets CUDA_VERSION (and CUDA_TAG via _map_cuda_tag). Probe order:
# nvcc, nvidia-smi, /usr/local/cuda/version.txt. Returns 1 when all fail.
detect_cuda_version() {
    # Method 1: nvcc (most reliable — proves the toolkit is installed).
    # sudo commonly resets PATH, so also probe $CUDA_HOME/bin directly.
    local nvcc_bin=""
    if command -v nvcc &>/dev/null; then
        nvcc_bin="nvcc"
    elif [[ -x "${CUDA_HOME:-/usr/local/cuda}/bin/nvcc" ]]; then
        nvcc_bin="${CUDA_HOME:-/usr/local/cuda}/bin/nvcc"
    fi
    if [[ -n "$nvcc_bin" ]]; then
        CUDA_VERSION=$("$nvcc_bin" --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+' | head -1)
        if [[ -n "$CUDA_VERSION" ]]; then
            ok "CUDA: $CUDA_VERSION (via nvcc)"
            _map_cuda_tag
            return 0
        fi
    fi

    # Method 2: nvidia-smi (highest CUDA version the driver supports, not the toolkit).
    local smi_cuda
    smi_cuda=$(nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+' | head -1)
    if [[ -n "$smi_cuda" ]]; then
        CUDA_VERSION="$smi_cuda"
        warn "CUDA: $CUDA_VERSION (via nvidia-smi — 仅代表驱动能力,非已安装 toolkit)"
        warn "  → 若编译失败,请安装 CUDA Toolkit: apt install cuda-toolkit-${CUDA_VERSION/./-}"
        _map_cuda_tag
        return 0
    fi

    # Method 3: /usr/local/cuda/version.txt
    if [[ -f /usr/local/cuda/version.txt ]]; then
        CUDA_VERSION=$(grep -oP '[0-9]+\.[0-9]+' /usr/local/cuda/version.txt | head -1)
        if [[ -n "$CUDA_VERSION" ]]; then
            ok "CUDA: $CUDA_VERSION (via /usr/local/cuda/version.txt)"
            _map_cuda_tag
            return 0
        fi
    fi

    fail "无法检测 CUDA 版本"
    echo "  → 请安装 CUDA Toolkit: https://developer.nvidia.com/cuda-downloads"
    return 1
}

# Maps CUDA_VERSION to the PyTorch wheel index tag stored in CUDA_TAG:
# 11.x → cu118, 12.0–12.1 → cu121, 12.2–12.4 → cu124, anything else → cu128.
_map_cuda_tag() {
    local major minor
    major="${CUDA_VERSION%%.*}"
    minor="${CUDA_VERSION#*.}"
    minor="${minor%%.*}"

    if [[ "$major" -eq 11 ]]; then
        CUDA_TAG="cu118"
    elif [[ "$major" -eq 12 ]]; then
        if [[ "$minor" -le 1 ]]; then
            CUDA_TAG="cu121"
        elif [[ "$minor" -le 4 ]]; then
            CUDA_TAG="cu124"
        else
            CUDA_TAG="cu128"
        fi
    else
        CUDA_TAG="cu128"
        warn "未知 CUDA $CUDA_VERSION,默认使用 cu128 索引"
    fi
    log "PyTorch wheel 索引: $CUDA_TAG"
}

# Picks the first interpreter >= 3.10 among python3.12/3.11/3.10/python3 and
# stores its absolute path in PYTHON_BIN. Returns 1 when none qualifies.
check_python() {
    local cmd ver py_cmd=""
    for cmd in python3.12 python3.11 python3.10 python3; do
        command -v "$cmd" &>/dev/null || continue
        ver=$("$cmd" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null)
        if _version_at_least "$ver" 3 10; then
            py_cmd="$cmd"
            break
        fi
    done

    if [[ -z "$py_cmd" ]]; then
        fail "Python >= 3.10 未找到"
        echo "  → apt install python3.11 python3.11-venv"
        return 1
    fi

    PYTHON_BIN="$(command -v "$py_cmd")"
    ok "Python: $("$py_cmd" --version 2>&1) ($PYTHON_BIN)"
    return 0
}

# Requires cmake >= 3.18 (needed to build nvbandwidth).
check_cmake() {
    if ! command -v cmake &>/dev/null; then
        fail "cmake 未找到(编译 nvbandwidth 需要 >= 3.18)"
        echo "  → apt install cmake"
        return 1
    fi
    local cmake_ver
    cmake_ver=$(cmake --version | head -1 | grep -oP '[0-9]+\.[0-9]+' | head -1)
    if ! _version_at_least "$cmake_ver" 3 18; then
        fail "cmake $cmake_ver < 3.18(nvbandwidth 需要 >= 3.18)"
        echo "  → 升级 cmake: pip install cmake 或从源码安装"
        return 1
    fi
    ok "cmake: $cmake_ver"
    return 0
}

# Requires both gcc and g++ on PATH.
check_compiler() {
    if ! command -v gcc &>/dev/null || ! command -v g++ &>/dev/null; then
        fail "gcc/g++ 未找到"
        echo "  → apt install build-essential"
        return 1
    fi
    local gcc_ver
    gcc_ver=$(gcc -dumpversion 2>/dev/null)
    ok "gcc/g++: $gcc_ver"
    return 0
}

# Sets HAS_MPI. Optional component: always returns 0.
check_mpi() {
    # Query whichever launcher is actually present; only mpiexec may exist.
    local launcher=""
    if command -v mpirun &>/dev/null; then
        launcher="mpirun"
    elif command -v mpiexec &>/dev/null; then
        launcher="mpiexec"
    fi

    if [[ -n "$launcher" ]]; then
        HAS_MPI=1
        ok "MPI: $("$launcher" --version 2>&1 | head -1)"
    else
        HAS_MPI=0
        warn "mpirun 未找到(nccl-tests 将不使用 MPI 模式)"
        echo "  → apt install openmpi-bin libopenmpi-dev"
    fi
    return 0
}

# Sets HAS_NCCL_DEV. Optional component: always returns 0.
check_nccl_dev() {
    if ldconfig -p 2>/dev/null | grep -q libnccl; then
        HAS_NCCL_DEV=1
        ok "libnccl: 已找到 (via ldconfig)"
        return 0
    fi
    if [[ -f /usr/include/nccl.h ]] || dpkg -l libnccl-dev &>/dev/null; then
        HAS_NCCL_DEV=1
        ok "libnccl-dev: 已安装"
        return 0
    fi
    HAS_NCCL_DEV=0
    warn "libnccl-dev 未找到(将跳过 nccl-tests 编译)"
    echo "  → apt install libnccl-dev libnccl2"
    return 0
}

# Best-effort installation of build/runtime packages via apt-get, dnf or yum.
# Failures are reported as warnings; the caller re-validates afterwards.
install_system_deps() {
    log "安装系统依赖包..."

    if command -v apt-get &>/dev/null; then
        local -a apt=(apt-get)
        if [[ $FLAG_YES -eq 1 ]]; then
            # -y / --yes: make sure no package prompts for input.
            apt=(env DEBIAN_FRONTEND=noninteractive apt-get)
        fi
        "${apt[@]}" update -qq
        "${apt[@]}" install -y -qq build-essential git cmake wget curl \
            openmpi-bin libopenmpi-dev openssh-client \
            infiniband-diags ibverbs-utils perftest \
            python3 python3-pip python3-venv \
            2>/dev/null || warn "部分包安装失败(可能已安装)"
        # NCCL packages come from NVIDIA's repository. apt-get aborts the whole
        # transaction when any named package is unavailable, so they must not
        # share a transaction with the mandatory packages above.
        "${apt[@]}" install -y -qq libnccl-dev libnccl2 2>/dev/null \
            || warn "libnccl-dev/libnccl2 安装失败(需要 NVIDIA 软件源),nccl-tests 可能被跳过"
    elif command -v dnf &>/dev/null; then
        dnf groupinstall -y "Development Tools" 2>/dev/null || true
        dnf install -y git cmake wget curl \
            openmpi openmpi-devel openssh-clients \
            infiniband-diags libibverbs-utils perftest \
            python3 python3-pip \
            2>/dev/null || warn "部分包安装失败"
    elif command -v yum &>/dev/null; then
        yum groupinstall -y "Development Tools" 2>/dev/null || true
        yum install -y git cmake wget curl \
            openmpi openmpi-devel openssh-clients \
            infiniband-diags libibverbs-utils perftest \
            python3 python3-pip \
            2>/dev/null || warn "部分包安装失败"
    else
        warn "未识别的包管理器,请手动安装依赖"
    fi
    ok "系统依赖安装完成"
}

# Runs every environment probe. Optional probes (GPU/driver details, MPI,
# NCCL) never count as failures.
# Returns: the number of missing mandatory components (0 = all present).
_run_required_checks() {
    local missing=0
    check_nvidia_smi      || missing=$((missing + 1))
    detect_gpu_and_driver || true
    detect_cuda_version   || missing=$((missing + 1))
    check_python          || missing=$((missing + 1))
    check_cmake           || missing=$((missing + 1))
    check_compiler        || missing=$((missing + 1))
    check_mpi             || true
    check_nccl_dev        || true
    return "$missing"
}

# Stage 1 driver: validates the environment, optionally installs system
# packages and re-validates. Exits the script when mandatory parts are missing.
validate_environment() {
    banner "阶段 1/6: 环境校验"

    local errors=0
    _run_required_checks || errors=$?

    echo ""
    if (( errors > 0 )); then
        fail "环境校验发现 $errors 个必要组件缺失"
        if [[ $FLAG_INSTALL_SYS_DEPS -eq 1 ]]; then
            log "检测到 --install-system-deps,尝试安装..."
            install_system_deps
            # Re-run the full set: the package install cannot supply a driver
            # or a CUDA toolkit, so those failures must still be counted.
            errors=0
            _run_required_checks || errors=$?
            if (( errors > 0 )); then
                die "安装系统包后仍有 $errors 个组件缺失,请手动解决"
            fi
        else
            echo ""
            echo "  提示: 加 --install-system-deps 参数可自动安装缺失的系统包"
            echo "        或手动运行上面提示的 apt install 命令后重试"
            die "环境校验未通过"
        fi
    fi
    ok "环境校验通过"
}

# ─── Stage 2: uv ─────────────────────────────────────────────────────────────
# Makes `uv` available on PATH, installing it with the upstream installer
# script when it is found neither on PATH nor in the usual locations.
ensure_uv() {
    banner "阶段 2/6: 确保 uv 可用"

    if command -v uv &>/dev/null; then
        ok "uv 已安装: $(uv --version 2>&1)"
        return 0
    fi

    # Common install locations that may not be on PATH (e.g. under sudo).
    local candidate candidate_dir
    for candidate in "$HOME/.local/bin/uv" "$HOME/.cargo/bin/uv" /usr/local/bin/uv; do
        if [[ -x "$candidate" ]]; then
            candidate_dir="$(dirname "$candidate")"
            export PATH="$candidate_dir:$PATH"
            ok "uv 已找到: $candidate"
            return 0
        fi
    done

    log "正在安装 uv..."
    # pipefail makes a curl failure fail the whole pipeline.
    if ! curl -LsSf https://astral.sh/uv/install.sh | sh 2>/dev/null; then
        die "uv 安装失败。请手动安装: https://docs.astral.sh/uv/getting-started/installation/"
    fi

    export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"

    if ! command -v uv &>/dev/null; then
        die "uv 安装后仍无法找到。请检查 PATH。"
    fi
    ok "uv 安装成功: $(uv --version 2>&1)"
}

# ─── Stage 3: Python virtual environment ─────────────────────────────────────
# Creates (or reuses) $INSTALL_DIR/.venv, installs the project in editable
# mode and, unless --skip-pytorch was given, PyTorch for the detected CUDA tag.
setup_python_venv() {
    banner "阶段 3/6: 创建 Python 虚拟环境"

    local venv_dir="$INSTALL_DIR/.venv"
    local venv_py="$venv_dir/bin/python"

    # Reuse an existing venv only if its interpreter is recent enough.
    if [[ -x "$venv_py" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
        local existing_ver
        existing_ver=$("$venv_py" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "0.0")
        if _version_at_least "$existing_ver" 3 10; then
            ok "虚拟环境已存在: $venv_dir (Python $existing_ver)"
        else
            log "已有 venv 的 Python 版本过低 ($existing_ver),重建中..."
            rm -rf -- "${venv_dir:?}"
        fi
    fi

    if [[ ! -x "$venv_py" ]]; then
        log "创建虚拟环境: $venv_dir"
        # Nothing below can work without the venv, so a failure here is fatal.
        uv venv "$venv_dir" --python "$PYTHON_BIN" || die "虚拟环境创建失败: $venv_dir"
        ok "虚拟环境创建成功"
    fi

    # Project dependencies: best effort, but report the real outcome.
    log "安装 Python 依赖(rich、pyyaml、numpy)..."
    if uv pip install --python "$venv_py" -e "$PROJECT_DIR" 2>&1 | _filter_output 1; then
        ok "项目依赖安装完成"
    else
        warn "项目依赖安装失败(设置 VERBOSE=1 重新运行可查看详情)"
    fi

    if [[ $FLAG_SKIP_PYTORCH -eq 1 ]]; then
        warn "跳过 PyTorch 安装(--skip-pytorch)"
        return 0
    fi

    local torch_ver
    if [[ $FLAG_REBUILD -eq 0 ]] && "$venv_py" -c "import torch" &>/dev/null; then
        torch_ver=$("$venv_py" -c "import torch; print(torch.__version__)" 2>/dev/null)
        ok "PyTorch 已安装: $torch_ver"
        return 0
    fi

    local index_url="https://download.pytorch.org/whl/${CUDA_TAG}"
    log "安装 PyTorch (CUDA $CUDA_TAG): $index_url"
    log "(下载较大,请耐心等待...)"
    if ! uv pip install --python "$venv_py" \
            "torch>=2.1.0" --index-url "$index_url" \
            2>&1 | _filter_output 3; then
        warn "PyTorch 安装失败,可稍后手动安装:"
        echo "    source $INSTALL_DIR/env.sh"
        echo "    uv pip install torch --index-url $index_url"
    fi
    if "$venv_py" -c "import torch" &>/dev/null; then
        torch_ver=$("$venv_py" -c "import torch; print(torch.__version__)" 2>/dev/null)
        ok "PyTorch 安装成功: $torch_ver"
    fi
}

# ─── Stage 4: native tools ───────────────────────────────────────────────────

# Bring a git checkout at $2 up to date from URL $1.
# With --rebuild the directory is removed first; an existing checkout is
# fast-forwarded (best effort); anything else is replaced by a shallow clone.
# Returns non-zero when the clone fails.
_sync_repo() {
    local url="$1" dest="$2"
    if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$dest" ]]; then
        rm -rf -- "${dest:?}"
    fi
    if [[ -d "$dest/.git" ]]; then
        # A failed pull is tolerated: the existing checkout is still buildable.
        git -C "$dest" pull --ff-only 2>/dev/null || true
    else
        rm -rf -- "${dest:?}"
        git clone --depth 1 "$url" "$dest"
    fi
}

# Builds nvbandwidth into $INSTALL_DIR/nvbandwidth/nvbandwidth (non-fatal).
build_nvbandwidth() {
    local src="$INSTALL_DIR/nvbandwidth"

    # Idempotency: skip when the binary is already there.
    if [[ -x "$src/nvbandwidth" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
        ok "nvbandwidth: 已编译 ($src/nvbandwidth)"
        return 0
    fi

    log "编译 nvbandwidth..."
    (
        # Subshell: `set -e` and the directory changes stay local to the build.
        set -e
        _sync_repo https://github.com/NVIDIA/nvbandwidth.git "$src" || exit 1

        cd "$src"
        mkdir -p build
        cd build
        cmake .. -DCMAKE_BUILD_TYPE=Release 2>&1 | _filter_output 3
        make -j"$JOBS" 2>&1 | _filter_output 3

        if [[ -x ./nvbandwidth ]]; then
            cp ./nvbandwidth "$src/nvbandwidth"
        fi
    )

    if [[ -x "$src/nvbandwidth" ]]; then
        ok "nvbandwidth: 编译成功"
    else
        warn "nvbandwidth: 编译失败(非致命,可手动编译)"
    fi
}

# Builds nccl-tests (with MPI=1 when HAS_MPI is set). Skipped without NCCL
# or without a usable CUDA_HOME. Non-fatal.
build_nccl_tests() {
    local src="$INSTALL_DIR/nccl-tests"

    if [[ -x "$src/build/all_reduce_perf" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
        ok "nccl-tests: 已编译 ($src/build/)"
        return 0
    fi

    if [[ $HAS_NCCL_DEV -eq 0 ]]; then
        warn "nccl-tests: 跳过(libnccl-dev 未安装)"
        return 0
    fi

    local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
    if [[ ! -d "$cuda_home/include" ]]; then
        warn "nccl-tests: 跳过(CUDA_HOME=$cuda_home 无效)"
        return 0
    fi

    log "编译 nccl-tests..."
    (
        set -e
        _sync_repo https://github.com/NVIDIA/nccl-tests.git "$src" || exit 1

        cd "$src"
        local -a make_args=(CUDA_HOME="$cuda_home" -j"$JOBS")
        if [[ $HAS_MPI -eq 1 ]]; then
            make_args=(MPI=1 MPI_HOME=/usr "${make_args[@]}")
        fi
        make "${make_args[@]}" 2>&1 | _filter_output 3
    )

    if [[ -x "$src/build/all_reduce_perf" ]]; then
        ok "nccl-tests: 编译成功"
    else
        warn "nccl-tests: 编译失败(非致命)"
    fi
}

# Builds gpu-burn into $INSTALL_DIR/gpu-burn/gpu_burn (non-fatal).
build_gpu_burn() {
    local src="$INSTALL_DIR/gpu-burn"

    if [[ -x "$src/gpu_burn" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then
        ok "gpu-burn: 已编译 ($src/gpu_burn)"
        return 0
    fi

    local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
    if [[ ! -d "$cuda_home" ]]; then
        warn "gpu-burn: 跳过(CUDA_HOME=$cuda_home 不存在)"
        return 0
    fi

    log "编译 gpu-burn..."
    (
        set -e
        _sync_repo https://github.com/wilicc/gpu-burn.git "$src" || exit 1

        cd "$src"
        make CUDA_PATH="$cuda_home" -j"$JOBS" 2>&1 | _filter_output 3
    )

    if [[ -x "$src/gpu_burn" ]]; then
        ok "gpu-burn: 编译成功"
    else
        warn "gpu-burn: 编译失败(非致命)"
    fi
}

# Stage 4 driver: builds the three native tools in order.
build_native_tools() {
    banner "阶段 4/6: 编译原生工具"
    build_nvbandwidth
    build_nccl_tests
    build_gpu_burn
}

# ─── Stage 5: activation script and launcher ─────────────────────────────────
# Writes $INSTALL_DIR/env.sh (to be sourced) and $INSTALL_DIR/run-gpu-tests.
# Paths are emitted with printf %q so that directories containing spaces or
# shell/sed metacharacters still produce valid scripts.
generate_env_sh() {
    banner "阶段 5/6: 生成环境脚本"

    local env_file="$INSTALL_DIR/env.sh"
    {
        printf '%s\n' '#!/usr/bin/env bash' '# GPU Test Suite 环境激活脚本'
        printf '# 用法: source %s/env.sh\n\n' "$INSTALL_DIR"
        printf 'export GPU_TOOLS_DIR=%q\n' "$INSTALL_DIR"
        # Quoted delimiter: everything below is written literally and is
        # expanded only when env.sh is sourced.
        cat << 'ENVEOF'
export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"

# 激活 Python 虚拟环境
if [[ -f "$GPU_TOOLS_DIR/.venv/bin/activate" ]]; then
    source "$GPU_TOOLS_DIR/.venv/bin/activate"
fi

# 编译工具加入 PATH
export PATH="$GPU_TOOLS_DIR/nvbandwidth:$PATH"
export PATH="$GPU_TOOLS_DIR/nccl-tests/build:$PATH"
export PATH="$GPU_TOOLS_DIR/gpu-burn:$PATH"
export PATH="$CUDA_HOME/bin:$PATH"

# 库路径
export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}"
ENVEOF
    } > "$env_file" || die "无法写入 $env_file"
    chmod +x "$env_file"
    ok "env.sh 已生成: $env_file"

    # Launcher: sources env.sh next to itself, then runs the tester with the
    # (venv) python3 that env.sh put first on PATH.
    local wrapper="$INSTALL_DIR/run-gpu-tests"
    {
        printf '%s\n' '#!/usr/bin/env bash' '# GPU Test Suite 一键运行器'
        printf '# 用法: %s/run-gpu-tests --test all\n\n' "$INSTALL_DIR"
        cat << 'WRAPEOF'
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/env.sh"
WRAPEOF
        printf 'exec python3 %q "$@"\n' "$PROJECT_DIR/gpu_tester.py"
    } > "$wrapper" || die "无法写入 $wrapper"
    chmod +x "$wrapper"
    ok "run-gpu-tests 已生成: $wrapper"
}

# ─── Stage 6: summary ────────────────────────────────────────────────────────
# Prints the status of the venv, PyTorch, the native tools and the RDMA tools,
# followed by usage instructions.
print_summary() {
    banner "阶段 6/6: 安装总结"

    local venv_py="$INSTALL_DIR/.venv/bin/python"

    echo -e "${BOLD}安装目录:${NC} $INSTALL_DIR"
    echo ""
    echo -e "${BOLD}组件状态:${NC}"

    # Python virtual environment
    if [[ -x "$venv_py" ]]; then
        local py_ver
        py_ver=$("$venv_py" --version 2>&1)
        echo -e "  ${GREEN}✓${NC} Python venv: $py_ver"
    else
        echo -e "  ${RED}✗${NC} Python venv: 未创建"
    fi

    # PyTorch
    if "$venv_py" -c "import torch" &>/dev/null; then
        local tv
        tv=$("$venv_py" -c "import torch; print(f'{torch.__version__} (CUDA {torch.version.cuda})')" 2>/dev/null)
        echo -e "  ${GREEN}✓${NC} PyTorch: $tv"
    else
        echo -e "  ${YELLOW}○${NC} PyTorch: 未安装"
    fi

    # Native tools, encoded as "<binary path>:<display name>"
    local tool_info path name
    for tool_info in \
        "$INSTALL_DIR/nvbandwidth/nvbandwidth:nvbandwidth" \
        "$INSTALL_DIR/nccl-tests/build/all_reduce_perf:nccl-tests" \
        "$INSTALL_DIR/gpu-burn/gpu_burn:gpu-burn"; do
        path="${tool_info%%:*}"
        name="${tool_info##*:}"
        if [[ -x "$path" ]]; then
            echo -e "  ${GREEN}✓${NC} $name"
        else
            echo -e "  ${YELLOW}○${NC} $name (未编译)"
        fi
    done

    # RDMA tools (system level)
    local tool rdma_found=0
    for tool in ib_write_bw ib_read_bw ibstat; do
        if command -v "$tool" &>/dev/null; then
            rdma_found=$((rdma_found + 1))
        fi
    done
    if (( rdma_found > 0 )); then
        echo -e "  ${GREEN}✓${NC} RDMA 工具: $rdma_found/3 可用"
    else
        echo -e "  ${YELLOW}○${NC} RDMA 工具: 未安装 (apt install perftest infiniband-diags)"
    fi

    echo ""
    echo -e "${BOLD}使用方法:${NC}"
    echo ""
    echo "  # 方式一: source 激活后使用"
    echo "  source $INSTALL_DIR/env.sh"
    echo "  python3 $PROJECT_DIR/gpu_tester.py --test all"
    echo ""
    echo "  # 方式二: 一键运行"
    echo "  $INSTALL_DIR/run-gpu-tests --test all"
    echo "  $INSTALL_DIR/run-gpu-tests --test health"
    echo "  $INSTALL_DIR/run-gpu-tests              # 交互式菜单"
    echo ""
}

# ─── Entry point ─────────────────────────────────────────────────────────────
main() {
    parse_args "$@"

    echo ""
    echo -e "${BOLD}${CYAN}╔══════════════════════════════════════════════════╗${NC}"
    echo -e "${BOLD}${CYAN}║       GPU Test Suite — 一键安装                  ║${NC}"
    echo -e "${BOLD}${CYAN}║       环境隔离 · 自动检测 · 完整部署             ║${NC}"
    echo -e "${BOLD}${CYAN}╚══════════════════════════════════════════════════╝${NC}"
    echo ""
    log "安装目录: $INSTALL_DIR"
    log "项目目录: $PROJECT_DIR"
    echo ""

    check_permissions
    validate_environment
    ensure_uv
    setup_python_venv
    build_native_tools
    generate_env_sh
    print_summary

    echo -e "${GREEN}${BOLD}安装完成!${NC}"
}

main "$@"