From 24934bc182799fe8e505bb0d67469506014d8f94 Mon Sep 17 00:00:00 2001 From: qinyusen Date: Thu, 7 May 2026 01:32:13 +0800 Subject: [PATCH] feat: rewrite install_deps.sh with env isolation and add numpy to requirements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Complete rewrite of install_deps.sh (6-phase architecture): environment validation, uv-based venv isolation, CUDA auto-detection, idempotent native tool compilation, env.sh/run-gpu-tests generation - Add numpy>=1.24 to requirements.txt to align with pyproject.toml - Support --install-system-deps, --skip-pytorch, --rebuild, -y flags - Use subshells for compilation to prevent CWD pollution - Generate env.sh activation script and run-gpu-tests wrapper ๐Ÿค– Generated with [Qoder][https://qoder.com] --- install_deps.sh | 815 ++++++++++++++++++++++++++++++++++++++--------- requirements.txt | 1 + 2 files changed, 666 insertions(+), 150 deletions(-) diff --git a/install_deps.sh b/install_deps.sh index c84b47b..a9659a3 100755 --- a/install_deps.sh +++ b/install_deps.sh @@ -1,234 +1,749 @@ #!/usr/bin/env bash -set -euo pipefail +# ============================================================================= +# GPU Test Suite โ€” ไธ€้”ฎๅฎ‰่ฃ…่„šๆœฌ๏ผˆ็Žฏๅขƒ้š”็ฆป็‰ˆ๏ผ‰ +# ๆ”ฏๆŒ: A100 / A800 / H100 / H200 / B200 / B300 +# +# ๅŠŸ่ƒฝ: +# 1. ็Žฏๅขƒๆ ก้ชŒ๏ผˆGPUใ€CUDAใ€Pythonใ€็ผ–่ฏ‘ๅ™จ็ญ‰๏ผ‰ +# 2. ่‡ชๅŠจๅฎ‰่ฃ… uv ๅนถๅˆ›ๅปบ้š”็ฆป Python ่™šๆ‹Ÿ็Žฏๅขƒ +# 3. ่‡ชๅŠจๆฃ€ๆต‹ CUDA ็‰ˆๆœฌๅนถๅฎ‰่ฃ…ๅฏนๅบ” PyTorch +# 4. ็ผ–่ฏ‘ nvbandwidth / nccl-tests / gpu-burn +# 5. ็”Ÿๆˆ env.sh ๆฟ€ๆดป่„šๆœฌๅ’Œ run-gpu-tests ่ฟ่กŒๅ™จ +# +# ็”จๆณ•: +# sudo bash install_deps.sh # ๆ ‡ๅ‡†ๅฎ‰่ฃ… +# sudo bash install_deps.sh --install-system-deps # ๅŒๆ—ถๅฎ‰่ฃ…็ณป็ปŸๅŒ… +# sudo bash install_deps.sh --skip-pytorch # ่ทณ่ฟ‡ PyTorch +# sudo bash install_deps.sh --rebuild # ๅผบๅˆถ้‡ๆ–ฐ็ผ–่ฏ‘ +# sudo bash install_deps.sh -y # ้žไบคไบ’ๆจกๅผ +# ============================================================================= +set -uo pipefail +# โ”€โ”€โ”€ ๅ…จๅฑ€ๅ˜้‡ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ INSTALL_DIR="${GPU_TOOLS_DIR:-/opt/gpu-test-tools}" +PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" JOBS="${MAKE_JOBS:-$(nproc)}" VERBOSE="${VERBOSE:-0}" +# ๅ‚ๆ•ฐๆ ‡ๅฟ— +FLAG_INSTALL_SYS_DEPS=0 +FLAG_SKIP_PYTORCH=0 +FLAG_REBUILD=0 +FLAG_YES=0 + +# ๆฃ€ๆต‹็ป“ๆžœ๏ผˆๅ…จๅฑ€๏ผ‰ +DETECTED_GPU="" +DETECTED_DRIVER="" +CUDA_VERSION="" +CUDA_TAG="" +PYTHON_BIN="" +HAS_MPI=0 +HAS_NCCL_DEV=0 + +# โ”€โ”€โ”€ ้ขœ่‰ฒๅ’Œๆ—ฅๅฟ— โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' CYAN='\033[0;36m' +BOLD='\033[1m' NC='\033[0m' log() { echo -e "${CYAN}[INFO]${NC} $*"; } ok() { echo -e "${GREEN}[ OK ]${NC} $*"; } warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } fail() { echo -e "${RED}[FAIL]${NC} $*"; } +die() { echo -e "${RED}[FATAL]${NC} $*"; exit 1; } +banner() { echo -e "\n${BOLD}${CYAN}โ•โ•โ•โ•โ•โ• $* โ•โ•โ•โ•โ•โ•${NC}\n"; } -check_root() { - if [[ $EUID -ne 0 ]]; then - warn "Not running as root. Some installations may fail." - warn "Re-run with: sudo $0" - fi +# ้”™่ฏฏ้™ท้˜ฑ +trap 'fail "่„šๆœฌๅœจ็ฌฌ $LINENO ่กŒๅ‡บ้”™ใ€‚่ฎพ็ฝฎ VERBOSE=1 ๆŸฅ็œ‹่ฏฆๆƒ…ใ€‚"' ERR + +# โ”€โ”€โ”€ ๅ‚ๆ•ฐ่งฃๆž โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + --install-system-deps) FLAG_INSTALL_SYS_DEPS=1 ;; + --skip-pytorch) FLAG_SKIP_PYTORCH=1 ;; + --rebuild) FLAG_REBUILD=1 ;; + -y|--yes) FLAG_YES=1 ;; + -h|--help) + echo "็”จๆณ•: $0 [้€‰้กน]" + echo "" + echo "้€‰้กน:" + echo " --install-system-deps ่‡ชๅŠจๅฎ‰่ฃ…็ผบๅคฑ็š„็ณป็ปŸๅŒ…" + echo " --skip-pytorch ่ทณ่ฟ‡ PyTorch ๅฎ‰่ฃ…" + echo " --rebuild ๅผบๅˆถ้‡ๆ–ฐ็ผ–่ฏ‘ๅŽŸ็”Ÿๅทฅๅ…ท" + echo " -y, --yes ้žไบคไบ’ๆจกๅผ" + echo " -h, --help ๆ˜พ็คบๆญคๅธฎๅŠฉ" + echo "" + echo "็Žฏๅขƒๅ˜้‡:" + echo " GPU_TOOLS_DIR ๅฎ‰่ฃ…็›ฎๅฝ• (้ป˜่ฎค: /opt/gpu-test-tools)" + echo " MAKE_JOBS ็ผ–่ฏ‘ๅนถ่กŒๆ•ฐ (้ป˜่ฎค: nproc)" + echo " CUDA_HOME CUDA ๅฎ‰่ฃ…่ทฏๅพ„ (้ป˜่ฎค: /usr/local/cuda)" + exit 0 + ;; + *) warn "ๆœช็Ÿฅๅ‚ๆ•ฐ: $1" ;; + esac + shift + done } -detect_gpu() { - if ! command -v nvidia-smi &>/dev/null; then - fail "nvidia-smi not found. Install NVIDIA drivers first." - exit 1 +# โ”€โ”€โ”€ ้˜ถๆฎต 0: ๆƒ้™ๆฃ€ๆŸฅ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +check_permissions() { + local parent_dir + parent_dir="$(dirname "$INSTALL_DIR")" + if [[ ! -w "$parent_dir" ]] && [[ ! -d "$INSTALL_DIR" || ! -w "$INSTALL_DIR" ]]; then + die "ๆ— ๆณ•ๅ†™ๅ…ฅ $INSTALL_DIR๏ผˆ่ฏทไฝฟ็”จ sudo ๆˆ–่ฎพ็ฝฎ GPU_TOOLS_DIR ๅˆฐๅฏๅ†™่ทฏๅพ„๏ผ‰" fi - local gpu_name - gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -1) - log "Detected GPU: $gpu_name" + mkdir -p "$INSTALL_DIR" +} + +# โ”€โ”€โ”€ ้˜ถๆฎต 1: ็Žฏๅขƒๆ ก้ชŒ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +# ๆœ€ไฝŽ้ฉฑๅŠจ็‰ˆๆœฌ่กจ +declare -A MIN_DRIVERS=( + ["A100"]="470" ["A800"]="470" + ["H100"]="535" ["H200"]="535" + ["B200"]="550" ["B300"]="550" +) + +check_nvidia_smi() { + if ! command -v nvidia-smi &>/dev/null; then + fail "nvidia-smi ๆœชๆ‰พๅˆฐ" + echo " โ†’ ่ฏทๅ…ˆๅฎ‰่ฃ… NVIDIA ้ฉฑๅŠจ" + return 1 + fi + ok "nvidia-smi ๅฏ็”จ" + return 0 +} + +detect_gpu_and_driver() { + local smi_out + smi_out=$(nvidia-smi --query-gpu=name,driver_version --format=csv,noheader 2>/dev/null | head -1) + if [[ -z "$smi_out" ]]; then + warn "ๆ— ๆณ•ๆŸฅ่ฏข GPU ไฟกๆฏ" + return 1 + fi + + DETECTED_GPU=$(echo "$smi_out" | cut -d',' -f1 | xargs) + DETECTED_DRIVER=$(echo "$smi_out" | cut -d',' -f2 | xargs) + + # ๆฃ€ๆŸฅ้ฉฑๅŠจ็‰ˆๆœฌ + local gpu_key="" + for key in "${!MIN_DRIVERS[@]}"; do + if echo "$DETECTED_GPU" | grep -qi "$key"; then + gpu_key="$key" + break + fi + done + + if [[ -n "$gpu_key" ]]; then + local min_drv="${MIN_DRIVERS[$gpu_key]}" + local drv_major="${DETECTED_DRIVER%%.*}" + if [[ "$drv_major" -lt "$min_drv" ]]; then + warn "้ฉฑๅŠจ $DETECTED_DRIVER < ๆœ€ไฝŽ่ฆๆฑ‚ $min_drv๏ผˆ$gpu_key ้œ€่ฆ๏ผ‰" + else + ok "GPU: $DETECTED_GPU | ้ฉฑๅŠจ: $DETECTED_DRIVER (>= $min_drv)" + fi + else + ok "GPU: $DETECTED_GPU | ้ฉฑๅŠจ: $DETECTED_DRIVER" + fi + return 0 +} + +detect_cuda_version() { + # ๆ–นๅผ 1: nvcc๏ผˆๆœ€ๅฏ้ ๏ผŒไปฃ่กจ toolkit ็กฎๅฎžๅฎ‰่ฃ…ไบ†๏ผ‰ + if command -v nvcc &>/dev/null; then + CUDA_VERSION=$(nvcc --version 2>/dev/null | grep -oP 'release \K[0-9]+\.[0-9]+') + if [[ -n "$CUDA_VERSION" ]]; then + ok "CUDA: $CUDA_VERSION (via nvcc)" + _map_cuda_tag + return 0 + fi + fi + + # ๆ–นๅผ 2: nvidia-smi๏ผˆ้ฉฑๅŠจๆ”ฏๆŒ็š„ๆœ€ๅคง CUDA ็‰ˆๆœฌ๏ผŒ้ž toolkit๏ผ‰ + local smi_cuda + smi_cuda=$(nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+') + if [[ -n "$smi_cuda" ]]; then + CUDA_VERSION="$smi_cuda" + warn "CUDA: $CUDA_VERSION (via nvidia-smi โ€” ไป…ไปฃ่กจ้ฉฑๅŠจ่ƒฝๅŠ›๏ผŒ้žๅทฒๅฎ‰่ฃ… toolkit)" + warn " โ†’ ่‹ฅ็ผ–่ฏ‘ๅคฑ่ดฅ๏ผŒ่ฏทๅฎ‰่ฃ… CUDA Toolkit: apt install cuda-toolkit-${CUDA_VERSION/./-}" + _map_cuda_tag + return 0 + fi + + # ๆ–นๅผ 3: /usr/local/cuda + if [[ -f /usr/local/cuda/version.txt ]]; then + CUDA_VERSION=$(grep -oP '[0-9]+\.[0-9]+' /usr/local/cuda/version.txt | head -1) + if [[ -n "$CUDA_VERSION" ]]; then + ok "CUDA: $CUDA_VERSION (via /usr/local/cuda/version.txt)" + _map_cuda_tag + return 0 + fi + fi + + fail "ๆ— ๆณ•ๆฃ€ๆต‹ CUDA ็‰ˆๆœฌ" + echo " โ†’ ่ฏทๅฎ‰่ฃ… CUDA Toolkit: https://developer.nvidia.com/cuda-downloads" + return 1 +} + +_map_cuda_tag() { + local major minor + major="${CUDA_VERSION%%.*}" + minor="${CUDA_VERSION#*.}" + minor="${minor%%.*}" + + if [[ "$major" -eq 11 ]]; then + CUDA_TAG="cu118" + elif [[ "$major" -eq 12 ]]; then + if [[ "$minor" -le 1 ]]; then + CUDA_TAG="cu121" + elif [[ "$minor" -le 4 ]]; then + CUDA_TAG="cu124" + else + CUDA_TAG="cu128" + fi + else + CUDA_TAG="cu128" + warn "ๆœช็Ÿฅ CUDA $CUDA_VERSION๏ผŒ้ป˜่ฎคไฝฟ็”จ cu128 ็ดขๅผ•" + fi + log "PyTorch wheel ็ดขๅผ•: $CUDA_TAG" +} + +check_python() { + local py_cmd="" + for cmd in python3.12 python3.11 python3.10 python3; do + if command -v "$cmd" &>/dev/null; then + local ver + ver=$("$cmd" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null) + local py_major py_minor + py_major="${ver%%.*}" + py_minor="${ver#*.}" + if [[ "$py_major" -ge 3 ]] && [[ "$py_minor" -ge 10 ]]; then + py_cmd="$cmd" + break + fi + fi + done + + if [[ -z "$py_cmd" ]]; then + fail "Python >= 3.10 ๆœชๆ‰พๅˆฐ" + echo " โ†’ apt install python3.11 python3.11-venv" + return 1 + fi + + PYTHON_BIN="$(command -v "$py_cmd")" + ok "Python: $("$py_cmd" --version 2>&1) ($PYTHON_BIN)" + return 0 +} + +check_cmake() { + if ! command -v cmake &>/dev/null; then + fail "cmake ๆœชๆ‰พๅˆฐ๏ผˆ็ผ–่ฏ‘ nvbandwidth ้œ€่ฆ >= 3.18๏ผ‰" + echo " โ†’ apt install cmake" + return 1 + fi + local cmake_ver + cmake_ver=$(cmake --version | head -1 | grep -oP '[0-9]+\.[0-9]+') + local cmake_major cmake_minor + cmake_major="${cmake_ver%%.*}" + cmake_minor="${cmake_ver#*.}" + if [[ "$cmake_major" -lt 3 ]] || { [[ "$cmake_major" -eq 3 ]] && [[ "$cmake_minor" -lt 18 ]]; }; then + fail "cmake $cmake_ver < 3.18๏ผˆnvbandwidth ้œ€่ฆ >= 3.18๏ผ‰" + echo " โ†’ ๅ‡็บง cmake: pip install cmake ๆˆ–ไปŽๆบ็ ๅฎ‰่ฃ…" + return 1 + fi + ok "cmake: $cmake_ver" + return 0 +} + +check_compiler() { + if ! command -v gcc &>/dev/null || ! command -v g++ &>/dev/null; then + fail "gcc/g++ ๆœชๆ‰พๅˆฐ" + echo " โ†’ apt install build-essential" + return 1 + fi + local gcc_ver + gcc_ver=$(gcc -dumpversion 2>/dev/null) + ok "gcc/g++: $gcc_ver" + return 0 +} + +check_mpi() { + if command -v mpirun &>/dev/null || command -v mpiexec &>/dev/null; then + HAS_MPI=1 + ok "MPI: $(mpirun --version 2>&1 | head -1)" + else + HAS_MPI=0 + warn "mpirun ๆœชๆ‰พๅˆฐ๏ผˆnccl-tests ๅฐ†ไธไฝฟ็”จ MPI ๆจกๅผ๏ผ‰" + echo " โ†’ apt install openmpi-bin libopenmpi-dev" + fi + return 0 +} + +check_nccl_dev() { + if ldconfig -p 2>/dev/null | grep -q libnccl; then + HAS_NCCL_DEV=1 + ok "libnccl: ๅทฒๆ‰พๅˆฐ (via ldconfig)" + return 0 + fi + if [[ -f /usr/include/nccl.h ]] || dpkg -l libnccl-dev &>/dev/null 2>&1; then + HAS_NCCL_DEV=1 + ok "libnccl-dev: ๅทฒๅฎ‰่ฃ…" + return 0 + fi + HAS_NCCL_DEV=0 + warn "libnccl-dev ๆœชๆ‰พๅˆฐ๏ผˆๅฐ†่ทณ่ฟ‡ nccl-tests ็ผ–่ฏ‘๏ผ‰" + echo " โ†’ apt install libnccl-dev libnccl2" + return 0 } install_system_deps() { - log "Installing system dependencies..." + log "ๅฎ‰่ฃ…็ณป็ปŸไพ่ต–ๅŒ…..." if command -v apt-get &>/dev/null; then apt-get update -qq apt-get install -y -qq build-essential git cmake wget curl \ openmpi-bin libopenmpi-dev openssh-client \ infiniband-diags ibverbs-utils perftest \ python3 python3-pip python3-venv \ - 2>/dev/null || warn "Some apt packages failed (may already be installed)" - elif command -v yum &>/dev/null; then - yum groupinstall -y "Development Tools" 2>/dev/null || true - yum install -y git cmake wget curl \ - openmpi openmpi-devel openssh-clients \ - infiniband-diags libibverbs-utils perftest \ - python3 python3-pip \ - 2>/dev/null || warn "Some yum packages failed" + libnccl-dev libnccl2 \ + 2>/dev/null || warn "้ƒจๅˆ†ๅŒ…ๅฎ‰่ฃ…ๅคฑ่ดฅ๏ผˆๅฏ่ƒฝๅทฒๅฎ‰่ฃ…๏ผ‰" elif command -v dnf &>/dev/null; then dnf groupinstall -y "Development Tools" 2>/dev/null || true dnf install -y git cmake wget curl \ openmpi openmpi-devel openssh-clients \ infiniband-diags libibverbs-utils perftest \ python3 python3-pip \ - 2>/dev/null || warn "Some dnf packages failed" + 2>/dev/null || warn "้ƒจๅˆ†ๅŒ…ๅฎ‰่ฃ…ๅคฑ่ดฅ" + elif command -v yum &>/dev/null; then + yum groupinstall -y "Development Tools" 2>/dev/null || true + yum install -y git cmake wget curl \ + openmpi openmpi-devel openssh-clients \ + infiniband-diags libibverbs-utils perftest \ + python3 python3-pip \ + 2>/dev/null || warn "้ƒจๅˆ†ๅŒ…ๅฎ‰่ฃ…ๅคฑ่ดฅ" else - warn "Unsupported package manager. Install deps manually." + warn "ๆœช่ฏ†ๅˆซ็š„ๅŒ…็ฎก็†ๅ™จ๏ผŒ่ฏทๆ‰‹ๅŠจๅฎ‰่ฃ…ไพ่ต–" fi - ok "System dependencies" + ok "็ณป็ปŸไพ่ต–ๅฎ‰่ฃ…ๅฎŒๆˆ" } -install_python_deps() { - log "Installing Python dependencies..." - pip3 install --quiet rich pyyaml 2>/dev/null || pip install --quiet rich pyyaml - ok "Python dependencies (rich, pyyaml)" +validate_environment() { + banner "้˜ถๆฎต 1/6: ็Žฏๅขƒๆ ก้ชŒ" + + local errors=0 + + check_nvidia_smi || ((errors++)) + detect_gpu_and_driver || true + detect_cuda_version || ((errors++)) + check_python || ((errors++)) + check_cmake || ((errors++)) + check_compiler || ((errors++)) + check_mpi || true + check_nccl_dev || true + + echo "" + if [[ $errors -gt 0 ]]; then + fail "็Žฏๅขƒๆ ก้ชŒๅ‘็Žฐ $errors ไธชๅฟ…่ฆ็ป„ไปถ็ผบๅคฑ" + if [[ $FLAG_INSTALL_SYS_DEPS -eq 1 ]]; then + log "ๆฃ€ๆต‹ๅˆฐ --install-system-deps๏ผŒๅฐ่ฏ•ๅฎ‰่ฃ…..." + install_system_deps + # ้‡ๆ–ฐๆ ก้ชŒ + errors=0 + check_python || ((errors++)) + check_cmake || ((errors++)) + check_compiler || ((errors++)) + check_mpi || true + check_nccl_dev || true + if [[ $errors -gt 0 ]]; then + die "ๅฎ‰่ฃ…็ณป็ปŸๅŒ…ๅŽไปๆœ‰ $errors ไธช็ป„ไปถ็ผบๅคฑ๏ผŒ่ฏทๆ‰‹ๅŠจ่งฃๅ†ณ" + fi + else + echo "" + echo " ๆ็คบ: ๅŠ  --install-system-deps ๅ‚ๆ•ฐๅฏ่‡ชๅŠจๅฎ‰่ฃ…็ผบๅคฑ็š„็ณป็ปŸๅŒ…" + echo " ๆˆ–ๆ‰‹ๅŠจ่ฟ่กŒไธŠ้ขๆ็คบ็š„ apt install ๅ‘ฝไปคๅŽ้‡่ฏ•" + die "็Žฏๅขƒๆ ก้ชŒๆœช้€š่ฟ‡" + fi + fi + ok "็Žฏๅขƒๆ ก้ชŒ้€š่ฟ‡" } -install_nvbandwidth() { - log "Installing nvbandwidth..." +# โ”€โ”€โ”€ ้˜ถๆฎต 2: ๅฎ‰่ฃ… uv โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +ensure_uv() { + banner "้˜ถๆฎต 2/6: ็กฎไฟ uv ๅฏ็”จ" + + # ๆฃ€ๆŸฅๅทฒๆœ‰็š„ uv + if command -v uv &>/dev/null; then + ok "uv ๅทฒๅฎ‰่ฃ…: $(uv --version 2>&1)" + return 0 + fi + + # ๆฃ€ๆŸฅๅธธ่งไฝ็ฝฎ + for p in "$HOME/.local/bin/uv" "$HOME/.cargo/bin/uv" /usr/local/bin/uv; do + if [[ -x "$p" ]]; then + export PATH="$(dirname "$p"):$PATH" + ok "uv ๅทฒๆ‰พๅˆฐ: $p" + return 0 + fi + done + + log "ๆญฃๅœจๅฎ‰่ฃ… uv..." + if ! curl -LsSf https://astral.sh/uv/install.sh | sh 2>/dev/null; then + die "uv ๅฎ‰่ฃ…ๅคฑ่ดฅใ€‚่ฏทๆ‰‹ๅŠจๅฎ‰่ฃ…: https://docs.astral.sh/uv/getting-started/installation/" + fi + + # ๅฐ† uv ๅŠ ๅ…ฅ PATH + export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH" + + if ! command -v uv &>/dev/null; then + die "uv ๅฎ‰่ฃ…ๅŽไปๆ— ๆณ•ๆ‰พๅˆฐใ€‚่ฏทๆฃ€ๆŸฅ PATHใ€‚" + fi + ok "uv ๅฎ‰่ฃ…ๆˆๅŠŸ: $(uv --version 2>&1)" +} + +# โ”€โ”€โ”€ ้˜ถๆฎต 3: Python ่™šๆ‹Ÿ็Žฏๅขƒ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +setup_python_venv() { + banner "้˜ถๆฎต 3/6: ๅˆ›ๅปบ Python ่™šๆ‹Ÿ็Žฏๅขƒ" + + local venv_dir="$INSTALL_DIR/.venv" + + # ๆฃ€ๆŸฅๅทฒๆœ‰ venv + if [[ -x "$venv_dir/bin/python" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then + local existing_ver + existing_ver=$("$venv_dir/bin/python" -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')" 2>/dev/null || echo "0.0") + local ev_major="${existing_ver%%.*}" + local ev_minor="${existing_ver#*.}" + if [[ "$ev_major" -ge 3 ]] && [[ "$ev_minor" -ge 10 ]]; then + ok "่™šๆ‹Ÿ็Žฏๅขƒๅทฒๅญ˜ๅœจ: $venv_dir (Python $existing_ver)" + else + log "ๅทฒๆœ‰ venv ็š„ Python ็‰ˆๆœฌ่ฟ‡ไฝŽ ($existing_ver)๏ผŒ้‡ๅปบไธญ..." + rm -rf "$venv_dir" + fi + fi + + # ๅˆ›ๅปบ venv + if [[ ! -x "$venv_dir/bin/python" ]]; then + log "ๅˆ›ๅปบ่™šๆ‹Ÿ็Žฏๅขƒ: $venv_dir" + uv venv "$venv_dir" --python "$PYTHON_BIN" + ok "่™šๆ‹Ÿ็Žฏๅขƒๅˆ›ๅปบๆˆๅŠŸ" + fi + + # ๅฎ‰่ฃ…้กน็›ฎไพ่ต– + log "ๅฎ‰่ฃ… Python ไพ่ต–๏ผˆrichใ€pyyamlใ€numpy๏ผ‰..." + uv pip install --python "$venv_dir/bin/python" \ + -e "$PROJECT_DIR" 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -1; } || true + ok "้กน็›ฎไพ่ต–ๅฎ‰่ฃ…ๅฎŒๆˆ" + + # ๅฎ‰่ฃ… PyTorch + if [[ $FLAG_SKIP_PYTORCH -eq 1 ]]; then + warn "่ทณ่ฟ‡ PyTorch ๅฎ‰่ฃ…๏ผˆ--skip-pytorch๏ผ‰" + else + # ๆฃ€ๆŸฅๆ˜ฏๅฆๅทฒๆœ‰ torch + if "$venv_dir/bin/python" -c "import torch" &>/dev/null && [[ $FLAG_REBUILD -eq 0 ]]; then + local torch_ver + torch_ver=$("$venv_dir/bin/python" -c "import torch; print(torch.__version__)" 2>/dev/null) + ok "PyTorch ๅทฒๅฎ‰่ฃ…: $torch_ver" + else + local index_url="https://download.pytorch.org/whl/${CUDA_TAG}" + log "ๅฎ‰่ฃ… PyTorch (CUDA $CUDA_TAG): $index_url" + log "๏ผˆไธ‹่ฝฝ่พƒๅคง๏ผŒ่ฏท่€ๅฟƒ็ญ‰ๅพ…...๏ผ‰" + uv pip install --python "$venv_dir/bin/python" \ + "torch>=2.1.0" --index-url "$index_url" \ + 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; } || { + warn "PyTorch ๅฎ‰่ฃ…ๅคฑ่ดฅ๏ผŒๅฏ็จๅŽๆ‰‹ๅŠจๅฎ‰่ฃ…:" + echo " source $INSTALL_DIR/env.sh" + echo " uv pip install torch --index-url $index_url" + } + if "$venv_dir/bin/python" -c "import torch" &>/dev/null; then + local torch_ver + torch_ver=$("$venv_dir/bin/python" -c "import torch; print(torch.__version__)" 2>/dev/null) + ok "PyTorch ๅฎ‰่ฃ…ๆˆๅŠŸ: $torch_ver" + fi + fi + fi +} + +# โ”€โ”€โ”€ ้˜ถๆฎต 4: ็ผ–่ฏ‘ๅŽŸ็”Ÿๅทฅๅ…ท โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +build_nvbandwidth() { local src="$INSTALL_DIR/nvbandwidth" - if [[ -x "$src/nvbandwidth" ]]; then - ok "nvbandwidth already installed at $src/nvbandwidth" - return + + # ๅน‚็ญ‰ๆฃ€ๆŸฅ + if [[ -x "$src/nvbandwidth" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then + ok "nvbandwidth: ๅทฒ็ผ–่ฏ‘ ($src/nvbandwidth)" + return 0 fi - mkdir -p "$INSTALL_DIR" - git clone --depth 1 https://github.com/NVIDIA/nvbandwidth.git "$src" 2>/dev/null - cd "$src" - mkdir -p build && cd build - cmake .. -DCMAKE_BUILD_TYPE=Release 2>/dev/null - make -j"$JOBS" 2>/dev/null - if [[ -x "$src/build/nvbandwidth" ]]; then - cp "$src/build/nvbandwidth" "$src/nvbandwidth" - ok "nvbandwidth installed at $src/nvbandwidth" + log "็ผ–่ฏ‘ nvbandwidth..." + ( + set -e + # ๆธ…็† / ๅ…‹้š† + if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$src" ]]; then + rm -rf "$src" + fi + if [[ -d "$src/.git" ]]; then + cd "$src" && git pull --ff-only 2>/dev/null || true + elif [[ -d "$src" ]]; then + rm -rf "$src" + git clone --depth 1 https://github.com/NVIDIA/nvbandwidth.git "$src" + else + git clone --depth 1 https://github.com/NVIDIA/nvbandwidth.git "$src" + fi + + cd "$src" + mkdir -p build && cd build + cmake .. -DCMAKE_BUILD_TYPE=Release 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; } + make -j"$JOBS" 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; } + + if [[ -x ./nvbandwidth ]]; then + cp ./nvbandwidth "$src/nvbandwidth" + fi + ) + + if [[ -x "$src/nvbandwidth" ]]; then + ok "nvbandwidth: ็ผ–่ฏ‘ๆˆๅŠŸ" else - warn "nvbandwidth build failed. Try building manually in $src" + warn "nvbandwidth: ็ผ–่ฏ‘ๅคฑ่ดฅ๏ผˆ้ž่‡ดๅ‘ฝ๏ผŒๅฏๆ‰‹ๅŠจ็ผ–่ฏ‘๏ผ‰" fi } -install_nccl_tests() { - log "Installing nccl-tests..." +build_nccl_tests() { local src="$INSTALL_DIR/nccl-tests" - if [[ -x "$src/build/all_reduce_perf" ]]; then - ok "nccl-tests already installed at $src/build/" - return + + if [[ -x "$src/build/all_reduce_perf" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then + ok "nccl-tests: ๅทฒ็ผ–่ฏ‘ ($src/build/)" + return 0 fi - mkdir -p "$INSTALL_DIR" - git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$src" 2>/dev/null - cd "$src" + if [[ $HAS_NCCL_DEV -eq 0 ]]; then + warn "nccl-tests: ่ทณ่ฟ‡๏ผˆlibnccl-dev ๆœชๅฎ‰่ฃ…๏ผ‰" + return 0 + fi + + local cuda_home="${CUDA_HOME:-/usr/local/cuda}" + if [[ ! -d "$cuda_home/include" ]]; then + warn "nccl-tests: ่ทณ่ฟ‡๏ผˆCUDA_HOME=$cuda_home ๆ— ๆ•ˆ๏ผ‰" + return 0 + fi + + log "็ผ–่ฏ‘ nccl-tests..." + ( + set -e + if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$src" ]]; then + rm -rf "$src" + fi + if [[ -d "$src/.git" ]]; then + cd "$src" && git pull --ff-only 2>/dev/null || true + elif [[ -d "$src" ]]; then + rm -rf "$src" + git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$src" + else + git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$src" + fi + + cd "$src" + if [[ $HAS_MPI -eq 1 ]]; then + make MPI=1 MPI_HOME=/usr CUDA_HOME="$cuda_home" -j"$JOBS" \ + 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; } + else + make CUDA_HOME="$cuda_home" -j"$JOBS" \ + 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; } + fi + ) + + if [[ -x "$src/build/all_reduce_perf" ]]; then + ok "nccl-tests: ็ผ–่ฏ‘ๆˆๅŠŸ" + else + warn "nccl-tests: ็ผ–่ฏ‘ๅคฑ่ดฅ๏ผˆ้ž่‡ดๅ‘ฝ๏ผ‰" + fi +} + +build_gpu_burn() { + local src="$INSTALL_DIR/gpu-burn" + + if [[ -x "$src/gpu_burn" ]] && [[ $FLAG_REBUILD -eq 0 ]]; then + ok "gpu-burn: ๅทฒ็ผ–่ฏ‘ ($src/gpu_burn)" + return 0 + fi local cuda_home="${CUDA_HOME:-/usr/local/cuda}" if [[ ! -d "$cuda_home" ]]; then - warn "CUDA_HOME not found at $cuda_home. Set CUDA_HOME env var." - return + warn "gpu-burn: ่ทณ่ฟ‡๏ผˆCUDA_HOME=$cuda_home ไธๅญ˜ๅœจ๏ผ‰" + return 0 fi - make MPI=1 MPI_HOME=/usr -j"$JOBS" 2>/dev/null || \ - make CUDA_HOME="$cuda_home" -j"$JOBS" 2>/dev/null || \ - warn "nccl-tests build failed. Try: cd $src && make MPI=1" - - if [[ -x "$src/build/all_reduce_perf" ]]; then - ok "nccl-tests installed at $src/build/" - else - warn "nccl-tests build incomplete" - fi -} - -install_gpu_burn() { - log "Installing gpu-burn..." - local src="$INSTALL_DIR/gpu-burn" - if [[ -x "$src/gpu_burn" ]]; then - ok "gpu-burn already installed at $src/gpu_burn" - return - fi - - mkdir -p "$INSTALL_DIR" - git clone --depth 1 https://github.com/wilicc/gpu-burn.git "$src" 2>/dev/null - cd "$src" - make -j"$JOBS" 2>/dev/null || warn "gpu-burn build failed" - if [[ -x "$src/gpu_burn" ]]; then - ok "gpu-burn installed at $src/gpu_burn" - else - warn "gpu-burn build incomplete" - fi -} - -check_dcgm() { - log "Checking DCGM..." - if command -v nv-hostengine &>/dev/null || command -v dcgmi &>/dev/null; then - ok "DCGM already installed" - return - fi - if dpkg -l datacenter-gpu-manager &>/dev/null 2>&1; then - ok "DCGM package installed" - return - fi - warn "DCGM not found. Install from: https://docs.nvidia.com/datacenter/dcgm/latest/installation-guide.html" - warn " Ubuntu: sudo apt install datacenter-gpu-manager" - warn " Or: curl -fsSL https://deb.nvidia.com/datacenter-gpu-manager/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-dcgm.gpg" -} - -check_rdma_tools() { - log "Checking RDMA tools..." - local found=0 - for tool in ib_write_bw ib_read_bw ib_write_lat ib_read_lat ibstat ibv_devinfo; do - if command -v "$tool" &>/dev/null; then - found=$((found + 1)) - else - warn " $tool not found (install: perftest infiniband-diags)" + log "็ผ–่ฏ‘ gpu-burn..." + ( + set -e + if [[ $FLAG_REBUILD -eq 1 ]] && [[ -d "$src" ]]; then + rm -rf "$src" fi - done - if [[ $found -gt 0 ]]; then - ok "$found/$RDMA_TOOL_COUNT RDMA tools found" 2>/dev/null || ok "Some RDMA tools found" + if [[ -d "$src/.git" ]]; then + cd "$src" && git pull --ff-only 2>/dev/null || true + elif [[ -d "$src" ]]; then + rm -rf "$src" + git clone --depth 1 https://github.com/wilicc/gpu-burn.git "$src" + else + git clone --depth 1 https://github.com/wilicc/gpu-burn.git "$src" + fi + + cd "$src" + make CUDA_PATH="$cuda_home" -j"$JOBS" \ + 2>&1 | { [[ $VERBOSE -eq 1 ]] && cat || tail -3; } + ) + + if [[ -x "$src/gpu_burn" ]]; then + ok "gpu-burn: ็ผ–่ฏ‘ๆˆๅŠŸ" + else + warn "gpu-burn: ็ผ–่ฏ‘ๅคฑ่ดฅ๏ผˆ้ž่‡ดๅ‘ฝ๏ผ‰" fi } -print_summary() { - echo "" - echo "==========================================" - echo " GPU Test Suite - Installation Summary" - echo "==========================================" - echo "" - echo " Install directory: $INSTALL_DIR" - echo "" - echo " Tools status:" +build_native_tools() { + banner "้˜ถๆฎต 4/6: ็ผ–่ฏ‘ๅŽŸ็”Ÿๅทฅๅ…ท" + build_nvbandwidth + build_nccl_tests + build_gpu_burn +} - for tool_path in \ +# โ”€โ”€โ”€ ้˜ถๆฎต 5: ็”Ÿๆˆๆฟ€ๆดป่„šๆœฌ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +generate_env_sh() { + banner "้˜ถๆฎต 5/6: ็”Ÿๆˆ็Žฏๅขƒ่„šๆœฌ" + + local env_file="$INSTALL_DIR/env.sh" + cat > "$env_file" << 'ENVEOF' +#!/usr/bin/env bash +# GPU Test Suite ็Žฏๅขƒๆฟ€ๆดป่„šๆœฌ +# ็”จๆณ•: source /opt/gpu-test-tools/env.sh + +export GPU_TOOLS_DIR="__INSTALL_DIR__" +export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}" + +# ๆฟ€ๆดป Python ่™šๆ‹Ÿ็Žฏๅขƒ +if [[ -f "$GPU_TOOLS_DIR/.venv/bin/activate" ]]; then + source "$GPU_TOOLS_DIR/.venv/bin/activate" +fi + +# ็ผ–่ฏ‘ๅทฅๅ…ทๅŠ ๅ…ฅ PATH +export PATH="$GPU_TOOLS_DIR/nvbandwidth:$PATH" +export PATH="$GPU_TOOLS_DIR/nccl-tests/build:$PATH" +export PATH="$GPU_TOOLS_DIR/gpu-burn:$PATH" +export PATH="$CUDA_HOME/bin:$PATH" + +# ๅบ“่ทฏๅพ„ +export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}" +ENVEOF + + # ๆ›ฟๆขๅ ไฝ็ฌฆ + sed -i "s|__INSTALL_DIR__|$INSTALL_DIR|g" "$env_file" + chmod +x "$env_file" + ok "env.sh ๅทฒ็”Ÿๆˆ: $env_file" + + # ็”Ÿๆˆ run-gpu-tests ่ฟ่กŒๅ™จ + local wrapper="$INSTALL_DIR/run-gpu-tests" + cat > "$wrapper" << WRAPEOF +#!/usr/bin/env bash +# GPU Test Suite ไธ€้”ฎ่ฟ่กŒๅ™จ +# ็”จๆณ•: /opt/gpu-test-tools/run-gpu-tests --test all + +SCRIPT_DIR="\$(cd "\$(dirname "\${BASH_SOURCE[0]}")" && pwd)" +source "\$SCRIPT_DIR/env.sh" +exec python3 "$PROJECT_DIR/gpu_tester.py" "\$@" +WRAPEOF + chmod +x "$wrapper" + ok "run-gpu-tests ๅทฒ็”Ÿๆˆ: $wrapper" +} + +# โ”€โ”€โ”€ ้˜ถๆฎต 6: ๆ‰“ๅฐๆ€ป็ป“ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +print_summary() { + banner "้˜ถๆฎต 6/6: ๅฎ‰่ฃ…ๆ€ป็ป“" + + echo -e "${BOLD}ๅฎ‰่ฃ…็›ฎๅฝ•:${NC} $INSTALL_DIR" + echo "" + echo -e "${BOLD}็ป„ไปถ็Šถๆ€:${NC}" + + # Python ่™šๆ‹Ÿ็Žฏๅขƒ + if [[ -x "$INSTALL_DIR/.venv/bin/python" ]]; then + local py_ver + py_ver=$("$INSTALL_DIR/.venv/bin/python" --version 2>&1) + echo -e " ${GREEN}โœ“${NC} Python venv: $py_ver" + else + echo -e " ${RED}โœ—${NC} Python venv: ๆœชๅˆ›ๅปบ" + fi + + # PyTorch + if "$INSTALL_DIR/.venv/bin/python" -c "import torch" &>/dev/null 2>&1; then + local tv + tv=$("$INSTALL_DIR/.venv/bin/python" -c "import torch; print(f'{torch.__version__} (CUDA {torch.version.cuda})')" 2>/dev/null) + echo -e " ${GREEN}โœ“${NC} PyTorch: $tv" + else + echo -e " ${YELLOW}โ—‹${NC} PyTorch: ๆœชๅฎ‰่ฃ…" + fi + + # ็ผ–่ฏ‘ๅทฅๅ…ท + for tool_info in \ "$INSTALL_DIR/nvbandwidth/nvbandwidth:nvbandwidth" \ "$INSTALL_DIR/nccl-tests/build/all_reduce_perf:nccl-tests" \ "$INSTALL_DIR/gpu-burn/gpu_burn:gpu-burn"; do - path="${tool_path%%:*}" - name="${tool_path##*:}" + local path="${tool_info%%:*}" + local name="${tool_info##*:}" if [[ -x "$path" ]]; then - echo -e " [${GREEN}โœ“${NC}] $name" + echo -e " ${GREEN}โœ“${NC} $name" else - echo -e " [${YELLOW}?${NC}] $name (not built)" + echo -e " ${YELLOW}โ—‹${NC} $name (ๆœช็ผ–่ฏ‘)" fi done - echo "" - echo " System tools:" - for cmd in nvidia-smi mpirun nvbandwidth ib_write_bw dcgmi; do - if command -v "$cmd" &>/dev/null; then - echo -e " [${GREEN}โœ“${NC}] $cmd" - else - echo -e " [${YELLOW}-${NC}] $cmd (not found)" + # RDMA ๅทฅๅ…ท๏ผˆ็ณป็ปŸ็บง๏ผ‰ + local rdma_found=0 + for tool in ib_write_bw ib_read_bw ibstat; do + if command -v "$tool" &>/dev/null; then + ((rdma_found++)) fi done + if [[ $rdma_found -gt 0 ]]; then + echo -e " ${GREEN}โœ“${NC} RDMA ๅทฅๅ…ท: $rdma_found/3 ๅฏ็”จ" + else + echo -e " ${YELLOW}โ—‹${NC} RDMA ๅทฅๅ…ท: ๆœชๅฎ‰่ฃ… (apt install perftest infiniband-diags)" + fi echo "" - echo " Usage:" - echo " python3 gpu_tester.py # Interactive menu" - echo " python3 gpu_tester.py --test all # Full suite" + echo -e "${BOLD}ไฝฟ็”จๆ–นๆณ•:${NC}" + echo "" + echo " # ๆ–นๅผไธ€: source ๆฟ€ๆดปๅŽไฝฟ็”จ" + echo " source $INSTALL_DIR/env.sh" + echo " python3 $PROJECT_DIR/gpu_tester.py --test all" + echo "" + echo " # ๆ–นๅผไบŒ: ไธ€้”ฎ่ฟ่กŒ" + echo " $INSTALL_DIR/run-gpu-tests --test all" + echo " $INSTALL_DIR/run-gpu-tests --test health" + echo " $INSTALL_DIR/run-gpu-tests # ไบคไบ’ๅผ่œๅ•" echo "" } +# โ”€โ”€โ”€ ไธปๅ‡ฝๆ•ฐ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ main() { + parse_args "$@" + echo "" - echo "==========================================" - echo " GPU Test Suite - Dependency Installer" - echo "==========================================" + echo -e "${BOLD}${CYAN}โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—${NC}" + echo -e "${BOLD}${CYAN}โ•‘ GPU Test Suite โ€” ไธ€้”ฎๅฎ‰่ฃ… โ•‘${NC}" + echo -e "${BOLD}${CYAN}โ•‘ ็Žฏๅขƒ้š”็ฆป ยท ่‡ชๅŠจๆฃ€ๆต‹ ยท ๅฎŒๆ•ด้ƒจ็ฝฒ โ•‘${NC}" + echo -e "${BOLD}${CYAN}โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•${NC}" + echo "" + log "ๅฎ‰่ฃ…็›ฎๅฝ•: $INSTALL_DIR" + log "้กน็›ฎ็›ฎๅฝ•: $PROJECT_DIR" echo "" - check_root - detect_gpu - - mkdir -p "$INSTALL_DIR" - - install_system_deps - install_python_deps - install_nvbandwidth - install_nccl_tests - install_gpu_burn - check_dcgm - check_rdma_tools - + check_permissions + validate_environment + ensure_uv + setup_python_venv + build_native_tools + generate_env_sh print_summary + + echo -e "${GREEN}${BOLD}ๅฎ‰่ฃ…ๅฎŒๆˆ๏ผ${NC}" } main "$@" diff --git a/requirements.txt b/requirements.txt index 284cb79..4be4c43 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ rich>=13.0 pyyaml>=6.0 +numpy>=1.24