test_gpu_scripts/install_deps.sh
qinyusen 65cf7feee5 add: dependency installation script (nvbandwidth, nccl-tests, gpu-burn)
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-04-25 17:23:32 +08:00

235 lines
7.1 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Installs build/runtime dependencies and GPU test tools (nvbandwidth,
# nccl-tests, gpu-burn) used by the H200 test suite.
#
# Environment overrides:
#   H200_TOOLS_DIR  directory the tools are cloned/built into
#   MAKE_JOBS       parallel job count handed to make (defaults to nproc)
#   VERBOSE         stored below; NOTE(review): not referenced anywhere in the
#                   visible script - confirm whether it is still needed
set -o errexit -o nounset -o pipefail

# ANSI colour codes. They are stored as literal backslash sequences and only
# become escape characters when printed with `echo -e` / `printf %b`.
NC="\033[0m"
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
CYAN="\033[0;36m"

# User-tunable settings, each falling back to a default when unset or empty.
VERBOSE=${VERBOSE:-0}
JOBS=${MAKE_JOBS:-$(nproc)}
INSTALL_DIR=${H200_TOOLS_DIR:-/opt/h200-test-tools}
# Logging helpers. Each prints one line to stdout: a coloured severity tag
# followed by all arguments joined with spaces. Backslash escapes in the
# message are expanded (%b), matching `echo -e`.
log() {
  printf '%b\n' "${CYAN}[INFO]${NC} $*"
}

ok() {
  printf '%b\n' "${GREEN}[ OK ]${NC} $*"
}

warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $*"
}

fail() {
  printf '%b\n' "${RED}[FAIL]${NC} $*"
}
#######################################
# Warn (without aborting) when the script is not running as root.
# Globals:  EUID (read)
# Outputs:  two warnings via warn() when the effective UID is non-zero
# Returns:  0
#######################################
check_root() {
  # Effective UID 0 is root: nothing to report.
  if ((EUID == 0)); then
    return 0
  fi

  warn "Not running as root. Some installations may fail."
  warn "Re-run with: sudo $0"
}
#######################################
# Verify that nvidia-smi is usable and log the name of the first GPU.
# Outputs:  an info line with the detected GPU name, or a failure message
# Returns:  0 on success; exits the script with status 1 when nvidia-smi is
#           missing or the GPU query fails
#######################################
detect_gpu() {
  if ! command -v nvidia-smi &>/dev/null; then
    fail "nvidia-smi not found. Install NVIDIA drivers first."
    exit 1
  fi

  # Capture the full output instead of piping into `head`: with `pipefail`
  # a pipe can fail spuriously (SIGPIPE on multi-GPU hosts), and a genuine
  # query failure would otherwise abort the script without any message.
  local gpu_names gpu_name
  if ! gpu_names=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null); then
    fail "nvidia-smi could not query the GPU. Check that the NVIDIA driver is loaded."
    exit 1
  fi

  # Keep only the first line (first GPU reported).
  gpu_name=${gpu_names%%$'\n'*}
  log "Detected GPU: $gpu_name"
}
#######################################
# Install compiler toolchain, MPI, RDMA utilities and Python through the
# first available package manager (apt-get, then yum, then dnf).
# Best effort: failures are reported with warn() and never abort the script.
# Outputs:  progress, warnings, and an OK line only when installation succeeded
# Returns:  0
#######################################
install_system_deps() {
  log "Installing system dependencies..."
  local failed=0
  local pm=""
  local candidate

  if command -v apt-get &>/dev/null; then
    # A failed index refresh (no root, no network) must not kill the script
    # under `set -e`; the install below may still succeed from cached lists.
    apt-get update -qq || warn "apt-get update failed; package lists may be stale"
    if ! apt-get install -y -qq build-essential git cmake wget curl \
      openmpi-bin libopenmpi-dev openssh-client \
      infiniband-diags ibverbs-utils perftest \
      python3 python3-pip python3-venv \
      2>/dev/null; then
      warn "Some apt packages failed (may already be installed)"
      failed=1
    fi
  else
    # yum and dnf take the same arguments here, so share one code path.
    for candidate in yum dnf; do
      if command -v "$candidate" &>/dev/null; then
        pm=$candidate
        break
      fi
    done

    if [[ -n "$pm" ]]; then
      "$pm" groupinstall -y "Development Tools" 2>/dev/null || true
      if ! "$pm" install -y git cmake wget curl \
        openmpi openmpi-devel openssh-clients \
        infiniband-diags libibverbs-utils perftest \
        python3 python3-pip \
        2>/dev/null; then
        warn "Some $pm packages failed"
        failed=1
      fi
    else
      warn "Unsupported package manager. Install deps manually."
      failed=1
    fi
  fi

  # Only claim success when nothing above reported a failure.
  if ((failed == 0)); then
    ok "System dependencies"
  fi
  return 0
}
#######################################
# Install the Python packages rich and pyyaml with pip3, falling back to pip.
# Best effort: when both attempts fail a warning is printed and the script
# continues instead of being aborted by `set -e`.
# Outputs:  progress plus either an OK line or a warning
# Returns:  0
#######################################
install_python_deps() {
  log "Installing Python dependencies..."
  if pip3 install --quiet rich pyyaml 2>/dev/null ||
    pip install --quiet rich pyyaml; then
    ok "Python dependencies (rich, pyyaml)"
  else
    warn "Python dependencies (rich, pyyaml) failed to install."
    warn "Try a virtualenv: python3 -m venv .venv && .venv/bin/pip install rich pyyaml"
  fi
  return 0
}
#######################################
# Clone and build NVIDIA nvbandwidth into $INSTALL_DIR/nvbandwidth and copy
# the resulting binary to $INSTALL_DIR/nvbandwidth/nvbandwidth.
# Best effort: clone/build failures are reported with warn() and do not abort
# the script. The caller's working directory is left unchanged.
# Globals:  INSTALL_DIR (read), JOBS (read)
# Returns:  0
#######################################
install_nvbandwidth() {
  log "Installing nvbandwidth..."
  local src="$INSTALL_DIR/nvbandwidth"
  if [[ -x "$src/nvbandwidth" ]]; then
    ok "nvbandwidth already installed at $src/nvbandwidth"
    return 0
  fi

  mkdir -p "$INSTALL_DIR"

  # Reuse a checkout left behind by an earlier failed build; cloning into an
  # existing non-empty directory would fail.
  if [[ ! -d "$src/.git" ]]; then
    if ! git clone --depth 1 https://github.com/NVIDIA/nvbandwidth.git "$src" 2>/dev/null; then
      warn "nvbandwidth clone failed. Check network access and that $src is empty or absent."
      return 0
    fi
  fi

  # Build in a subshell so `cd` does not leak into the caller. The trailing
  # `|| true` keeps `set -e` from aborting the script; the outcome is decided
  # by the binary check below.
  (
    mkdir -p "$src/build" &&
      cd "$src/build" &&
      cmake .. -DCMAKE_BUILD_TYPE=Release 2>/dev/null &&
      make -j"$JOBS" 2>/dev/null
  ) || true

  if [[ -x "$src/build/nvbandwidth" ]] &&
    cp "$src/build/nvbandwidth" "$src/nvbandwidth"; then
    ok "nvbandwidth installed at $src/nvbandwidth"
  else
    warn "nvbandwidth build failed. Try building manually in $src"
  fi
  return 0
}
#######################################
# Clone and build NVIDIA nccl-tests into $INSTALL_DIR/nccl-tests. An MPI
# build is attempted first, then a plain (non-MPI) build.
# Best effort: failures are reported with warn() and do not abort the script.
# The caller's working directory is left unchanged.
# Globals:  INSTALL_DIR (read), JOBS (read), CUDA_HOME (read, optional;
#           defaults to /usr/local/cuda)
# Returns:  0
#######################################
install_nccl_tests() {
  log "Installing nccl-tests..."
  local src="$INSTALL_DIR/nccl-tests"
  if [[ -x "$src/build/all_reduce_perf" ]]; then
    ok "nccl-tests already installed at $src/build/"
    return 0
  fi

  # Check for CUDA before cloning so a missing toolkit does not leave a
  # useless checkout behind.
  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  if [[ ! -d "$cuda_home" ]]; then
    warn "CUDA_HOME not found at $cuda_home. Set CUDA_HOME env var."
    return 0
  fi

  mkdir -p "$INSTALL_DIR"

  # Reuse a checkout left behind by an earlier failed build; cloning into an
  # existing non-empty directory would fail.
  if [[ ! -d "$src/.git" ]]; then
    if ! git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$src" 2>/dev/null; then
      warn "nccl-tests clone failed. Check network access and that $src is empty or absent."
      return 0
    fi
  fi

  # Build in a subshell so `cd` does not leak into the caller. Both attempts
  # receive the same CUDA_HOME that was validated above.
  (
    cd "$src" || exit 1
    make MPI=1 MPI_HOME=/usr CUDA_HOME="$cuda_home" -j"$JOBS" 2>/dev/null ||
      make CUDA_HOME="$cuda_home" -j"$JOBS" 2>/dev/null
  ) || warn "nccl-tests build failed. Try: cd $src && make MPI=1"

  if [[ -x "$src/build/all_reduce_perf" ]]; then
    ok "nccl-tests installed at $src/build/"
  else
    warn "nccl-tests build incomplete"
  fi
  return 0
}
#######################################
# Clone and build gpu-burn into $INSTALL_DIR/gpu-burn.
# Best effort: clone/build failures are reported with warn() and do not abort
# the script. The caller's working directory is left unchanged.
# Globals:  INSTALL_DIR (read), JOBS (read)
# Returns:  0
#######################################
install_gpu_burn() {
  log "Installing gpu-burn..."
  local src="$INSTALL_DIR/gpu-burn"
  if [[ -x "$src/gpu_burn" ]]; then
    ok "gpu-burn already installed at $src/gpu_burn"
    return 0
  fi

  mkdir -p "$INSTALL_DIR"

  # Reuse a checkout left behind by an earlier failed build; cloning into an
  # existing non-empty directory would fail.
  if [[ ! -d "$src/.git" ]]; then
    if ! git clone --depth 1 https://github.com/wilicc/gpu-burn.git "$src" 2>/dev/null; then
      warn "gpu-burn clone failed. Check network access and that $src is empty or absent."
      return 0
    fi
  fi

  # Build in a subshell so `cd` does not leak into the caller.
  (cd "$src" && make -j"$JOBS" 2>/dev/null) || warn "gpu-burn build failed"

  if [[ -x "$src/gpu_burn" ]]; then
    ok "gpu-burn installed at $src/gpu_burn"
  else
    warn "gpu-burn build incomplete"
  fi
  return 0
}
#######################################
# Report whether NVIDIA DCGM is present. Nothing is installed; when DCGM is
# missing, installation hints are printed.
# Detection order: nv-hostengine/dcgmi on PATH, then the
# datacenter-gpu-manager package via dpkg-query, then via rpm.
# Outputs:  an OK line when found, warnings with hints otherwise
# Returns:  0
#######################################
check_dcgm() {
  log "Checking DCGM..."
  if command -v nv-hostengine &>/dev/null || command -v dcgmi &>/dev/null; then
    ok "DCGM already installed"
    return 0
  fi

  # `dpkg -l` also succeeds for packages that were removed but still have
  # config files, so inspect the exact install status instead.
  local status=""
  if command -v dpkg-query &>/dev/null; then
    status=$(dpkg-query -W -f='${Status}' datacenter-gpu-manager 2>/dev/null) || status=""
    if [[ "$status" == "install ok installed" ]]; then
      ok "DCGM package installed"
      return 0
    fi
  fi

  if command -v rpm &>/dev/null && rpm -q datacenter-gpu-manager &>/dev/null; then
    ok "DCGM package installed"
    return 0
  fi

  warn "DCGM not found. Install from: https://docs.nvidia.com/datacenter/dcgm/latest/installation-guide.html"
  warn " Ubuntu: sudo apt install datacenter-gpu-manager"
  warn " Or: curl -fsSL https://deb.nvidia.com/datacenter-gpu-manager/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-dcgm.gpg"
  return 0
}
#######################################
# Report which RDMA/InfiniBand command-line tools are available on PATH.
# Outputs:  one warning per missing tool and, when at least one tool exists,
#           an OK line of the form "<found>/<total> RDMA tools found"
# Returns:  0
#######################################
check_rdma_tools() {
  log "Checking RDMA tools..."
  local -a tools=(ib_write_bw ib_read_bw ib_write_lat ib_read_lat ibstat ibv_devinfo)
  local tool
  local found=0

  for tool in "${tools[@]}"; do
    if command -v "$tool" &>/dev/null; then
      found=$((found + 1))
    else
      warn " $tool not found (install: perftest infiniband-diags)"
    fi
  done

  # The total comes from the array itself; referencing an undefined counter
  # variable here would abort the whole script under `set -u`.
  if ((found > 0)); then
    ok "$found/${#tools[@]} RDMA tools found"
  fi
  return 0
}
#######################################
# Print the final installation report: which tools were built under
# $INSTALL_DIR, which related commands are on PATH, and usage hints.
# Globals:  INSTALL_DIR, GREEN, YELLOW, NC (all read)
# Outputs:  the report on stdout
# Returns:  0
#######################################
print_summary() {
  # Keep loop variables local so generic names such as `path` and `name`
  # do not leak into the rest of the script.
  local entry tool_file tool_name cmd

  echo ""
  echo "=========================================="
  echo " H200 Test Suite - Installation Summary"
  echo "=========================================="
  echo ""
  echo " Install directory: $INSTALL_DIR"
  echo ""
  echo " Tools status:"
  # Each entry is "<binary path>:<display name>".
  for entry in \
    "$INSTALL_DIR/nvbandwidth/nvbandwidth:nvbandwidth" \
    "$INSTALL_DIR/nccl-tests/build/all_reduce_perf:nccl-tests" \
    "$INSTALL_DIR/gpu-burn/gpu_burn:gpu-burn"; do
    # Split on the LAST colon so an INSTALL_DIR containing ':' stays intact.
    tool_file="${entry%:*}"
    tool_name="${entry##*:}"
    if [[ -x "$tool_file" ]]; then
      echo -e " [${GREEN}✓${NC}] $tool_name"
    else
      echo -e " [${YELLOW}?${NC}] $tool_name (not built)"
    fi
  done
  echo ""
  echo " System tools:"
  for cmd in nvidia-smi mpirun nvbandwidth ib_write_bw dcgmi; do
    if command -v "$cmd" &>/dev/null; then
      echo -e " [${GREEN}✓${NC}] $cmd"
    else
      echo -e " [${YELLOW}-${NC}] $cmd (not found)"
    fi
  done
  echo ""
  echo " Usage:"
  echo " python3 h200_tester.py # Interactive menu"
  echo " python3 h200_tester.py --test all # Full suite"
  echo ""
}
#######################################
# Script driver: print the banner, run the pre-flight checks, create the
# install directory, then execute every install/check step in order and
# finish with the summary.
# Globals:    INSTALL_DIR (read)
# Arguments:  accepted but not used
#######################################
main() {
  printf '%s\n' \
    "" \
    "==========================================" \
    " H200 Test Suite - Dependency Installer" \
    "==========================================" \
    ""

  # Pre-flight checks run before anything touches the filesystem.
  check_root
  detect_gpu

  mkdir -p "$INSTALL_DIR"

  local step
  for step in \
    install_system_deps \
    install_python_deps \
    install_nvbandwidth \
    install_nccl_tests \
    install_gpu_burn \
    check_dcgm \
    check_rdma_tools \
    print_summary; do
    "$step"
  done
}
# Script entry point: invoke main with all command-line arguments passed through.
main "$@"