add: dependency installation script (nvbandwidth, nccl-tests, gpu-burn)
Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
parent
418dc70efb
commit
65cf7feee5
234
install_deps.sh
Executable file
234
install_deps.sh
Executable file
@ -0,0 +1,234 @@
|
||||
#!/usr/bin/env bash
#
# Dependency installer: system packages, Python packages, and source builds of
# nvbandwidth, nccl-tests and gpu-burn.
#
# Environment:
#   H200_TOOLS_DIR  Root directory for the source builds
#                   (default: /opt/h200-test-tools).
#   MAKE_JOBS       Parallel job count handed to make (default: nproc output).
#   VERBOSE         Verbosity flag, defaults to 0.
#                   NOTE(review): no visible code reads it yet.
set -euo pipefail

INSTALL_DIR="${H200_TOOLS_DIR:-/opt/h200-test-tools}"
# A failing command substitution in an assignment trips `set -e`, so fall back
# to a single job instead of aborting when nproc is missing or fails.
JOBS="${MAKE_JOBS:-$(nproc 2>/dev/null || echo 1)}"
VERBOSE="${VERBOSE:-0}"

# Colour codes are stored as literal backslash sequences; the logging helpers
# expand them at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
NC='\033[0m'
|
||||
|
||||
# Print an informational line on stdout, tagged with a cyan [INFO] marker.
# All arguments are joined into one message; backslash escapes are expanded.
log() {
  printf '%b\n' "${CYAN}[INFO]${NC} $*"
}
|
||||
# Print a success line on stdout, tagged with a green [ OK ] marker.
# All arguments are joined into one message; backslash escapes are expanded.
ok() {
  printf '%b\n' "${GREEN}[ OK ]${NC} $*"
}
|
||||
# Print a warning line on stdout, tagged with a yellow [WARN] marker.
# All arguments are joined into one message; backslash escapes are expanded.
warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $*"
}
|
||||
# Print a failure line on stdout, tagged with a red [FAIL] marker.
# Only prints; it does not exit -- callers decide whether to abort.
fail() {
  printf '%b\n' "${RED}[FAIL]${NC} $*"
}
|
||||
|
||||
# Warn (without aborting) when the script is not running as root.
# Outputs: two warning lines via warn when EUID is non-zero, nothing otherwise.
# Returns: always 0.
check_root() {
  if [[ $EUID -eq 0 ]]; then
    return 0
  fi
  warn "Not running as root. Some installations may fail."
  warn "Re-run with: sudo $0"
}
|
||||
|
||||
# Verify that nvidia-smi is usable and log the name of the first GPU it reports.
# Outputs: one log line with the GPU name; a fail line if nvidia-smi is missing
#          or its query fails; a warn line if it reports no GPU.
# Exits:   1 when nvidia-smi is absent or the query returns non-zero.
detect_gpu() {
  if ! command -v nvidia-smi &>/dev/null; then
    fail "nvidia-smi not found. Install NVIDIA drivers first."
    exit 1
  fi

  # Test the query explicitly: with `set -e` and `pipefail` an unchecked
  # failure here would terminate the script without any message.
  local gpu_list
  if ! gpu_list=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null); then
    fail "nvidia-smi could not query the GPU. Check the NVIDIA driver installation."
    exit 1
  fi

  # Keep only the first line (one line is printed per GPU).
  local gpu_name="${gpu_list%%$'\n'*}"
  if [[ -z "$gpu_name" ]]; then
    warn "nvidia-smi reported no GPUs"
    return 0
  fi
  log "Detected GPU: $gpu_name"
}
|
||||
|
||||
# Install build tools, MPI, RDMA utilities and Python via the first available
# package manager (apt-get, then yum, then dnf).
# Best-effort: failures are reported with warn and never abort the script.
# Outputs: "System dependencies" via ok only when the install command succeeded.
# Returns: always 0.
install_system_deps() {
  log "Installing system dependencies..."

  # Pick the first package manager present, in the original priority order.
  local pm="" candidate
  for candidate in apt-get yum dnf; do
    if command -v "$candidate" &>/dev/null; then
      pm="$candidate"
      break
    fi
  done

  local installed=0
  case "$pm" in
    apt-get)
      # Guarded so a failing refresh (e.g. not root, no network) does not trip
      # `set -e`; the install step below reports its own result.
      apt-get update -qq \
        || warn "apt-get update failed; package lists may be stale"
      if apt-get install -y -qq build-essential git cmake wget curl \
        openmpi-bin libopenmpi-dev openssh-client \
        infiniband-diags ibverbs-utils perftest \
        python3 python3-pip python3-venv \
        2>/dev/null; then
        installed=1
      else
        warn "Some apt packages failed (may already be installed)"
      fi
      ;;
    yum | dnf)
      # Both tools are invoked with the same sub-commands and package names.
      "$pm" groupinstall -y "Development Tools" 2>/dev/null || true
      if "$pm" install -y git cmake wget curl \
        openmpi openmpi-devel openssh-clients \
        infiniband-diags libibverbs-utils perftest \
        python3 python3-pip \
        2>/dev/null; then
        installed=1
      else
        warn "Some $pm packages failed"
      fi
      ;;
    *)
      warn "Unsupported package manager. Install deps manually."
      ;;
  esac

  if [[ $installed -eq 1 ]]; then
    ok "System dependencies"
  fi
}
|
||||
|
||||
# Install the Python packages rich and pyyaml, trying pip3 first and pip second.
# Best-effort: if both attempts fail a warning is printed and the script
# continues, instead of `set -e` aborting the remaining installation steps.
# Returns: always 0.
install_python_deps() {
  log "Installing Python dependencies..."
  if pip3 install --quiet rich pyyaml 2>/dev/null \
    || pip install --quiet rich pyyaml; then
    ok "Python dependencies (rich, pyyaml)"
  else
    warn "Python dependencies (rich, pyyaml) could not be installed; install them manually"
  fi
}
|
||||
|
||||
# Clone and build nvbandwidth under $INSTALL_DIR/nvbandwidth, then copy the
# binary to $INSTALL_DIR/nvbandwidth/nvbandwidth.
# Globals: INSTALL_DIR (read), JOBS (read).
# Best-effort: clone or build failures produce a warning and return 0; the
# caller's working directory is never changed.
install_nvbandwidth() {
  log "Installing nvbandwidth..."
  local src="$INSTALL_DIR/nvbandwidth"
  if [[ -x "$src/nvbandwidth" ]]; then
    ok "nvbandwidth already installed at $src/nvbandwidth"
    return
  fi

  mkdir -p "$INSTALL_DIR"

  # Reuse a checkout left by an earlier failed build; cloning into an existing
  # non-empty directory would fail.
  if [[ ! -d "$src" ]]; then
    if ! git clone --depth 1 https://github.com/NVIDIA/nvbandwidth.git "$src" 2>/dev/null; then
      warn "nvbandwidth clone failed. Check network access and permissions on $INSTALL_DIR"
      return 0
    fi
  fi

  # Subshell keeps the cd local; `|| true` stops a failed cmake/make from
  # tripping `set -e` so the result check below can report it.
  (
    cd "$src" \
      && mkdir -p build \
      && cd build \
      && cmake .. -DCMAKE_BUILD_TYPE=Release 2>/dev/null \
      && make -j"$JOBS" 2>/dev/null
  ) || true

  if [[ -x "$src/build/nvbandwidth" ]]; then
    cp "$src/build/nvbandwidth" "$src/nvbandwidth"
    ok "nvbandwidth installed at $src/nvbandwidth"
  else
    warn "nvbandwidth build failed. Try building manually in $src"
  fi
}
|
||||
|
||||
# Clone and build nccl-tests under $INSTALL_DIR/nccl-tests.
# Globals: INSTALL_DIR (read), JOBS (read), CUDA_HOME (read, default
#          /usr/local/cuda), MPI_HOME (read, default /usr).
# An MPI-enabled build is tried first, then a plain build as fallback.
# Best-effort: every failure produces a warning and returns 0; the caller's
# working directory is never changed.
install_nccl_tests() {
  log "Installing nccl-tests..."
  local src="$INSTALL_DIR/nccl-tests"
  if [[ -x "$src/build/all_reduce_perf" ]]; then
    ok "nccl-tests already installed at $src/build/"
    return
  fi

  # Validate the toolkit location before doing any network or disk work.
  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  if [[ ! -d "$cuda_home" ]]; then
    warn "CUDA_HOME not found at $cuda_home. Set CUDA_HOME env var."
    return
  fi
  local mpi_home="${MPI_HOME:-/usr}"

  mkdir -p "$INSTALL_DIR"

  # Reuse a checkout left by an earlier failed build; cloning into an existing
  # non-empty directory would fail.
  if [[ ! -d "$src" ]]; then
    if ! git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$src" 2>/dev/null; then
      warn "nccl-tests clone failed. Check network access and permissions on $INSTALL_DIR"
      return 0
    fi
  fi

  # Subshell keeps the cd local. CUDA_HOME is passed to both attempts so the
  # validated path is used even when the variable is not exported.
  (
    cd "$src" || exit 1
    make MPI=1 MPI_HOME="$mpi_home" CUDA_HOME="$cuda_home" -j"$JOBS" 2>/dev/null \
      || make CUDA_HOME="$cuda_home" -j"$JOBS" 2>/dev/null
  ) || warn "nccl-tests build failed. Try: cd $src && make MPI=1"

  if [[ -x "$src/build/all_reduce_perf" ]]; then
    ok "nccl-tests installed at $src/build/"
  else
    warn "nccl-tests build incomplete"
  fi
}
|
||||
|
||||
# Clone and build gpu-burn under $INSTALL_DIR/gpu-burn.
# Globals: INSTALL_DIR (read), JOBS (read).
# Best-effort: clone or build failures produce a warning and return 0; the
# caller's working directory is never changed.
install_gpu_burn() {
  log "Installing gpu-burn..."
  local src="$INSTALL_DIR/gpu-burn"
  if [[ -x "$src/gpu_burn" ]]; then
    ok "gpu-burn already installed at $src/gpu_burn"
    return
  fi

  mkdir -p "$INSTALL_DIR"

  # Reuse a checkout left by an earlier failed build; cloning into an existing
  # non-empty directory would fail.
  if [[ ! -d "$src" ]]; then
    if ! git clone --depth 1 https://github.com/wilicc/gpu-burn.git "$src" 2>/dev/null; then
      warn "gpu-burn clone failed. Check network access and permissions on $INSTALL_DIR"
      return 0
    fi
  fi

  # Subshell keeps the cd local to the build step.
  (
    cd "$src" && make -j"$JOBS" 2>/dev/null
  ) || warn "gpu-burn build failed"

  if [[ -x "$src/gpu_burn" ]]; then
    ok "gpu-burn installed at $src/gpu_burn"
  else
    warn "gpu-burn build incomplete"
  fi
}
|
||||
|
||||
# Report whether NVIDIA DCGM is present; never installs anything.
# Detection order: nv-hostengine or dcgmi on PATH, then the
# datacenter-gpu-manager package in the dpkg database, then in the rpm database.
# Outputs: one ok line when found, otherwise three warn lines with install hints.
# Returns: always 0.
check_dcgm() {
  log "Checking DCGM..."
  if command -v nv-hostengine &>/dev/null || command -v dcgmi &>/dev/null; then
    ok "DCGM already installed"
    return
  fi

  # Inspect the status field instead of relying on `dpkg -l`, which also
  # succeeds for packages that were removed but not purged.
  if command -v dpkg-query &>/dev/null; then
    local status
    status=$(dpkg-query -W -f='${Status}' datacenter-gpu-manager 2>/dev/null) || status=""
    if [[ "$status" == "install ok installed" ]]; then
      ok "DCGM package installed"
      return
    fi
  fi

  if command -v rpm &>/dev/null && rpm -q datacenter-gpu-manager &>/dev/null; then
    ok "DCGM package installed"
    return
  fi

  warn "DCGM not found. Install from: https://docs.nvidia.com/datacenter/dcgm/latest/installation-guide.html"
  warn " Ubuntu: sudo apt install datacenter-gpu-manager"
  warn " Or: curl -fsSL https://deb.nvidia.com/datacenter-gpu-manager/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-dcgm.gpg"
}
|
||||
|
||||
# Report which RDMA/InfiniBand command-line tools are available on PATH.
# Outputs: one warn line per missing tool, and one "<found>/<total>" ok line
#          when at least one tool is present.
# Returns: always 0.
check_rdma_tools() {
  log "Checking RDMA tools..."
  local -a tools=(ib_write_bw ib_read_bw ib_write_lat ib_read_lat ibstat ibv_devinfo)
  local tool
  local found=0

  for tool in "${tools[@]}"; do
    if command -v "$tool" &>/dev/null; then
      found=$((found + 1))
    else
      warn " $tool not found (install: perftest infiniband-diags)"
    fi
  done

  # The total comes from the array itself: referencing an undefined counter
  # variable under `set -u` would terminate the script.
  if [[ $found -gt 0 ]]; then
    ok "$found/${#tools[@]} RDMA tools found"
  fi
}
|
||||
|
||||
# Print the final installation report on stdout: the install directory, the
# build status of each source-built tool, the availability of selected system
# commands, and usage hints.
# Globals: INSTALL_DIR, GREEN, YELLOW, NC (all read-only here).
# Returns: always 0.
print_summary() {
  # Loop variables are local so the report does not leak or clobber globals.
  local entry path name cmd

  echo ""
  echo "=========================================="
  echo " H200 Test Suite - Installation Summary"
  echo "=========================================="
  echo ""
  echo " Install directory: $INSTALL_DIR"
  echo ""
  echo " Tools status:"

  # Each entry is "<expected binary path>:<display name>".
  for entry in \
    "$INSTALL_DIR/nvbandwidth/nvbandwidth:nvbandwidth" \
    "$INSTALL_DIR/nccl-tests/build/all_reduce_perf:nccl-tests" \
    "$INSTALL_DIR/gpu-burn/gpu_burn:gpu-burn"; do
    path="${entry%%:*}"
    name="${entry##*:}"
    if [[ -x "$path" ]]; then
      echo -e " [${GREEN}✓${NC}] $name"
    else
      echo -e " [${YELLOW}?${NC}] $name (not built)"
    fi
  done

  echo ""
  echo " System tools:"
  for cmd in nvidia-smi mpirun nvbandwidth ib_write_bw dcgmi; do
    if command -v "$cmd" &>/dev/null; then
      echo -e " [${GREEN}✓${NC}] $cmd"
    else
      echo -e " [${YELLOW}-${NC}] $cmd (not found)"
    fi
  done

  echo ""
  echo " Usage:"
  echo " python3 h200_tester.py # Interactive menu"
  echo " python3 h200_tester.py --test all # Full suite"
  echo ""
}
|
||||
|
||||
# Run the whole installation sequence: banner, environment checks, package and
# source installs, tool checks, then the summary.
# Globals: INSTALL_DIR (read).
# Exits:   1 if the install directory cannot be created.
main() {
  echo ""
  echo "=========================================="
  echo " H200 Test Suite - Dependency Installer"
  echo "=========================================="
  echo ""

  check_root
  detect_gpu

  # Say why we stop instead of leaving only mkdir's own error before `set -e`
  # terminates the script.
  if ! mkdir -p "$INSTALL_DIR"; then
    fail "Cannot create install directory: $INSTALL_DIR"
    exit 1
  fi

  install_system_deps
  install_python_deps
  install_nvbandwidth
  install_nccl_tests
  install_gpu_burn
  check_dcgm
  check_rdma_tools

  print_summary
}
|
||||
|
||||
# Script entry point: run main, forwarding every command-line argument unchanged.
main "$@"
|
||||
Loading…
x
Reference in New Issue
Block a user