#!/usr/bin/env bash
#
# install_deps.sh - install the dependencies of the H200 test suite.
#
# Installs distro and Python packages, builds nvbandwidth, nccl-tests and
# gpu-burn from source under $INSTALL_DIR, then reports whether DCGM and the
# RDMA command-line tools are available.
#
# Environment:
#   H200_TOOLS_DIR  Install prefix (default: /opt/h200-test-tools).
#   MAKE_JOBS       Parallel make jobs (default: output of nproc).
#   VERBOSE         Set to 1 to show stderr of package managers and builds.
#   CUDA_HOME       CUDA toolkit root used for nccl-tests
#                   (default: /usr/local/cuda).
set -euo pipefail

readonly INSTALL_DIR="${H200_TOOLS_DIR:-/opt/h200-test-tools}"
readonly JOBS="${MAKE_JOBS:-$(nproc 2>/dev/null || echo 1)}"
readonly VERBOSE="${VERBOSE:-0}"

readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m'

# Status-line helpers. Colour codes go through %b; the message goes through
# %s so backslashes in paths are printed literally.
log()  { printf '%b[INFO]%b %s\n' "$CYAN" "$NC" "$*"; }
ok()   { printf '%b[ OK ]%b %s\n' "$GREEN" "$NC" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*"; }
fail() { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*"; }

#######################################
# Run a command, discarding its stderr unless VERBOSE=1.
# Arguments: the command and its arguments.
# Returns:   the command's exit status.
#######################################
quiet() {
  if [[ "$VERBOSE" == "1" ]]; then
    "$@"
  else
    "$@" 2>/dev/null
  fi
}

#######################################
# Shallow-clone a repository unless a checkout is already present.
# Reusing an existing checkout lets a re-run recover from an earlier failed
# build; git refuses to clone into a non-empty directory.
# Arguments: $1 - repository URL, $2 - destination directory
# Returns:   0 if a checkout exists afterwards, non-zero otherwise.
#######################################
clone_repo() {
  local url=$1
  local dest=$2
  if [[ -d "$dest/.git" ]]; then
    return 0
  fi
  mkdir -p "${dest%/*}" || return 1
  quiet git clone --depth 1 "$url" "$dest"
}

# Warn (but do not stop) when the script is not running as root.
check_root() {
  if [[ $EUID -ne 0 ]]; then
    warn "Not running as root. Some installations may fail."
    warn "Re-run with: sudo $0"
  fi
}

# Exit with status 1 if nvidia-smi is missing; otherwise log the first GPU
# name it reports. A failing query is reported but is not fatal.
detect_gpu() {
  if ! command -v nvidia-smi &>/dev/null; then
    fail "nvidia-smi not found. Install NVIDIA drivers first."
    exit 1
  fi
  local gpu_name
  gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null \
    | head -n 1) || gpu_name=""
  if [[ -n "$gpu_name" ]]; then
    log "Detected GPU: $gpu_name"
  else
    warn "nvidia-smi is installed but the GPU query failed"
  fi
}

# Install build tools, MPI, RDMA utilities and Python via apt-get, yum or dnf.
# Package failures are reported as warnings; the script keeps going.
install_system_deps() {
  log "Installing system dependencies..."
  local installed=1
  if command -v apt-get &>/dev/null; then
    quiet apt-get update -qq \
      || warn "apt-get update failed; continuing with existing package lists"
    quiet apt-get install -y -qq build-essential git cmake wget curl \
      openmpi-bin libopenmpi-dev openssh-client \
      infiniband-diags ibverbs-utils perftest \
      python3 python3-pip python3-venv \
      || { warn "Some apt packages failed (may already be installed)"; installed=0; }
  elif command -v yum &>/dev/null; then
    # Best effort: the group may be unavailable or already installed.
    quiet yum groupinstall -y "Development Tools" || true
    quiet yum install -y git cmake wget curl \
      openmpi openmpi-devel openssh-clients \
      infiniband-diags libibverbs-utils perftest \
      python3 python3-pip \
      || { warn "Some yum packages failed"; installed=0; }
  elif command -v dnf &>/dev/null; then
    # Best effort: the group may be unavailable or already installed.
    quiet dnf groupinstall -y "Development Tools" || true
    quiet dnf install -y git cmake wget curl \
      openmpi openmpi-devel openssh-clients \
      infiniband-diags libibverbs-utils perftest \
      python3 python3-pip \
      || { warn "Some dnf packages failed"; installed=0; }
  else
    warn "Unsupported package manager. Install deps manually."
    installed=0
  fi
  if (( installed )); then
    ok "System dependencies"
  fi
}

# Install the Python packages used by the tester (rich, pyyaml), trying pip3
# first and falling back to pip.
install_python_deps() {
  log "Installing Python dependencies..."
  if quiet pip3 install --quiet rich pyyaml \
    || pip install --quiet rich pyyaml; then
    ok "Python dependencies (rich, pyyaml)"
  else
    warn "Python dependencies failed to install. Try: pip3 install rich pyyaml"
  fi
}

# Clone and build NVIDIA nvbandwidth; the binary ends up at
# $INSTALL_DIR/nvbandwidth/nvbandwidth.
install_nvbandwidth() {
  log "Installing nvbandwidth..."
  local src="$INSTALL_DIR/nvbandwidth"
  if [[ -x "$src/nvbandwidth" ]]; then
    ok "nvbandwidth already installed at $src/nvbandwidth"
    return
  fi

  if ! clone_repo https://github.com/NVIDIA/nvbandwidth.git "$src"; then
    warn "nvbandwidth clone failed. Check network access and $src"
    return
  fi

  # Build in a subshell so the cd does not leak into later steps. The result
  # is judged by the presence of the binary below, not by this exit status.
  (
    cd "$src" \
      && mkdir -p build \
      && cd build \
      && quiet cmake .. -DCMAKE_BUILD_TYPE=Release \
      && quiet make -j"$JOBS"
  ) || true

  if [[ -x "$src/build/nvbandwidth" ]] \
    && cp "$src/build/nvbandwidth" "$src/nvbandwidth"; then
    ok "nvbandwidth installed at $src/nvbandwidth"
  else
    warn "nvbandwidth build failed. Try building manually in $src"
  fi
}

# Clone and build NVIDIA nccl-tests, first with MPI support and then, if that
# fails, without it. Requires a CUDA toolkit at $CUDA_HOME.
install_nccl_tests() {
  log "Installing nccl-tests..."
  local src="$INSTALL_DIR/nccl-tests"
  if [[ -x "$src/build/all_reduce_perf" ]]; then
    ok "nccl-tests already installed at $src/build/"
    return
  fi

  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  if [[ ! -d "$cuda_home" ]]; then
    warn "CUDA_HOME not found at $cuda_home. Set CUDA_HOME env var."
    return
  fi

  if ! clone_repo https://github.com/NVIDIA/nccl-tests.git "$src"; then
    warn "nccl-tests clone failed. Check network access and $src"
    return
  fi

  # Subshell keeps the caller's working directory unchanged.
  (
    cd "$src" || exit 1
    quiet make MPI=1 MPI_HOME=/usr CUDA_HOME="$cuda_home" -j"$JOBS" \
      || quiet make CUDA_HOME="$cuda_home" -j"$JOBS"
  ) || warn "nccl-tests build failed. Try: cd $src && make MPI=1"

  if [[ -x "$src/build/all_reduce_perf" ]]; then
    ok "nccl-tests installed at $src/build/"
  else
    warn "nccl-tests build incomplete"
  fi
}

# Clone and build gpu-burn; the binary ends up at $INSTALL_DIR/gpu-burn/gpu_burn.
install_gpu_burn() {
  log "Installing gpu-burn..."
  local src="$INSTALL_DIR/gpu-burn"
  if [[ -x "$src/gpu_burn" ]]; then
    ok "gpu-burn already installed at $src/gpu_burn"
    return
  fi

  if ! clone_repo https://github.com/wilicc/gpu-burn.git "$src"; then
    warn "gpu-burn clone failed. Check network access and $src"
    return
  fi

  # Subshell keeps the caller's working directory unchanged.
  (cd "$src" && quiet make -j"$JOBS") || warn "gpu-burn build failed"

  if [[ -x "$src/gpu_burn" ]]; then
    ok "gpu-burn installed at $src/gpu_burn"
  else
    warn "gpu-burn build incomplete"
  fi
}

# Report whether DCGM is available (binaries on PATH or the Debian package
# installed); print install hints otherwise. Installs nothing.
check_dcgm() {
  log "Checking DCGM..."
  if command -v nv-hostengine &>/dev/null || command -v dcgmi &>/dev/null; then
    ok "DCGM already installed"
    return
  fi
  # Match the status string so a removed-but-not-purged package does not count.
  if command -v dpkg-query &>/dev/null \
    && dpkg-query -W -f='${Status}' datacenter-gpu-manager 2>/dev/null \
      | grep -q 'ok installed'; then
    ok "DCGM package installed"
    return
  fi
  warn "DCGM not found. Install from: https://docs.nvidia.com/datacenter/dcgm/latest/installation-guide.html"
  warn "  Ubuntu: sudo apt install datacenter-gpu-manager"
  warn "  Or: curl -fsSL https://deb.nvidia.com/datacenter-gpu-manager/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-dcgm.gpg"
}

# Report which RDMA/InfiniBand command-line tools are on PATH.
check_rdma_tools() {
  log "Checking RDMA tools..."
  local -a tools=(ib_write_bw ib_read_bw ib_write_lat ib_read_lat ibstat ibv_devinfo)
  local tool
  local found=0
  for tool in "${tools[@]}"; do
    if command -v "$tool" &>/dev/null; then
      found=$((found + 1))
    else
      warn "  $tool not found (install: perftest infiniband-diags)"
    fi
  done
  if (( found > 0 )); then
    ok "$found/${#tools[@]} RDMA tools found"
  fi
}

# Print the final report: which source-built tools exist under $INSTALL_DIR
# and which system tools are on PATH.
print_summary() {
  local entry path name cmd

  echo ""
  echo "=========================================="
  echo "  H200 Test Suite - Installation Summary"
  echo "=========================================="
  echo ""
  echo "  Install directory: $INSTALL_DIR"
  echo ""
  echo "  Tools status:"

  # Each entry is "<binary path>:<display name>".
  for entry in \
    "$INSTALL_DIR/nvbandwidth/nvbandwidth:nvbandwidth" \
    "$INSTALL_DIR/nccl-tests/build/all_reduce_perf:nccl-tests" \
    "$INSTALL_DIR/gpu-burn/gpu_burn:gpu-burn"; do
    path="${entry%%:*}"
    name="${entry##*:}"
    if [[ -x "$path" ]]; then
      echo -e "    [${GREEN}✓${NC}] $name"
    else
      echo -e "    [${YELLOW}?${NC}] $name (not built)"
    fi
  done

  echo ""
  echo "  System tools:"
  for cmd in nvidia-smi mpirun nvbandwidth ib_write_bw dcgmi; do
    if command -v "$cmd" &>/dev/null; then
      echo -e "    [${GREEN}✓${NC}] $cmd"
    else
      echo -e "    [${YELLOW}-${NC}] $cmd (not found)"
    fi
  done

  echo ""
  echo "  Usage:"
  echo "    python3 h200_tester.py           # Interactive menu"
  echo "    python3 h200_tester.py --test all # Full suite"
  echo ""
}

# Entry point: run every check and install step in order, then summarise.
main() {
  echo ""
  echo "=========================================="
  echo "  H200 Test Suite - Dependency Installer"
  echo "=========================================="
  echo ""

  check_root
  detect_gpu

  if ! mkdir -p "$INSTALL_DIR"; then
    fail "Cannot create $INSTALL_DIR (re-run as root or set H200_TOOLS_DIR)"
    exit 1
  fi

  install_system_deps
  install_python_deps
  install_nvbandwidth
  install_nccl_tests
  install_gpu_burn
  check_dcgm
  check_rdma_tools

  print_summary
}

main "$@"