- Expand GPU specs database to include A100/A800 with Ampere architecture parameters
- Rename h200_tester.py to gpu_tester.py for architecture-neutral branding
- Add driver/CUDA compatibility validation per GPU generation
- Enhance report module with HTML and Markdown output formats
- Improve nvbandwidth binary discovery (system paths, DCGM locations)
- Add pyproject.toml with uv for dependency management
- Update install_deps.sh, configs, and README for multi-architecture support

🤖 Generated with [Qoder](https://qoder.com)
235 lines
7.1 KiB
Bash
Executable File
235 lines
7.1 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# GPU test suite dependency installer.
#
# Installs system and Python packages, then clones and builds nvbandwidth,
# nccl-tests and gpu-burn under INSTALL_DIR.
#
# Environment:
#   GPU_TOOLS_DIR  Install root (default: /opt/gpu-test-tools).
#   MAKE_JOBS      Parallel make jobs (default: number of CPUs, or 1).
#   VERBOSE        Verbosity flag (default: 0).
set -euo pipefail

INSTALL_DIR="${GPU_TOOLS_DIR:-/opt/gpu-test-tools}"

# A missing or failing nproc must not abort the script under `set -e`,
# so fall back to a single job.
JOBS="${MAKE_JOBS:-$(nproc 2>/dev/null || echo 1)}"
# JOBS is passed straight to `make -j`; reject anything that is not a
# positive integer instead of letting every build fail later.
if [[ ! "$JOBS" =~ ^[1-9][0-9]*$ ]]; then
  printf 'MAKE_JOBS must be a positive integer (got "%s"); using 1\n' \
    "$JOBS" >&2
  JOBS=1
fi

# NOTE(review): no function visible in this script reads VERBOSE yet.
VERBOSE="${VERBOSE:-0}"

# ANSI colour escapes, stored as literal backslash sequences; they are
# rendered by `echo -e` / `printf %b` at print time.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m'
|
|
|
|
# Print an informational message to stdout, prefixed with a cyan [INFO] tag.
# Globals:   CYAN, NC (read)
# Arguments: message words; joined with single spaces.
# The colour codes go through %b so their escape sequences are rendered, while
# the message goes through %s so backslashes in it (e.g. paths) stay literal.
log() { printf '%b[INFO]%b %s\n' "${CYAN}" "${NC}" "$*"; }
|
|
# Print a success message to stdout, prefixed with a green [ OK ] tag.
# Globals:   GREEN, NC (read)
# Arguments: message words; joined with single spaces.
# The colour codes go through %b so their escape sequences are rendered, while
# the message goes through %s so backslashes in it (e.g. paths) stay literal.
ok() { printf '%b[ OK ]%b %s\n' "${GREEN}" "${NC}" "$*"; }
|
|
# Print a warning to stderr, prefixed with a yellow [WARN] tag.
# Globals:   YELLOW, NC (read)
# Arguments: message words; joined with single spaces.
# Written to stderr so warnings stay visible when stdout is redirected.
# The colour codes go through %b so their escape sequences are rendered, while
# the message goes through %s so backslashes in it (e.g. paths) stay literal.
warn() { printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$*" >&2; }
|
|
# Print an error message to stderr, prefixed with a red [FAIL] tag.
# Does not exit; callers decide whether the failure is fatal.
# Globals:   RED, NC (read)
# Arguments: message words; joined with single spaces.
# The colour codes go through %b so their escape sequences are rendered, while
# the message goes through %s so backslashes in it (e.g. paths) stay literal.
fail() { printf '%b[FAIL]%b %s\n' "${RED}" "${NC}" "$*" >&2; }
|
|
|
|
# Warn (without aborting) when the script is not running as root.
# Globals:   EUID (read)
# Outputs:   two warnings via warn() when the effective UID is non-zero.
check_root() {
  # Nothing to report for root.
  (( EUID == 0 )) && return 0

  warn "Not running as root. Some installations may fail."
  warn "Re-run with: sudo $0"
}
|
|
|
|
# Verify that nvidia-smi is usable and log the name of the first GPU.
# Outputs: one log line with the GPU name ("unknown" if the query is empty).
# Exits:   1 if nvidia-smi is missing or the query fails.
detect_gpu() {
  if ! command -v nvidia-smi &>/dev/null; then
    fail "nvidia-smi not found. Install NVIDIA drivers first."
    exit 1
  fi

  local smi_out gpu_name
  # Test the query explicitly: with stderr discarded, a failure here would
  # otherwise end the script under `set -e` with no message at all.
  if ! smi_out=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null); then
    fail "nvidia-smi could not query GPU names. Check that the NVIDIA driver is loaded."
    exit 1
  fi

  # Keep only the first line (first GPU) without spawning `head`.
  gpu_name=${smi_out%%$'\n'*}
  log "Detected GPU: ${gpu_name:-unknown}"
}
|
|
|
|
# Install build tools, MPI, RDMA utilities and Python via the system package
# manager (apt-get, else yum, else dnf).
# Outputs: progress via log/ok/warn. "OK" is printed only when every step
#          succeeded.
# Returns: 0 always; package failures are reported as warnings so the rest
#          of the installer can continue.
install_system_deps() {
  log "Installing system dependencies..."

  local failed=0
  local rpm_mgr=""

  if command -v apt-get &>/dev/null; then
    # Guard the update: a bare failure (e.g. not root) would abort the whole
    # script under `set -e`, although check_root only warns about it.
    if ! apt-get update -qq; then
      warn "apt-get update failed (not root, or no network?)"
      failed=1
    fi
    if ! apt-get install -y -qq build-essential git cmake wget curl \
      openmpi-bin libopenmpi-dev openssh-client \
      infiniband-diags ibverbs-utils perftest \
      python3 python3-pip python3-venv \
      2>/dev/null; then
      warn "Some apt packages failed (may already be installed)"
      failed=1
    fi
  else
    # yum and dnf take the same arguments here; prefer yum when both exist.
    if command -v yum &>/dev/null; then
      rpm_mgr="yum"
    elif command -v dnf &>/dev/null; then
      rpm_mgr="dnf"
    fi

    if [[ -n "$rpm_mgr" ]]; then
      # Best effort: the group may be missing or already installed.
      "$rpm_mgr" groupinstall -y "Development Tools" 2>/dev/null || true
      if ! "$rpm_mgr" install -y git cmake wget curl \
        openmpi openmpi-devel openssh-clients \
        infiniband-diags libibverbs-utils perftest \
        python3 python3-pip \
        2>/dev/null; then
        warn "Some $rpm_mgr packages failed"
        failed=1
      fi
    else
      warn "Unsupported package manager. Install deps manually."
      failed=1
    fi
  fi

  if (( failed == 0 )); then
    ok "System dependencies"
  fi
}
|
|
|
|
# Install the Python packages `rich` and `pyyaml` with pip3, falling back to
# pip.
# Outputs: progress via log/ok/warn.
# Returns: 0 always; a pip failure is reported as a warning instead of
#          aborting the installer under `set -e`.
install_python_deps() {
  log "Installing Python dependencies..."

  # pip3's stderr is discarded because a missing pip3 is expected on some
  # systems; the pip fallback keeps its stderr so the real error is visible.
  if pip3 install --quiet rich pyyaml 2>/dev/null \
    || pip install --quiet rich pyyaml; then
    ok "Python dependencies (rich, pyyaml)"
  else
    warn "Python dependencies failed to install. Try: python3 -m pip install rich pyyaml"
  fi
}
|
|
|
|
# Clone and build NVIDIA nvbandwidth under $INSTALL_DIR/nvbandwidth and copy
# the binary to $INSTALL_DIR/nvbandwidth/nvbandwidth.
# Globals:   INSTALL_DIR, JOBS (read)
# Outputs:   progress via log/ok/warn.
# Returns:   0 always; clone/build failures are reported as warnings so the
#            remaining installers still run.
install_nvbandwidth() {
  log "Installing nvbandwidth..."

  local src="$INSTALL_DIR/nvbandwidth"
  if [[ -x "$src/nvbandwidth" ]]; then
    ok "nvbandwidth already installed at $src/nvbandwidth"
    return 0
  fi

  if ! mkdir -p "$INSTALL_DIR"; then
    warn "Cannot create $INSTALL_DIR (re-run as root or set GPU_TOOLS_DIR)"
    return 0
  fi

  # Reuse a checkout left behind by an earlier failed build; cloning into an
  # existing non-empty directory would fail.
  if [[ ! -d "$src/.git" ]]; then
    if ! git clone --depth 1 https://github.com/NVIDIA/nvbandwidth.git "$src" 2>/dev/null; then
      warn "nvbandwidth clone failed. Check network access and write permission on $INSTALL_DIR"
      return 0
    fi
  fi

  # Build in a subshell so the caller's working directory is untouched, and
  # chain every step so a cmake/make failure reaches the warning below instead
  # of terminating the script under `set -e`.
  if (
    cd "$src" \
      && mkdir -p build \
      && cd build \
      && cmake .. -DCMAKE_BUILD_TYPE=Release 2>/dev/null \
      && make -j"$JOBS" 2>/dev/null
  ) && [[ -x "$src/build/nvbandwidth" ]] \
    && cp "$src/build/nvbandwidth" "$src/nvbandwidth"; then
    ok "nvbandwidth installed at $src/nvbandwidth"
  else
    warn "nvbandwidth build failed. Try building manually in $src"
  fi
}
|
|
|
|
# Clone and build NVIDIA nccl-tests under $INSTALL_DIR/nccl-tests.
# Tries an MPI-enabled build first, then a plain build.
# Globals:   INSTALL_DIR, JOBS (read)
# Environment:
#   CUDA_HOME  CUDA toolkit root (default: /usr/local/cuda).
#   MPI_HOME   MPI install prefix for the MPI build (default: /usr).
# Outputs:   progress via log/ok/warn.
# Returns:   0 always; failures are reported as warnings.
install_nccl_tests() {
  log "Installing nccl-tests..."

  local src="$INSTALL_DIR/nccl-tests"
  if [[ -x "$src/build/all_reduce_perf" ]]; then
    ok "nccl-tests already installed at $src/build/"
    return 0
  fi

  # Check for CUDA before cloning: without it the build cannot succeed.
  local cuda_home="${CUDA_HOME:-/usr/local/cuda}"
  if [[ ! -d "$cuda_home" ]]; then
    warn "CUDA_HOME not found at $cuda_home. Set CUDA_HOME env var."
    return 0
  fi
  local mpi_home="${MPI_HOME:-/usr}"

  if ! mkdir -p "$INSTALL_DIR"; then
    warn "Cannot create $INSTALL_DIR (re-run as root or set GPU_TOOLS_DIR)"
    return 0
  fi

  # Reuse a checkout left behind by an earlier failed build; cloning into an
  # existing non-empty directory would fail.
  if [[ ! -d "$src/.git" ]]; then
    if ! git clone --depth 1 https://github.com/NVIDIA/nccl-tests.git "$src" 2>/dev/null; then
      warn "nccl-tests clone failed. Check network access and write permission on $INSTALL_DIR"
      return 0
    fi
  fi

  # Build in a subshell so the caller's working directory is untouched.
  # Both attempts receive CUDA_HOME so a non-default toolkit location is
  # honoured by the MPI build as well.
  if ! (
    cd "$src" || exit 1
    make MPI=1 MPI_HOME="$mpi_home" CUDA_HOME="$cuda_home" -j"$JOBS" 2>/dev/null \
      || make CUDA_HOME="$cuda_home" -j"$JOBS" 2>/dev/null
  ); then
    warn "nccl-tests build failed. Try: cd $src && make MPI=1"
  fi

  if [[ -x "$src/build/all_reduce_perf" ]]; then
    ok "nccl-tests installed at $src/build/"
  else
    warn "nccl-tests build incomplete"
  fi
}
|
|
|
|
# Clone and build gpu-burn under $INSTALL_DIR/gpu-burn.
# Globals:   INSTALL_DIR, JOBS (read)
# Outputs:   progress via log/ok/warn.
# Returns:   0 always; clone/build failures are reported as warnings so the
#            remaining steps still run.
install_gpu_burn() {
  log "Installing gpu-burn..."

  local src="$INSTALL_DIR/gpu-burn"
  if [[ -x "$src/gpu_burn" ]]; then
    ok "gpu-burn already installed at $src/gpu_burn"
    return 0
  fi

  if ! mkdir -p "$INSTALL_DIR"; then
    warn "Cannot create $INSTALL_DIR (re-run as root or set GPU_TOOLS_DIR)"
    return 0
  fi

  # Reuse a checkout left behind by an earlier failed build; cloning into an
  # existing non-empty directory would fail.
  if [[ ! -d "$src/.git" ]]; then
    if ! git clone --depth 1 https://github.com/wilicc/gpu-burn.git "$src" 2>/dev/null; then
      warn "gpu-burn clone failed. Check network access and write permission on $INSTALL_DIR"
      return 0
    fi
  fi

  # Build in a subshell so the caller's working directory is untouched.
  if ! (cd "$src" && make -j"$JOBS" 2>/dev/null); then
    warn "gpu-burn build failed"
  fi

  if [[ -x "$src/gpu_burn" ]]; then
    ok "gpu-burn installed at $src/gpu_burn"
  else
    warn "gpu-burn build incomplete"
  fi
}
|
|
|
|
# Report whether NVIDIA DCGM is available; print install hints when it is not.
# Detection order: DCGM binaries on PATH, then the dpkg package state, then
# the rpm database.
# Outputs: progress via log/ok/warn.
# Returns: 0 always (this check is informational).
check_dcgm() {
  log "Checking DCGM..."

  if command -v nv-hostengine &>/dev/null || command -v dcgmi &>/dev/null; then
    ok "DCGM already installed"
    return 0
  fi

  # `dpkg -l` also succeeds for removed packages whose config files remain,
  # so inspect the package status field instead of the exit code.
  local dpkg_status=""
  # shellcheck disable=SC2016 — ${Status} is a dpkg-query format field.
  dpkg_status=$(dpkg-query -W -f='${Status}' datacenter-gpu-manager 2>/dev/null) \
    || dpkg_status=""
  if [[ "$dpkg_status" == "install ok installed" ]] \
    || rpm -q datacenter-gpu-manager &>/dev/null; then
    ok "DCGM package installed"
    return 0
  fi

  warn "DCGM not found. Install from: https://docs.nvidia.com/datacenter/dcgm/latest/installation-guide.html"
  warn " Ubuntu: sudo apt install datacenter-gpu-manager"
  warn " Or: curl -fsSL https://deb.nvidia.com/datacenter-gpu-manager/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-dcgm.gpg"
}
|
|
|
|
# Report which RDMA/InfiniBand command-line tools are available on PATH.
# Outputs: one warning per missing tool, and an "N/TOTAL" summary when at
#          least one tool is present.
# Returns: 0 always (this check is informational).
check_rdma_tools() {
  log "Checking RDMA tools..."

  # The total comes from the array itself. The previous summary referenced an
  # undefined RDMA_TOOL_COUNT, which terminates the script under `set -u`.
  local -a tools=(
    ib_write_bw ib_read_bw ib_write_lat ib_read_lat ibstat ibv_devinfo
  )
  local tool
  local found=0

  for tool in "${tools[@]}"; do
    if command -v "$tool" &>/dev/null; then
      found=$((found + 1))
    else
      warn " $tool not found (install: perftest infiniband-diags)"
    fi
  done

  if (( found > 0 )); then
    ok "$found/${#tools[@]} RDMA tools found"
  fi
}
|
|
|
|
# Print the installation summary: install directory, build status of each
# tool under INSTALL_DIR, availability of system commands, and usage hints.
# Globals:   INSTALL_DIR, GREEN, YELLOW, NC (read)
# Outputs:   the summary on stdout.
print_summary() {
  # Loop variables are local so they do not overwrite same-named globals.
  local tool_path path name cmd

  echo ""
  echo "=========================================="
  echo " GPU Test Suite - Installation Summary"
  echo "=========================================="
  echo ""
  echo " Install directory: $INSTALL_DIR"
  echo ""
  echo " Tools status:"

  # Each entry is "<binary path>:<display name>".
  for tool_path in \
    "$INSTALL_DIR/nvbandwidth/nvbandwidth:nvbandwidth" \
    "$INSTALL_DIR/nccl-tests/build/all_reduce_perf:nccl-tests" \
    "$INSTALL_DIR/gpu-burn/gpu_burn:gpu-burn"; do
    path="${tool_path%%:*}"
    name="${tool_path##*:}"
    if [[ -x "$path" ]]; then
      echo -e " [${GREEN}✓${NC}] $name"
    else
      echo -e " [${YELLOW}?${NC}] $name (not built)"
    fi
  done

  echo ""
  echo " System tools:"
  for cmd in nvidia-smi mpirun nvbandwidth ib_write_bw dcgmi; do
    if command -v "$cmd" &>/dev/null; then
      echo -e " [${GREEN}✓${NC}] $cmd"
    else
      echo -e " [${YELLOW}-${NC}] $cmd (not found)"
    fi
  done

  echo ""
  echo " Usage:"
  echo " python3 gpu_tester.py # Interactive menu"
  echo " python3 gpu_tester.py --test all # Full suite"
  echo ""
}
|
|
|
|
# Entry point: print the banner, run the pre-flight checks, make sure the
# install directory exists, run every install/check step in order, and
# finish with the summary.
# Globals:   INSTALL_DIR (read)
# Arguments: accepted but not used.
main() {
  local banner_rule="=========================================="
  local step

  printf '%s\n' \
    "" \
    "$banner_rule" \
    " GPU Test Suite - Dependency Installer" \
    "$banner_rule" \
    ""

  check_root
  detect_gpu

  mkdir -p "$INSTALL_DIR"

  # Steps run strictly in this order.
  for step in \
    install_system_deps \
    install_python_deps \
    install_nvbandwidth \
    install_nccl_tests \
    install_gpu_burn \
    check_dcgm \
    check_rdma_tools; do
    "$step"
  done

  print_summary
}
|
|
|
|
# Script entry point: run main with all command-line arguments forwarded.
main "$@"
|