test_gpu_scripts/scripts/run_fp8_path_comparison.sh

94 lines
3.0 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Runs the PyTorch FP8 path benchmark and the direct cuBLASLt FP8 GEMM
# benchmark with the same problem settings, then merges both JSON reports
# into a single combined report.
#
# Every setting below may be overridden from the environment; the built-in
# value is used only when the variable is unset or empty.
#   PYTHON        interpreter used for the benchmark and the helper snippets
#   CUDA_HOME     CUDA toolkit root
#   NVCC          CUDA compiler (defaults to $CUDA_HOME/bin/nvcc)
#   OUT_DIR       directory that receives the JSON reports
#   MATRIX_SIZE   passed as --matrix-size to both benchmarks
#   WARMUP        passed as --warmup to both benchmarks
#   ITERATIONS    passed as --iterations to both benchmarks
#   GPU_INDEX     passed as --gpu-index (PyTorch) and --first-gpu (cuBLASLt)
#   WORKSPACE_MB  passed as --workspace-mb to the cuBLASLt benchmark
set -euo pipefail

# Absolute directory holding this script, and its parent (the project root).
SCRIPT_DIR="$(
  cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd
)"
PROJECT_DIR="$(
  cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd
)"

# Toolchain locations.
: "${PYTHON:=/root/gpu-test-venv/bin/python}"
: "${CUDA_HOME:=/usr/local/cuda-12.4}"
: "${NVCC:=$CUDA_HOME/bin/nvcc}"

# Where reports are written.
: "${OUT_DIR:=$PROJECT_DIR/reports}"

# Benchmark parameters.
: "${MATRIX_SIZE:=8192}"
: "${WARMUP:=20}"
: "${ITERATIONS:=100}"
: "${GPU_INDEX:=0}"
: "${WORKSPACE_MB:=256}"
# Ask the selected interpreter for the first entry of site.getsitepackages().
VENV_SITE_PACKAGES="$("$PYTHON" - <<'PY'
import site

print(site.getsitepackages()[0])
PY
)"

# Put the nvidia/cudnn/lib and nvidia/nccl/lib directories found under
# site-packages at the front of the loader search path (presumably the
# pip-installed NVIDIA wheels -- confirm against the venv layout).
# The previous value is appended only when it is non-empty: an unconditional
# ":$LD_LIBRARY_PATH" would leave a trailing ":" when the variable is unset,
# and the dynamic loader treats that empty entry as the current directory.
export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# Create the build and report directories if they do not exist yet.
mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"

# Report file names embed the host name ("unknown" when hostname fails) and
# a second-resolution timestamp.
if ! HOST="$(hostname 2>/dev/null)"; then
  HOST="unknown"
fi
TS="$(date '+%Y%m%d_%H%M%S')"

PY_REPORT="${OUT_DIR}/fp8_paths_pytorch_${HOST}_${TS}.json"
CUBLAS_REPORT="${OUT_DIR}/fp8_paths_cublaslt_${HOST}_${TS}.json"
COMBINED_REPORT="${OUT_DIR}/fp8_paths_combined_${HOST}_${TS}.json"
# Run the PyTorch FP8 path benchmark. Its stdout is shown on the terminal and
# simultaneously saved to $PY_REPORT by tee; with pipefail a benchmark failure
# still fails the pipeline.
pytorch_bench_cmd=(
  "$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py"
  --matrix-size "$MATRIX_SIZE"
  --warmup "$WARMUP"
  --iterations "$ITERATIONS"
  --gpu-index "$GPU_INDEX"
)
"${pytorch_bench_cmd[@]}" | tee "$PY_REPORT"
# Compile the stand-alone cuBLASLt FP8 GEMM benchmark into the build directory.
# CUDA_ARCH selects the nvcc -arch target and defaults to sm_90; set it in the
# environment (e.g. CUDA_ARCH=sm_89) to build for a different GPU generation.
"$NVCC" -O3 -std=c++17 "-arch=${CUDA_ARCH:-sm_90}" \
  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
  -lcublasLt -lcublas \
  -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
# Run the compiled cuBLASLt benchmark on a single GPU starting at $GPU_INDEX,
# passing --fast-accum 1. Its stdout is shown on the terminal and
# simultaneously saved to $CUBLAS_REPORT by tee.
cublaslt_bench_cmd=(
  "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
  --matrix-size "$MATRIX_SIZE"
  --warmup "$WARMUP"
  --iterations "$ITERATIONS"
  --first-gpu "$GPU_INDEX"
  --gpu-count 1
  --workspace-mb "$WORKSPACE_MB"
  --fast-accum 1
)
"${cublaslt_bench_cmd[@]}" | tee "$CUBLAS_REPORT"
# Merge the two benchmark reports into $COMBINED_REPORT.
# The Python snippet receives the three report paths as argv[1..3]; the heredoc
# delimiter is quoted so the shell performs no expansion inside it.
"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY'
"""Merge the PyTorch and cuBLASLt FP8 benchmark reports into one JSON file.

argv[1]: PyTorch report (input), argv[2]: cuBLASLt report (input),
argv[3]: combined report (output).
"""
import json
import pathlib
import sys


def load_report(path):
    """Parse one JSON report, reading it as UTF-8 to match how the combined
    report is written instead of relying on the locale's default encoding."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


py_report = pathlib.Path(sys.argv[1])
cublas_report = pathlib.Path(sys.argv[2])
combined_report = pathlib.Path(sys.argv[3])

py_payload = load_report(py_report)
cublas_payload = load_report(cublas_report)

combined = {
    "source": "fp8_path_comparison",
    "host": cublas_payload.get("host"),
    "matrix_size": py_payload.get("matrix_size"),
    "gpu_index": py_payload.get("gpu_index"),
    # Both raw payloads are kept in full next to the flattened result rows.
    "pytorch": py_payload,
    "cublaslt": cublas_payload,
    "results": [],
}

# "or []" also tolerates an explicit null in the report.
combined["results"].extend(py_payload.get("results") or [])

per_gpu = cublas_payload.get("per_gpu") or []
if per_gpu:
    # Only the first per-GPU entry is used (the benchmark is run with
    # --gpu-count 1). Copy it so the raw payload stored above is not mutated.
    row = dict(per_gpu[0])
    if "fp8_tflops" not in row:
        # Still a non-zero exit, but with an actionable message instead of a
        # bare KeyError traceback.
        sys.exit(f"error: {cublas_report}: per_gpu[0] has no 'fp8_tflops' field")
    # Rename fp8_tflops -> tflops and attach the run parameters to the row.
    row.update({
        "name": "E_direct_cublaslt_fast_accum",
        "status": "ok",
        "tflops": row.pop("fp8_tflops"),
        "matrix_size": cublas_payload.get("matrix_size"),
        "iterations": cublas_payload.get("iterations"),
        "warmup": cublas_payload.get("warmup"),
        "fast_accum": cublas_payload.get("fast_accum"),
        "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
    })
    combined["results"].append(row)

combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
print(f"Combined report written to: {combined_report}")
PY

# Emit the combined report path as the final stdout line.
printf '%s\n' "$COMBINED_REPORT"