#!/usr/bin/env bash
#
# Runs the PyTorch FP8 path benchmark and the direct cuBLASLt FP8 GEMM
# benchmark, then merges both JSON reports into one combined report.
#
# Every setting below can be overridden from the environment:
#   PYTHON        Python interpreter   (default: /root/gpu-test-venv/bin/python)
#   CUDA_HOME     CUDA toolkit root    (default: /usr/local/cuda-12.4)
#   NVCC          nvcc binary          (default: $CUDA_HOME/bin/nvcc)
#   OUT_DIR       report directory     (default: <project>/reports)
#   MATRIX_SIZE   passed as --matrix-size             (default: 8192)
#   WARMUP        passed as --warmup                  (default: 20)
#   ITERATIONS    passed as --iterations              (default: 100)
#   GPU_INDEX     passed as --gpu-index / --first-gpu (default: 0)
#   WORKSPACE_MB  passed as --workspace-mb            (default: 256)

set -euo pipefail

# Resolve paths relative to this script so it can be launched from any
# working directory; the project root is the parent of the script directory.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "${SCRIPT_DIR}/.." >/dev/null 2>&1 && pwd)"

# Toolchain locations.
PYTHON="${PYTHON:-/root/gpu-test-venv/bin/python}"
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}"
NVCC="${NVCC:-${CUDA_HOME}/bin/nvcc}"

# Where the JSON reports are written.
OUT_DIR="${OUT_DIR:-${PROJECT_DIR}/reports}"

# Benchmark parameters forwarded to both benchmark programs.
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
WARMUP="${WARMUP:-20}"
ITERATIONS="${ITERATIONS:-100}"
GPU_INDEX="${GPU_INDEX:-0}"

# Forwarded only to the cuBLASLt benchmark (--workspace-mb).
WORKSPACE_MB="${WORKSPACE_MB:-256}"
# Ask the benchmark interpreter for its first site-packages directory.
VENV_SITE_PACKAGES="$("$PYTHON" - <<'PY'
import site

print(site.getsitepackages()[0])
PY
)"

# Put the cuDNN and NCCL library directories found under site-packages at the
# front of the loader search path. The previous value is appended only when it
# is non-empty: an unconditional trailing ':' would create an empty entry,
# which the dynamic loader interprets as the current working directory.
export LD_LIBRARY_PATH="${VENV_SITE_PACKAGES}/nvidia/cudnn/lib:${VENV_SITE_PACKAGES}/nvidia/nccl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
# Make sure the build directory (for the compiled benchmark binary) and the
# report directory exist before anything is written to them.
mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"

# Report file names carry the host name and a second-resolution timestamp.
# If `hostname` fails, the literal "unknown" is used instead.
HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"

PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json"
CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json"
COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json"
# Run the PyTorch FP8 path benchmark. Its stdout is echoed to the terminal and
# saved to $PY_REPORT, which is parsed as JSON later in this script. With
# `pipefail` set, a benchmark failure aborts the script even though `tee`
# itself succeeds.
"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \
  --matrix-size "$MATRIX_SIZE" \
  --warmup "$WARMUP" \
  --iterations "$ITERATIONS" \
  --gpu-index "$GPU_INDEX" \
  | tee "$PY_REPORT"
# Compile the standalone cuBLASLt FP8 GEMM benchmark into the build directory.
# The target GPU architecture defaults to sm_90 and can be overridden by
# setting CUDA_ARCH in the environment.
"$NVCC" -O3 -std=c++17 -arch="${CUDA_ARCH:-sm_90}" \
  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
  -lcublasLt -lcublas \
  -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
# Run the compiled cuBLASLt benchmark on a single GPU (starting at
# $GPU_INDEX) with fast accumulation enabled. Its stdout is echoed to the
# terminal and saved to $CUBLAS_REPORT, which is parsed as JSON later in this
# script.
"$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" \
  --matrix-size "$MATRIX_SIZE" \
  --warmup "$WARMUP" \
  --iterations "$ITERATIONS" \
  --first-gpu "$GPU_INDEX" \
  --gpu-count 1 \
  --workspace-mb "$WORKSPACE_MB" \
  --fast-accum 1 \
  | tee "$CUBLAS_REPORT"
# Merge the two benchmark reports into one JSON document.
# Arguments to the inline Python program:
#   1: PyTorch report (input), 2: cuBLASLt report (input), 3: combined (output)
"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY'
import json
import pathlib
import sys

py_report = pathlib.Path(sys.argv[1])
cublas_report = pathlib.Path(sys.argv[2])
combined_report = pathlib.Path(sys.argv[3])

# Read with an explicit encoding so it matches the UTF-8 write below.
with py_report.open(encoding="utf-8") as f:
    py_payload = json.load(f)
with cublas_report.open(encoding="utf-8") as f:
    cublas_payload = json.load(f)

# Keep both raw payloads and build one flat "results" list alongside them.
combined = {
    "source": "fp8_path_comparison",
    "host": cublas_payload.get("host"),
    "matrix_size": py_payload.get("matrix_size"),
    "gpu_index": py_payload.get("gpu_index"),
    "pytorch": py_payload,
    "cublaslt": cublas_payload,
    "results": [],
}
combined["results"].extend(py_payload.get("results", []))

# Only the first per-GPU entry of the cuBLASLt report is turned into a result
# row; its "fp8_tflops" field is renamed to "tflops" and run-level settings
# are copied in from the top level of the cuBLASLt payload.
per_gpu = cublas_payload.get("per_gpu", [])
if per_gpu:
    row = dict(per_gpu[0])
    row.update({
        "name": "E_direct_cublaslt_fast_accum",
        "status": "ok",
        "tflops": row.pop("fp8_tflops"),
        "matrix_size": cublas_payload.get("matrix_size"),
        "iterations": cublas_payload.get("iterations"),
        "warmup": cublas_payload.get("warmup"),
        "fast_accum": cublas_payload.get("fast_accum"),
        "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
    })
    combined["results"].append(row)

combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
print(f"Combined report written to: {combined_report}")
PY

# Final line of output: the bare path of the combined report.
printf '%s\n' "$COMBINED_REPORT"