#!/usr/bin/env bash
#
# Runs two FP8 benchmarks on one GPU and merges their JSON output:
#   1. scripts/pytorch_fp8_path_bench.py          (run with $PYTHON)
#   2. scripts/cublaslt_fp8_gemm_bench.cu         (compiled with $NVCC, then run)
# Each benchmark's stdout is tee'd into its own JSON report, and a third
# "combined" report is assembled from the two. The path of the combined
# report is printed as the last line of stdout.
#
# Environment overrides (defaults in parentheses):
#   PYTHON        Python interpreter            (/root/gpu-test-venv/bin/python)
#   CUDA_HOME     CUDA toolkit root             (/usr/local/cuda-12.4)
#   NVCC          nvcc binary                   ($CUDA_HOME/bin/nvcc)
#   CUDA_ARCH     value passed to nvcc -arch    (sm_90)
#   OUT_DIR       report output directory       (<project>/reports)
#   MATRIX_SIZE   --matrix-size for both runs   (8192)
#   WARMUP        --warmup for both runs        (20)
#   ITERATIONS    --iterations for both runs    (100)
#   GPU_INDEX     GPU to benchmark              (0)
#   WORKSPACE_MB  cuBLASLt --workspace-mb       (256)

set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON="${PYTHON:-/root/gpu-test-venv/bin/python}"
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}"
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
CUDA_ARCH="${CUDA_ARCH:-sm_90}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
WARMUP="${WARMUP:-20}"
ITERATIONS="${ITERATIONS:-100}"
GPU_INDEX="${GPU_INDEX:-0}"
WORKSPACE_MB="${WORKSPACE_MB:-256}"

# Fail fast: without these checks a missing nvcc is only discovered after the
# whole PyTorch benchmark has already run.
command -v "$PYTHON" >/dev/null 2>&1 \
  || die "Python interpreter not found or not executable: $PYTHON"
command -v "$NVCC" >/dev/null 2>&1 \
  || die "nvcc not found or not executable: $NVCC"

# First site-packages directory of the selected interpreter; the cuDNN and
# NCCL library directories below are resolved relative to it.
VENV_SITE_PACKAGES="$("$PYTHON" -c 'import site; print(site.getsitepackages()[0])')"

# Append the caller's LD_LIBRARY_PATH only when it is non-empty. A trailing
# ':' would create an empty entry, which the dynamic loader interprets as the
# current working directory.
export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json"
CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json"
COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json"

# --- 1. PyTorch benchmark: stdout is shown and saved as the PyTorch report.
# pipefail (set above) makes a benchmark failure abort the script even though
# tee is the last pipeline stage.
"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \
  --matrix-size "$MATRIX_SIZE" \
  --warmup "$WARMUP" \
  --iterations "$ITERATIONS" \
  --gpu-index "$GPU_INDEX" \
  | tee "$PY_REPORT"

# --- 2. Build and run the cuBLASLt benchmark on the same single GPU.
"$NVCC" -O3 -std=c++17 -arch="$CUDA_ARCH" \
  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
  -lcublasLt -lcublas \
  -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"

"$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" \
  --matrix-size "$MATRIX_SIZE" \
  --warmup "$WARMUP" \
  --iterations "$ITERATIONS" \
  --first-gpu "$GPU_INDEX" \
  --gpu-count 1 \
  --workspace-mb "$WORKSPACE_MB" \
  --fast-accum 1 \
  | tee "$CUBLAS_REPORT"

# --- 3. Merge both reports into one JSON document.
# The quoted heredoc delimiter keeps the shell from expanding anything inside
# the Python source; the three paths are passed as argv instead.
"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY'
import json
import pathlib
import sys

py_report = pathlib.Path(sys.argv[1])
cublas_report = pathlib.Path(sys.argv[2])
combined_report = pathlib.Path(sys.argv[3])

# Read with the same encoding the combined report is written with.
with py_report.open(encoding="utf-8") as f:
    py_payload = json.load(f)
with cublas_report.open(encoding="utf-8") as f:
    cublas_payload = json.load(f)

combined = {
    "source": "fp8_path_comparison",
    "host": cublas_payload.get("host"),
    "matrix_size": py_payload.get("matrix_size"),
    "gpu_index": py_payload.get("gpu_index"),
    "pytorch": py_payload,
    "cublaslt": cublas_payload,
    "results": [],
}

# PyTorch results are copied through unchanged.
combined["results"].extend(py_payload.get("results", []))

# Only the first per-GPU entry is used (the benchmark is run with
# --gpu-count 1). It is reshaped into a result row: "fp8_tflops" is renamed
# to "tflops" and run parameters are copied from the top-level payload.
# A per-GPU entry without "fp8_tflops" raises KeyError.
per_gpu = cublas_payload.get("per_gpu", [])
if per_gpu:
    row = dict(per_gpu[0])
    row.update({
        "name": "E_direct_cublaslt_fast_accum",
        "status": "ok",
        "tflops": row.pop("fp8_tflops"),
        "matrix_size": cublas_payload.get("matrix_size"),
        "iterations": cublas_payload.get("iterations"),
        "warmup": cublas_payload.get("warmup"),
        "fast_accum": cublas_payload.get("fast_accum"),
        "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
    })
    combined["results"].append(row)

combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
print(f"Combined report written to: {combined_report}")
PY

# Last stdout line is the combined report path, for callers that capture it.
printf '%s\n' "$COMBINED_REPORT"