292 lines
12 KiB
Plaintext
292 lines
12 KiB
Plaintext
#include <cublasLt.h>
|
|
#include <cuda_bf16.h>
|
|
#include <cuda_fp8.h>
|
|
#include <cuda_runtime.h>
|
|
|
|
#include <algorithm>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <numeric>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#define CHECK_CUDA(call) \
|
|
do { \
|
|
cudaError_t status = (call); \
|
|
if (status != cudaSuccess) { \
|
|
std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
|
|
cudaGetErrorString(status)); \
|
|
std::exit(1); \
|
|
} \
|
|
} while (0)
|
|
|
|
#define CHECK_CUBLAS(call) \
|
|
do { \
|
|
cublasStatus_t status = (call); \
|
|
if (status != CUBLAS_STATUS_SUCCESS) { \
|
|
std::fprintf(stderr, "cuBLASLt error %s:%d: status=%d\n", __FILE__, \
|
|
__LINE__, static_cast<int>(status)); \
|
|
std::exit(1); \
|
|
} \
|
|
} while (0)
|
|
|
|
__global__ void fill_fp8(__nv_fp8_e4m3 *ptr, size_t count, float value) {
|
|
size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
size_t stride = blockDim.x * gridDim.x;
|
|
for (size_t i = tid; i < count; i += stride) {
|
|
ptr[i] = __nv_fp8_e4m3(value);
|
|
}
|
|
}
|
|
|
|
struct Args {
|
|
int matrix_size = 8192;
|
|
int warmup = 20;
|
|
int iterations = 200;
|
|
int first_gpu = 0;
|
|
int gpu_count = -1;
|
|
size_t workspace_mb = 256;
|
|
int fast_accum = 1;
|
|
};
|
|
|
|
static Args parse_args(int argc, char **argv) {
|
|
Args args;
|
|
for (int i = 1; i < argc; ++i) {
|
|
auto need = [&](const char *name) {
|
|
if (i + 1 >= argc) {
|
|
std::fprintf(stderr, "Missing value for %s\n", name);
|
|
std::exit(2);
|
|
}
|
|
return argv[++i];
|
|
};
|
|
if (!std::strcmp(argv[i], "--matrix-size")) {
|
|
args.matrix_size = std::atoi(need(argv[i]));
|
|
} else if (!std::strcmp(argv[i], "--warmup")) {
|
|
args.warmup = std::atoi(need(argv[i]));
|
|
} else if (!std::strcmp(argv[i], "--iterations")) {
|
|
args.iterations = std::atoi(need(argv[i]));
|
|
} else if (!std::strcmp(argv[i], "--first-gpu")) {
|
|
args.first_gpu = std::atoi(need(argv[i]));
|
|
} else if (!std::strcmp(argv[i], "--gpu-count")) {
|
|
args.gpu_count = std::atoi(need(argv[i]));
|
|
} else if (!std::strcmp(argv[i], "--workspace-mb")) {
|
|
args.workspace_mb = static_cast<size_t>(std::atoll(need(argv[i])));
|
|
} else if (!std::strcmp(argv[i], "--fast-accum")) {
|
|
args.fast_accum = std::atoi(need(argv[i]));
|
|
} else if (!std::strcmp(argv[i], "--help") || !std::strcmp(argv[i], "-h")) {
|
|
std::puts("Usage: cublaslt_fp8_gemm_bench [--matrix-size N] [--warmup N] "
|
|
"[--iterations N] [--first-gpu N] [--gpu-count N] "
|
|
"[--workspace-mb N] [--fast-accum 0|1]");
|
|
std::exit(0);
|
|
} else {
|
|
std::fprintf(stderr, "Unknown argument: %s\n", argv[i]);
|
|
std::exit(2);
|
|
}
|
|
}
|
|
return args;
|
|
}
|
|
|
|
static double run_one_gpu(int gpu, const Args &args) {
|
|
CHECK_CUDA(cudaSetDevice(gpu));
|
|
|
|
const int64_t m = args.matrix_size;
|
|
const int64_t n = args.matrix_size;
|
|
const int64_t k = args.matrix_size;
|
|
const size_t a_elems = static_cast<size_t>(m) * k;
|
|
const size_t b_elems = static_cast<size_t>(k) * n;
|
|
const size_t d_elems = static_cast<size_t>(m) * n;
|
|
|
|
__nv_fp8_e4m3 *d_a = nullptr;
|
|
__nv_fp8_e4m3 *d_b = nullptr;
|
|
__nv_bfloat16 *d_d = nullptr;
|
|
void *workspace = nullptr;
|
|
float *d_scale_a = nullptr;
|
|
float *d_scale_b = nullptr;
|
|
const float scale = 1.0f;
|
|
const size_t workspace_bytes = args.workspace_mb * 1024ULL * 1024ULL;
|
|
|
|
CHECK_CUDA(cudaMalloc(&d_a, a_elems * sizeof(__nv_fp8_e4m3)));
|
|
CHECK_CUDA(cudaMalloc(&d_b, b_elems * sizeof(__nv_fp8_e4m3)));
|
|
CHECK_CUDA(cudaMalloc(&d_d, d_elems * sizeof(__nv_bfloat16)));
|
|
CHECK_CUDA(cudaMalloc(&workspace, workspace_bytes));
|
|
CHECK_CUDA(cudaMalloc(&d_scale_a, sizeof(float)));
|
|
CHECK_CUDA(cudaMalloc(&d_scale_b, sizeof(float)));
|
|
CHECK_CUDA(cudaMemcpy(d_scale_a, &scale, sizeof(scale), cudaMemcpyHostToDevice));
|
|
CHECK_CUDA(cudaMemcpy(d_scale_b, &scale, sizeof(scale), cudaMemcpyHostToDevice));
|
|
|
|
const int threads = 256;
|
|
const int blocks = 4096;
|
|
fill_fp8<<<blocks, threads>>>(d_a, a_elems, 0.01f);
|
|
fill_fp8<<<blocks, threads>>>(d_b, b_elems, 0.01f);
|
|
CHECK_CUDA(cudaMemset(d_d, 0, d_elems * sizeof(__nv_bfloat16)));
|
|
CHECK_CUDA(cudaGetLastError());
|
|
CHECK_CUDA(cudaDeviceSynchronize());
|
|
|
|
cublasLtHandle_t lt;
|
|
cublasLtMatmulDesc_t op_desc;
|
|
cublasLtMatrixLayout_t a_desc, b_desc, d_desc;
|
|
cublasLtMatmulPreference_t preference;
|
|
CHECK_CUBLAS(cublasLtCreate(<));
|
|
CHECK_CUBLAS(cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F));
|
|
|
|
// cuBLASLt FP8 kernels require TN format: A is transposed, B is non-transposed.
|
|
// With square GEMMs this keeps the benchmark FLOP count identical to the PDF
|
|
// acceptance shape while satisfying the library's FP8 kernel constraints.
|
|
cublasOperation_t transa = CUBLAS_OP_T;
|
|
cublasOperation_t transb = CUBLAS_OP_N;
|
|
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
|
op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
|
|
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
|
op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
|
|
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
|
op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a,
|
|
sizeof(d_scale_a)));
|
|
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
|
op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b,
|
|
sizeof(d_scale_b)));
|
|
int8_t fast_accum = args.fast_accum ? 1 : 0;
|
|
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
|
op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fast_accum,
|
|
sizeof(fast_accum)));
|
|
|
|
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8F_E4M3, k, m, k));
|
|
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8F_E4M3, k, n, k));
|
|
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16BF, m, n, m));
|
|
|
|
CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference));
|
|
CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute(
|
|
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_bytes,
|
|
sizeof(workspace_bytes)));
|
|
|
|
cublasLtMatmulHeuristicResult_t heuristic;
|
|
int returned = 0;
|
|
CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic(
|
|
lt, op_desc, a_desc, b_desc, d_desc, d_desc, preference, 1, &heuristic,
|
|
&returned));
|
|
if (returned == 0) {
|
|
std::fprintf(stderr, "No cuBLASLt heuristic returned for GPU %d\n", gpu);
|
|
std::exit(1);
|
|
}
|
|
|
|
auto get_algo_attr_i32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
|
int32_t value = -1;
|
|
size_t written = 0;
|
|
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
|
&heuristic.algo, attr, &value, sizeof(value), &written));
|
|
return static_cast<int>(value);
|
|
};
|
|
auto get_algo_attr_u32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
|
uint32_t value = 0;
|
|
size_t written = 0;
|
|
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
|
&heuristic.algo, attr, &value, sizeof(value), &written));
|
|
return static_cast<int>(value);
|
|
};
|
|
auto get_algo_attr_u16 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
|
uint16_t value = 0;
|
|
size_t written = 0;
|
|
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
|
&heuristic.algo, attr, &value, sizeof(value), &written));
|
|
return static_cast<int>(value);
|
|
};
|
|
const int algo_id = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_ID);
|
|
const int tile_id = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_TILE_ID);
|
|
const int splitk = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_SPLITK_NUM);
|
|
const int stages = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_STAGES_ID);
|
|
const int inner_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID);
|
|
const int cluster_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID);
|
|
|
|
const float alpha = 1.0f;
|
|
const float beta = 0.0f;
|
|
auto matmul = [&]() {
|
|
CHECK_CUBLAS(cublasLtMatmul(lt, op_desc, &alpha, d_a, a_desc, d_b, b_desc,
|
|
&beta, d_d, d_desc, d_d, d_desc,
|
|
&heuristic.algo, workspace, workspace_bytes, 0));
|
|
};
|
|
|
|
for (int i = 0; i < args.warmup; ++i) {
|
|
matmul();
|
|
}
|
|
CHECK_CUDA(cudaDeviceSynchronize());
|
|
|
|
cudaEvent_t start, stop;
|
|
CHECK_CUDA(cudaEventCreate(&start));
|
|
CHECK_CUDA(cudaEventCreate(&stop));
|
|
CHECK_CUDA(cudaEventRecord(start));
|
|
for (int i = 0; i < args.iterations; ++i) {
|
|
matmul();
|
|
}
|
|
CHECK_CUDA(cudaEventRecord(stop));
|
|
CHECK_CUDA(cudaEventSynchronize(stop));
|
|
float elapsed_ms = 0.0f;
|
|
CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop));
|
|
const double flops =
|
|
2.0 * static_cast<double>(m) * static_cast<double>(n) *
|
|
static_cast<double>(k) * static_cast<double>(args.iterations);
|
|
const double tflops = flops / (static_cast<double>(elapsed_ms) / 1000.0) / 1e12;
|
|
std::printf(
|
|
" {\"index\": %d, \"fp8_tflops\": %.1f, \"algo_id\": %d, "
|
|
"\"tile_id\": %d, \"splitk\": %d, \"stages_id\": %d, "
|
|
"\"inner_shape_id\": %d, \"cluster_shape_id\": %d}%s\n",
|
|
gpu, tflops, algo_id, tile_id, splitk, stages, inner_shape, cluster_shape,
|
|
(gpu + 1 == args.first_gpu + args.gpu_count) ? "" : ",");
|
|
std::fflush(stdout);
|
|
|
|
CHECK_CUDA(cudaEventDestroy(start));
|
|
CHECK_CUDA(cudaEventDestroy(stop));
|
|
CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference));
|
|
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(a_desc));
|
|
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(b_desc));
|
|
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(d_desc));
|
|
CHECK_CUBLAS(cublasLtMatmulDescDestroy(op_desc));
|
|
CHECK_CUBLAS(cublasLtDestroy(lt));
|
|
CHECK_CUDA(cudaFree(d_a));
|
|
CHECK_CUDA(cudaFree(d_b));
|
|
CHECK_CUDA(cudaFree(d_d));
|
|
CHECK_CUDA(cudaFree(workspace));
|
|
CHECK_CUDA(cudaFree(d_scale_a));
|
|
CHECK_CUDA(cudaFree(d_scale_b));
|
|
CHECK_CUDA(cudaDeviceSynchronize());
|
|
|
|
return tflops;
|
|
}
|
|
|
|
int main(int argc, char **argv) {
|
|
Args args = parse_args(argc, argv);
|
|
int device_count = 0;
|
|
CHECK_CUDA(cudaGetDeviceCount(&device_count));
|
|
if (args.gpu_count < 0) {
|
|
args.gpu_count = device_count - args.first_gpu;
|
|
}
|
|
if (args.first_gpu < 0 || args.first_gpu + args.gpu_count > device_count) {
|
|
std::fprintf(stderr, "Invalid GPU range first=%d count=%d device_count=%d\n",
|
|
args.first_gpu, args.gpu_count, device_count);
|
|
return 2;
|
|
}
|
|
|
|
std::vector<double> values;
|
|
std::printf("{\n");
|
|
std::printf(" \"source\": \"cuBLASLt\",\n");
|
|
std::printf(" \"dtype\": \"fp8_e4m3_inputs_bf16_output_fp32_accum\",\n");
|
|
std::printf(" \"matrix_size\": %d,\n", args.matrix_size);
|
|
std::printf(" \"warmup\": %d,\n", args.warmup);
|
|
std::printf(" \"iterations\": %d,\n", args.iterations);
|
|
std::printf(" \"fast_accum\": %d,\n", args.fast_accum ? 1 : 0);
|
|
std::printf(" \"per_gpu\": [\n");
|
|
for (int i = 0; i < args.gpu_count; ++i) {
|
|
int gpu = args.first_gpu + i;
|
|
double tflops = run_one_gpu(gpu, args);
|
|
values.push_back(tflops);
|
|
}
|
|
double mean = std::accumulate(values.begin(), values.end(), 0.0) / values.size();
|
|
auto minmax = std::minmax_element(values.begin(), values.end());
|
|
double spread = ((*minmax.second - *minmax.first) / mean) * 100.0;
|
|
std::printf(" ],\n");
|
|
std::printf(" \"mean_tflops\": %.1f,\n", mean);
|
|
std::printf(" \"min_tflops\": %.1f,\n", *minmax.first);
|
|
std::printf(" \"max_tflops\": %.1f,\n", *minmax.second);
|
|
std::printf(" \"spread_pct\": %.2f\n", spread);
|
|
std::printf("}\n");
|
|
return mean >= 1400.0 ? 0 : 1;
|
|
}
|