---
# GPU type: auto-detect or override to a100/a800/h100/h800/h200/h20/b200/b300
gpu_type: auto

# Benchmark settings, split into a `memory` and a `compute` sub-section.
# Both sub-sections carry their own `iterations` key.
benchmark:
  memory:
    size_mb: 4096
    iterations: 10
    nvbandwidth_buffer_mb: 512
    nvbandwidth_samples: 3
  compute:
    dtypes:
      - fp32
      - tf32
      - fp16
      - bf16
      - fp8
    # MAMF-style shape sweep: measure each dtype at every shape below and keep the max
    # TFLOPS (the realistic achievable peak). A single fixed shape under-reports by
    # ~7-12% and can't meet the MAMF-calibrated thresholds in gpu_specs.py.
    # Each entry is either N (square N×N×N) or [M, N, K]. K-heavy non-square shapes
    # (e.g. 2048×2048×13312) hit the true Hopper MAMF — bf16 ~790 vs ~755 square.
    # Empty list => single matrix_size shape (legacy behaviour).
    sweep_sizes:
      - 3584
      - 4608
      - 5376
      - 8192
      - 11520
      - [2048, 2048, 13312]
      - [2048, 2048, 16384]
    matrix_size: 8192  # fallback shape when sweep_sizes is empty
    warmup: 20
    iterations: 80
    # NOTE: torch.compile was dropped — on H100 eager cuBLAS beats Triton for plain
    # GEMM, and compiling would re-autotune per shape and make the sweep very slow.

# Health-check thresholds.
health:
  # NOTE(review): temperature units are presumably °C — confirm with the consumer.
  temp_warning: 75
  temp_critical: 85
  power_limit: null  # null = auto-detect from GPU TDP per gpu_specs.py

# NCCL settings: a bandwidth floor plus one on/off switch per collective test.
nccl:
  min_bandwidth_gbps: null  # null = auto-detect (40% of GPU NVLink BW)
  test_allreduce: true
  test_alltoall: true
  test_broadcast: true
  test_reduce_scatter: false
  test_allgather: false
  test_sendrecv: false

# Multi-node NCCL run: participating hosts, MPI launcher paths, the test
# binaries to run, message-size sweep parameters, network settings and the
# per-test pass thresholds. Disabled unless `enabled` is set to true.
multinode_nccl:
  enabled: false
  mode: sweep
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null  # null = tools.install_dir/nccl-tests/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
  # NOTE(review): the keys below are assumed to be siblings of `topologies`
  # (settings shared by every topology), not fields of the topology entry —
  # confirm against the consumer's schema.
  begin_size: 1k
  end_size: 16g
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  socket_ifname: bond0
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: 4
  min_nchannels: 4
  net_plugin: 'none'  # the literal string "none", not a YAML null
  nvls_enable: 1
  split_data_on_qps: 1
  extra_env: {}
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75

# Stress-test settings.
stress:
  duration_sec: 600  # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup
  use_doubles: false
  use_tensor_cores: true
  memory_pct: 90
  gpus: all

# RDMA test settings. The top-level keys hold the base thresholds and test
# parameters; `cross_node` holds its own, separate set for the two-host test
# (it repeats ib_port, msg_size, min_bandwidth_gbps and max_latency_us with
# different values).
rdma:
  min_bandwidth_gbps: 50
  max_latency_us: 10
  ib_iterations: 1000
  msg_size: 65536
  ib_device: null
  ib_port: 1
  # Cross-node (two-host) RDMA via perftest, orchestrated over SSH from the CLIENT
  # node. Replaces the old scripts/rdma_cross_node.sh. Run on the client; it starts
  # ib_write_bw/ib_write_lat servers on `server` over SSH (passwordless required),
  # then drives the local client per device.
  cross_node:
    enabled: false  # set true on the client node to run cross-node RDMA
    server: null  # peer ssh address, e.g. 172.72.8.12 (server node)
    server_addr: null  # OOB addr the client connects to (default: same as `server`)
    ssh_user: root
    devices: []  # e.g. [mlx5_0, mlx5_1, mlx5_6, mlx5_7]; [] = auto-detect active IB
    ib_port: 1
    gid_index: null  # -x <n> for RoCE; null for pure InfiniBand
    msg_size: 1048576  # 1 MiB — large enough to reach NDR400 peak
    iters: 5000
    base_oob_port: 18515  # per-device OOB port = base + device index
    server_warmup_sec: 2.0
    min_bandwidth_gbps: 350  # per-port PASS floor (NDR400 ≈ 0.9 × 400)
    max_latency_us: 5

# Training-test settings: model, batch shape, step count and precision.
training:
  model: gpt2
  batch_size: 8
  seq_length: 2048
  num_steps: 50
  dtype: bf16

# Report output location and format.
report:
  output_dir: ./reports
  format: json

# Install location for external tools; multinode_nccl.nccl_tests_dir falls
# back to a path under this directory when it is left null.
tools:
  install_dir: /opt/gpu-test-tools