# Configuration for the GPU test suite: per-GPU benchmarks, health limits,
# single-node and multi-node NCCL tests, stress test, RDMA checks, and a short
# training smoke test.
#
# NOTE(review): the block structure below was restored from a copy whose line
# breaks and indentation had been lost. Nesting was rebuilt from key names and
# the original comments; confirm against the config loader's schema.

# GPU type: auto-detect or override to a100/a800/h100/h800/h200/h20/b200/b300
gpu_type: auto

benchmark:
  memory:
    size_mb: 4096
    iterations: 10
    nvbandwidth_buffer_mb: 512
    nvbandwidth_samples: 3
  compute:
    dtypes:
      - fp32
      - tf32
      - fp16
      - bf16
      - fp8
    # MAMF-style shape sweep: measure each dtype at every shape below and keep
    # the max TFLOPS (the realistic achievable peak). A single fixed shape
    # under-reports by ~7-12% and can't meet the MAMF-calibrated thresholds in
    # gpu_specs.py.
    # Each entry is either N (square N×N×N) or [M, N, K]. K-heavy non-square
    # shapes (e.g. 2048×2048×13312) hit the true Hopper MAMF — bf16 ~790 vs
    # ~755 square.
    # Empty list => single matrix_size shape (legacy behaviour).
    sweep_sizes:
      - 3584
      - 4608
      - 5376
      - 8192
      - 11520
      - [2048, 2048, 13312]
      - [2048, 2048, 16384]
    matrix_size: 8192  # fallback shape when sweep_sizes is empty
    warmup: 20
    iterations: 80
    # NOTE: torch.compile was dropped — on H100 eager cuBLAS beats Triton for
    # plain GEMM, and compiling would re-autotune per shape and make the sweep
    # very slow.

health:
  temp_warning: 75
  temp_critical: 85
  power_limit: null  # null = auto-detect from GPU TDP per gpu_specs.py

nccl:
  min_bandwidth_gbps: null  # null = auto-detect (40% of GPU NVLink BW)
  test_allreduce: true
  test_alltoall: true
  test_broadcast: true
  test_reduce_scatter: false
  test_allgather: false
  test_sendrecv: false

multinode_nccl:
  enabled: false
  mode: sweep
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null  # null = tools.install_dir/nccl-tests/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  # NOTE(review): each topology entry is assumed to hold only nodes and
  # gpus_per_node; the sweep parameters that follow are assumed to apply to
  # the whole multinode_nccl section — confirm against the loader.
  topologies:
    - nodes: 2
      gpus_per_node: 8
  begin_size: 1k
  end_size: 16g
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  net_plugin: "none"  # quoted: the literal string "none", not a YAML null
  nvls_enable: 1
  split_data_on_qps: null
  extra_env: {}
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75

stress:
  # 10 min — reaches thermal steady state, validates throttle/jitter beyond
  # warmup
  duration_sec: 600
  use_doubles: false
  use_tensor_cores: true
  memory_pct: 90
  gpus: all

rdma:
  min_bandwidth_gbps: 50
  max_latency_us: 10
  ib_iterations: 1000
  msg_size: 65536
  ib_device: null
  ib_port: 1
  # Cross-node (two-host) RDMA via perftest, orchestrated over SSH from the
  # CLIENT node. Replaces the old scripts/rdma_cross_node.sh. Run on the
  # client; it starts ib_write_bw/ib_write_lat servers on `server` over SSH
  # (passwordless required), then drives the local client per device.
  # NOTE(review): cross_node is assumed to be nested under rdma — confirm.
  cross_node:
    enabled: false  # set true on the client node to run cross-node RDMA
    server: null  # peer ssh address, e.g. 172.72.8.12 (server node)
    server_addr: null  # OOB addr client connects to (default: same as server)
    ssh_user: root
    # e.g. [mlx5_0, mlx5_1, mlx5_6, mlx5_7]; [] = auto-detect active IB
    devices: []
    ib_port: 1
    gid_index: null  # -x for RoCE; null for pure InfiniBand
    msg_size: 1048576  # 1 MiB — large enough to reach NDR400 peak
    iters: 5000
    base_oob_port: 18515  # per-device OOB port = base + device index
    server_warmup_sec: 2.0
    min_bandwidth_gbps: 350  # per-port PASS floor (NDR400 ≈ 0.9 × 400)
    max_latency_us: 5

training:
  model: gpt2
  batch_size: 8
  seq_length: 2048
  num_steps: 50
  dtype: bf16

report:
  output_dir: ./reports
  format: json

tools:
  install_dir: /opt/gpu-test-tools