# GPU type selector. `auto` = auto-detect the installed GPU; to override, set one of:
# a100, a800, h100, h800, h200, h20, b200, b300.
gpu_type: auto

# Micro-benchmark settings, split into a memory section and a compute (GEMM) section.
benchmark:
  # Memory benchmark knobs. The *_mb keys are sizes in MB going by their suffix.
  # NOTE(review): the nvbandwidth_* keys presumably configure NVIDIA's nvbandwidth
  # tool — confirm against the consumer.
  memory:
    size_mb: 4096
    iterations: 10
    nvbandwidth_buffer_mb: 512
    nvbandwidth_samples: 3
  # GEMM throughput benchmark knobs.
  compute:
    # Data types to measure; each one is run over the shape sweep described below.
    dtypes:
      - fp32
      - tf32
      - fp16
      - bf16
      - fp8
    # MAMF-style shape sweep: measure each dtype at every shape below and keep the max
    # TFLOPS (the realistic achievable peak). A single fixed shape under-reports by
    # ~7-12% and can't meet the MAMF-calibrated thresholds in gpu_specs.py.
    # Each entry is either N (square N×N×N) or [M, N, K]. K-heavy non-square shapes
    # (e.g. 2048×2048×13312) hit the true Hopper MAMF — bf16 ~790 vs ~755 square.
    # Empty list => single matrix_size shape (legacy behaviour).
    sweep_sizes:
      - 3584
      - 4608
      - 5376
      - 8192
      - 11520
      - [2048, 2048, 13312]
      - [2048, 2048, 16384]
    matrix_size: 8192        # fallback shape when sweep_sizes is empty
    # NOTE(review): presumably untimed warmup runs followed by timed iterations for
    # each (dtype, shape) pair — confirm against the benchmark code.
    warmup: 20
    iterations: 80
    # NOTE: torch.compile was dropped — on H100 eager cuBLAS beats Triton for plain
    # GEMM, and compiling would re-autotune per shape and make the sweep very slow.

# Health-check thresholds: a warning level and a higher critical level for
# temperature, plus a power limit.
health:
  # NOTE(review): units are not stated in this file — presumably °C; confirm.
  temp_warning: 75
  temp_critical: 85
  power_limit: null  # null = auto-detect from GPU TDP per gpu_specs.py

# NCCL collective test settings. Judging by the key names, each test_* flag toggles
# one collective. NOTE(review): presumably the single-node counterpart of
# `multinode_nccl` — confirm.
nccl:
  min_bandwidth_gbps: null  # null = auto-detect (40% of GPU NVLink BW)
  # Currently enabled: allreduce, alltoall, broadcast.
  test_allreduce: true
  test_alltoall: true
  test_broadcast: true
  # Currently disabled: reduce_scatter, allgather, sendrecv.
  test_reduce_scatter: false
  test_allgather: false
  test_sendrecv: false

# Multi-node NCCL test configuration. Off by default (`enabled: false`).
# NOTE(review): the keys reference mpirun, SSH and an nccl-tests build directory, so
# this presumably launches nccl-tests binaries across `hosts` via mpirun — confirm
# against the runner.
multinode_nccl:
  enabled: false
  mode: sweep
  # Participating hosts: two entries with 8 slots each, which matches the single
  # 2-node × 8-GPU entry under `topologies` below.
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  # Absolute, version-pinned paths (OpenMPI 4.1.9a1, a Python 3.10 venv's NCCL libs,
  # CUDA 12.4). NOTE(review): site-specific — adjust when the install layout differs.
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null  # null = tools.install_dir/nccl-tests/build
  # Test names; these match nccl-tests binary names.
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
  # Message-size sweep: from begin_size to end_size, multiplying by step_factor.
  # NOTE(review): presumably forwarded as nccl-tests -b / -e / -f — confirm.
  begin_size: 1k
  end_size: 16g
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  # Both interface selectors point at bond0. NOTE(review): presumably they feed
  # NCCL_SOCKET_IFNAME and OpenMPI's OOB TCP interface selection — confirm.
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  # OpenSSH options: auto-accept host keys of previously unknown hosts (changed keys
  # are still rejected), 10 s connect timeout, keepalive probe every 30 s.
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  # InfiniBand / NCCL tuning. NOTE(review): the names suggest the matching NCCL_*
  # environment variables (e.g. NCCL_IB_GID_INDEX, NCCL_IB_SL, NCCL_IB_TC,
  # NCCL_IB_HCA, NCCL_IB_TIMEOUT); null presumably means "leave unset" — confirm.
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  net_plugin: none  # the literal string "none", not YAML null
  nvls_enable: 1
  split_data_on_qps: null
  extra_env: {}
  # Per-collective thresholds on peak bus bandwidth. Keys here (allreduce, alltoall)
  # are spelled differently from the binary names under `tests`.
  # NOTE(review): confirm the unit (GB/s vs Gb/s) with the consumer.
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75

# Stress test settings.
# NOTE(review): use_doubles / use_tensor_cores resemble gpu-burn options — confirm
# which tool consumes them.
stress:
  duration_sec: 600           # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup
  use_doubles: false
  use_tensor_cores: true
  memory_pct: 90              # presumably % of GPU memory to use — confirm
  gpus: all

# RDMA test settings. The keys directly under `rdma` and those under `cross_node`
# overlap in name (ib_port, msg_size, min_bandwidth_gbps, max_latency_us) but carry
# different values. NOTE(review): presumably the top-level keys drive a local
# (single-host) test — confirm.
rdma:
  min_bandwidth_gbps: 50
  max_latency_us: 10
  ib_iterations: 1000
  msg_size: 65536             # 64 × 1024; presumably bytes, like cross_node.msg_size — confirm
  ib_device: null
  ib_port: 1
  # Cross-node (two-host) RDMA via perftest, orchestrated over SSH from the CLIENT
  # node. Replaces the old scripts/rdma_cross_node.sh. Run on the client; it starts
  # ib_write_bw/ib_write_lat servers on `server` over SSH (passwordless required),
  # then drives the local client per device.
  cross_node:
    enabled: false              # set true on the client node to run cross-node RDMA
    server: null                # peer ssh address, e.g. 172.72.8.12 (server node)
    server_addr: null           # OOB addr client connects to (default: = server)
    ssh_user: root
    devices: []                 # e.g. [mlx5_0, mlx5_1, mlx5_6, mlx5_7]; [] = auto-detect active IB
    ib_port: 1
    gid_index: null             # -x <n> for RoCE; null for pure InfiniBand
    msg_size: 1048576           # 1 MiB — large enough to reach NDR400 peak
    iters: 5000
    base_oob_port: 18515        # per-device OOB port = base + device index
    server_warmup_sec: 2.0
    min_bandwidth_gbps: 350     # per-port PASS floor (350 ≈ 0.9 × the 400 of NDR400)
    max_latency_us: 5

# Settings for the training workload: model name, batch size, sequence length,
# number of steps and data type.
training:
  model: gpt2
  batch_size: 8
  seq_length: 2048
  num_steps: 50
  dtype: bf16

# Report output settings.
# NOTE(review): output_dir is a relative path; what it is resolved against (working
# directory vs. this file's location) is not visible here — confirm.
report:
  output_dir: ./reports
  format: json

# External tool installation root. multinode_nccl.nccl_tests_dir falls back to
# <install_dir>/nccl-tests/build when it is left as null.
tools:
  install_dir: /opt/gpu-test-tools