# GPU type: `auto` to auto-detect, or override with one of
# a100/a800/h100/h200/b200/b300.
gpu_type: auto

# Benchmark settings, split into a `memory` and a `compute` subsection.
benchmark:
  memory:
    # Keys ending in `_mb` carry sizes in megabytes (per the key suffix).
    size_mb: 4096
    iterations: 10
    # NOTE(review): the `nvbandwidth_*` keys presumably configure NVIDIA's
    # nvbandwidth tool — confirm against the consumer.
    nvbandwidth_buffer_mb: 512
    nvbandwidth_samples: 3
  compute:
    # Data types listed for the compute benchmark.
    # NOTE(review): not every GPU type accepted by `gpu_type` supports every
    # entry (e.g. fp8) — confirm the consumer skips unsupported dtypes.
    dtypes:
      - fp32
      - tf32
      - fp16
      - bf16
      - fp8
      - fp64
      - int8
    matrix_size: 8192
    # Presumably 50 untimed warm-up runs followed by 500 measured runs —
    # TODO confirm.
    warmup: 50
    iterations: 500
    # NOTE(review): presumably toggles a compiled execution path (e.g.
    # torch.compile) — confirm against the benchmark code.
    use_compile: true

# Health-check thresholds.
health:
  # The warning threshold (75) sits below the critical threshold (85).
  # Units are presumably degrees Celsius — TODO confirm.
  temp_warning: 75
  temp_critical: 85
  power_limit: null  # null = auto-detect from GPU TDP per gpu_specs.py

# NCCL test settings: which collectives to test, at which message sizes,
# and the acceptance thresholds.
nccl:
  min_bandwidth_gbps: null  # null = auto-detect (40% of GPU NVLink BW)
  # One boolean switch per collective; all are enabled here.
  test_allreduce: true
  test_alltoall: true
  test_broadcast: true
  test_reduce_scatter: true
  test_allgather: true
  test_sendrecv: true
  # Sizes are plain YAML strings with a unit suffix (M/G), not numbers, so
  # the consumer has to parse the suffix itself.
  message_sizes:
    - 1M
    - 256M
    - 2G
  repeats: 3
  # Percentage, per the `_pct` suffix. Presumably the allowed standard
  # deviation across the `repeats` runs — TODO confirm.
  max_stddev_pct: 3

# Multi-node NCCL test settings. Disabled in this file (`enabled: false`).
multinode_nccl:
  enabled: false
  mode: sweep
  # Two hosts with 8 slots each; this matches the single entry under
  # `topologies` below (nodes: 2, gpus_per_node: 8).
  # NOTE(review): 172.72.0.0/16 lies outside the RFC 1918 private range
  # 172.16.0.0/12 — confirm these addresses are intentional.
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  # Host-specific absolute paths: they pin OpenMPI 4.1.9a1, a Python 3.10
  # virtualenv under /root, and CUDA 12.4. Update them together when the
  # target image differs.
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null  # null = tools.install_dir/nccl-tests/build
  # Names here look like nccl-tests binaries (`*_perf`) — presumably
  # resolved inside `nccl_tests_dir`; TODO confirm.
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
  # Size bounds are plain strings with a lowercase unit suffix (k/g).
  # NOTE(review): presumably passed to nccl-tests as -b/-e/-f — confirm.
  begin_size: 1k
  end_size: 16g
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  # NOTE(review): the keys below presumably map onto NCCL environment
  # variables (NCCL_SOCKET_IFNAME, NCCL_IB_GID_INDEX, NCCL_IB_SL, NCCL_IB_TC,
  # NCCL_IB_HCA, NCCL_IB_TIMEOUT, NCCL_IB_QPS_PER_CONNECTION,
  # NCCL_MIN_NCHANNELS, NCCL_NET_PLUGIN, NCCL_NVLS_ENABLE,
  # NCCL_IB_SPLIT_DATA_ON_QPS) — verify against the launcher.
  socket_ifname: bond0
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  # A single comma-separated string, not a YAML list.
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: 4
  min_nchannels: 4
  # This is the literal string "none", not YAML null (only `null`/`~`
  # parse as null).
  net_plugin: none
  # Integers 1, not booleans.
  nvls_enable: 1
  split_data_on_qps: 1
  # Per-collective minimum peak bus bandwidth.
  # NOTE(review): the keys (allreduce/alltoall) are spelled differently from
  # the entries under `tests` (all_reduce_perf/alltoall_perf) — confirm how
  # the consumer maps one to the other.
  # NOTE(review): nccl-tests reports busbw in GB/s while the key says
  # `gbps` — confirm which unit these thresholds use.
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75

# Stress-test settings: load shape, telemetry sampling and pass/fail limits.
stress:
  # Both durations are 1800 seconds (30 minutes) in this file.
  duration_sec: 1800
  production_duration_sec: 1800
  use_gpu_burn: false
  use_doubles: false
  use_tensor_cores: true
  memory_pct: 90
  # The string "all" rather than a list of indices.
  gpus: all
  dtype: bf16
  matrix_size: 24576
  telemetry_interval_sec: 1
  warmup_sec: 60
  min_steady_samples: 10
  # 80 falls between health.temp_warning (75) and health.temp_critical (85).
  max_temp_c: 80
  max_temp_delta_c: 5
  # NOTE(review): a fixed 630 W floor sits alongside `gpu_type: auto`; some
  # GPU types listed for `gpu_type` (e.g. a100) have a TDP below this value —
  # confirm the consumer scales or overrides it per GPU.
  min_power_watts: 630
  max_tflops_jitter_pct: 5
  require_tflops_jitter: true

# RDMA / InfiniBand test thresholds and test parameters.
rdma:
  # NOTE(review): 47 next to a 400 Gb/s port rate suggests this threshold may
  # actually be in GB/s (47 GB/s = 376 Gb/s) despite the `gbps` suffix —
  # confirm the unit the consumer compares against.
  min_bandwidth_gbps: 47
  min_port_rate_gbps: 400
  # Latency limits in microseconds (per the `_us` suffix). The generic limit
  # equals the read limit (3.5); the write limit is tighter (2.0).
  max_latency_us: 3.5
  max_write_latency_us: 2.0
  max_read_latency_us: 3.5
  ib_iterations: 1000
  # 4194304 = 4 * 1024 * 1024; presumably bytes — TODO confirm.
  msg_size: 4194304
  latency_msg_size: 8
  # Left unset (null) here. How the consumer resolves unset values is not
  # visible in this file.
  ib_device: null
  ib_port: 1
  server_addr: null
  ibping_target: null
  ibping_count: 5
  role: auto
  pfc_ecn_counters: true

# NVLink expectations.
nvlink:
  # NOTE(review): these are fixed values while `gpu_type` is `auto`; NVLink
  # link count and per-link speed differ between GPU generations — confirm
  # the consumer overrides them per detected GPU.
  expected_links_per_gpu: 18
  expected_link_speed_gbps: 25
  require_zero_errors: true

# DCGM diagnostic settings.
dcgm:
  diag_level: 3
  # 3600 seconds = 1 hour.
  timeout_sec: 3600
  # Matches the 8 slots per host and `gpus_per_node: 8` under
  # multinode_nccl.
  expected_num_gpus: 8
  json_output: true
  require_subtests: true

# Training smoke-test settings and acceptance thresholds.
training:
  # The model name and `synthetic_params_b` below agree on 1.5 (presumably
  # billions of parameters — TODO confirm).
  model: synthetic_1.5b
  batch_size: 8
  seq_length: 2048
  num_steps: 50
  warmup_steps: 5
  dtype: bf16
  mode: ddp
  synthetic_params_b: 1.5
  # Pass/fail limits; units follow the key suffixes (tokens per second,
  # percent, GB).
  min_tokens_per_sec: 45000
  max_step_jitter_pct: 3
  max_peak_memory_gb: 70
  require_distributed: true

# Report output settings.
report:
  # Relative path; presumably resolved against the process working
  # directory — TODO confirm.
  output_dir: ./reports
  format: json

# External tool installation settings.
tools:
  # Referenced by the multinode_nccl.nccl_tests_dir comment: when that key
  # is null, the default is <install_dir>/nccl-tests/build.
  install_dir: /opt/gpu-test-tools