---
# Configuration for the GPU test suite: one top-level section per test area,
# plus report output and tool installation settings.
#
# NOTE(review): this file had been collapsed onto two physical lines, which
# turned everything after the first `#` into a comment. The nesting below was
# reconstructed from key order -- confirm it against the consumer's schema,
# especially (a) that the sections after `benchmark` are top-level and
# (b) that the sweep keys following `topologies` belong to `multinode_nccl`
# rather than to each topology entry.

# GPU type: auto-detect or override to a100/a800/h100/h200/b200/b300
gpu_type: auto

# Benchmark parameters, grouped into memory and compute.
benchmark:
  memory:
    size_mb: 4096
    iterations: 10
    nvbandwidth_buffer_mb: 512
    nvbandwidth_samples: 3
  compute:
    dtypes:
      - fp32
      - tf32
      - fp16
      - bf16
      - fp8
      - fp64
      - int8
    matrix_size: 8192
    warmup: 50
    iterations: 500
    use_compile: true

# Temperature thresholds and power limit for the health section.
health:
  temp_warning: 75
  temp_critical: 85
  power_limit: null  # null = auto-detect from GPU TDP per gpu_specs.py

# NCCL section: which collectives to test, at which message sizes.
nccl:
  min_bandwidth_gbps: null  # null = auto-detect (40% of GPU NVLink BW)
  test_allreduce: true
  test_alltoall: true
  test_broadcast: true
  test_reduce_scatter: true
  test_allgather: true
  test_sendrecv: true
  message_sizes:
    - 1M
    - 256M
    - 2G
  repeats: 3
  max_stddev_pct: 3

# Multi-node NCCL section; currently switched off (enabled: false).
multinode_nccl:
  enabled: false
  mode: sweep
  # Host names and addresses are specific to one cluster.
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  # NOTE(review): SSH user is root -- confirm this is intended.
  ssh_user: root
  ssh_preflight: true
  # The paths below pin specific OpenMPI, Python and CUDA versions.
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null  # null = tools.install_dir/nccl-tests/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
  # NOTE(review): the keys from here down are placed at the `multinode_nccl`
  # level; they may instead belong inside the topology entry above -- confirm.
  begin_size: 1k
  end_size: 16g
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  socket_ifname: bond0
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: 4
  min_nchannels: 4
  # Quoted so it is unmistakably the literal string "none", not a YAML null.
  net_plugin: 'none'
  nvls_enable: 1
  split_data_on_qps: 1
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75

# Stress section: run length, workload shape and steady-state limits.
stress:
  duration_sec: 1800
  production_duration_sec: 1800
  use_gpu_burn: false
  use_doubles: false
  use_tensor_cores: true
  memory_pct: 90
  gpus: all
  dtype: bf16
  matrix_size: 24576
  telemetry_interval_sec: 1
  warmup_sec: 60
  min_steady_samples: 10
  max_temp_c: 80
  max_temp_delta_c: 5
  min_power_watts: 630
  max_tflops_jitter_pct: 5
  require_tflops_jitter: true

# RDMA section: bandwidth/latency limits and test endpoints.
rdma:
  min_bandwidth_gbps: 47
  min_port_rate_gbps: 400
  max_latency_us: 3.5
  max_write_latency_us: 2.0
  max_read_latency_us: 3.5
  ib_iterations: 1000
  msg_size: 4194304
  latency_msg_size: 8
  ib_device: null
  ib_port: 1
  server_addr: null
  ibping_target: null
  ibping_count: 5
  role: auto
  pfc_ecn_counters: true

# NVLink section: expected link count and speed per GPU.
nvlink:
  expected_links_per_gpu: 18
  expected_link_speed_gbps: 25
  require_zero_errors: true

# DCGM section: diagnostic level, timeout and expected GPU count.
dcgm:
  diag_level: 3
  timeout_sec: 3600
  expected_num_gpus: 8
  json_output: true
  require_subtests: true

# Training section: model, batch shape and limits.
training:
  model: synthetic_1.5b
  batch_size: 8
  seq_length: 2048
  num_steps: 50
  warmup_steps: 5
  dtype: bf16
  mode: ddp
  synthetic_params_b: 1.5
  min_tokens_per_sec: 45000
  max_step_jitter_pct: 3
  max_peak_memory_gb: 70
  require_distributed: true

# Report output directory and format.
report:
  output_dir: ./reports
  format: json

# Tool installation directory; `multinode_nccl.nccl_tests_dir` falls back to
# a path under it when left null (per the comment on that key).
tools:
  install_dir: /opt/gpu-test-tools