# test_gpu_scripts/configs/default.yaml
# (listing metadata from the source viewer: 159 lines, 3.1 KiB, YAML)

# GPU type. `auto` auto-detects the GPU; to override, set one of:
# a100, a800, h100, h200, b200, b300.
gpu_type: auto
# Benchmark settings, grouped into `memory` and `compute`.
# NOTE(review): nesting was restored from a flattened listing. `iterations`
# appears in both groups, so they must be separate mappings; confirm the
# exact layout against the config loader.
benchmark:
  memory:
    size_mb: 4096
    iterations: 10
    nvbandwidth_buffer_mb: 512
    nvbandwidth_samples: 3
  compute:
    # Data types to run, in the order listed.
    dtypes:
      - fp32
      - tf32
      - fp16
      - bf16
      - fp8
      - fp64
      - int8
    matrix_size: 8192
    warmup: 50
    iterations: 500
    use_compile: true
# Health-check thresholds.
# NOTE(review): nesting restored from a flattened listing; assumed to be a
# top-level section rather than a child of `benchmark` - confirm.
health:
  temp_warning: 75
  temp_critical: 85
  power_limit: null  # null = auto-detect from GPU TDP per gpu_specs.py
# Single-node NCCL test settings: which collectives to run, the message sizes
# to use, and pass/fail limits.
# NOTE(review): nesting restored from a flattened listing; `repeats` and
# `max_stddev_pct` are assumed to belong to `nccl`, not to `message_sizes`.
nccl:
  min_bandwidth_gbps: null  # null = auto-detect (40% of GPU NVLink BW)
  test_allreduce: true
  test_alltoall: true
  test_broadcast: true
  test_reduce_scatter: true
  test_allgather: true
  test_sendrecv: true
  message_sizes:
    - 1M
    - 256M
    - 2G
  repeats: 3
  max_stddev_pct: 3
# Multi-node NCCL test settings (disabled by default via `enabled: false`).
# NOTE(review): nesting restored from a flattened listing. Each `hosts` entry
# is assumed to carry `name`, `addr` and `slots`; each `topologies` entry is
# assumed to carry only `nodes` and `gpus_per_node`, with `begin_size` and the
# keys after it belonging to `multinode_nccl` itself. Confirm against the
# config loader.
# NOTE(review): the host addresses, `ssh_user`, and the MPI/CUDA/venv paths
# look site-specific - confirm they are intended as shipped defaults.
multinode_nccl:
  enabled: false
  mode: sweep
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null  # null = tools.install_dir/nccl-tests/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
  begin_size: 1k
  end_size: 16g
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  socket_ifname: bond0
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: 4
  min_nchannels: 4
  # Quoted so it is unmistakably the string "none", not a YAML null.
  net_plugin: 'none'
  nvls_enable: 1
  split_data_on_qps: 1
  extra_env: {}
  # Per-test minimum peak bus bandwidth.
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75
# Stress-test settings: run durations, workload selection, telemetry sampling,
# and pass/fail limits.
# NOTE(review): nesting restored from a flattened listing; `dtype` and
# `matrix_size` also occur in other sections, so this must be its own mapping.
stress:
  duration_sec: 1800
  production_duration_sec: 1800
  use_gpu_burn: false
  use_doubles: false
  use_tensor_cores: true
  memory_pct: 90
  gpus: all
  dtype: bf16
  matrix_size: 24576
  telemetry_interval_sec: 1
  warmup_sec: 60
  min_steady_samples: 10
  max_temp_c: 80
  max_temp_delta_c: 5
  min_power_watts: 630
  max_tflops_jitter_pct: 5
  require_tflops_jitter: true
# RDMA / InfiniBand test settings and pass/fail limits.
# NOTE(review): nesting restored from a flattened listing;
# `min_bandwidth_gbps` also occurs under `nccl`, so this must be its own
# mapping.
rdma:
  # NOTE(review): 47 against a 400 port rate suggests this threshold may be in
  # GB/s rather than Gb/s - confirm the unit the checker compares against.
  min_bandwidth_gbps: 47
  min_port_rate_gbps: 400
  max_latency_us: 3.5
  max_write_latency_us: 2.0
  max_read_latency_us: 3.5
  ib_iterations: 1000
  msg_size: 4194304
  latency_msg_size: 8
  ib_device: null
  ib_port: 1
  server_addr: null
  ibping_target: null
  ibping_count: 5
  role: auto
  pfc_ecn_counters: true
# NVLink expectations checked per GPU.
# NOTE(review): nesting restored from a flattened listing - confirm.
nvlink:
  expected_links_per_gpu: 18
  expected_link_speed_gbps: 25
  require_zero_errors: true
# DCGM diagnostic settings.
# NOTE(review): nesting restored from a flattened listing; `timeout_sec` also
# occurs under `multinode_nccl`, so this must be its own mapping.
dcgm:
  diag_level: 3
  timeout_sec: 3600
  expected_num_gpus: 8
  json_output: true
  require_subtests: true
# Training-test settings: synthetic model shape, run length, and pass/fail
# limits.
# NOTE(review): nesting restored from a flattened listing; `dtype` also occurs
# under `stress`, so this must be its own mapping.
training:
  model: synthetic_1.5b
  batch_size: 8
  seq_length: 2048
  num_steps: 50
  warmup_steps: 5
  dtype: bf16
  mode: ddp
  synthetic_params_b: 1.5
  min_tokens_per_sec: 45000
  max_step_jitter_pct: 3
  max_peak_memory_gb: 70
  require_distributed: true
# Report output location and format.
# NOTE(review): nesting restored from a flattened listing - confirm.
report:
  output_dir: ./reports
  format: json
# Tool installation root. Referenced in this file as `tools.install_dir`
# (the fallback base for `nccl_tests_dir` when that key is null).
tools:
  install_dir: /opt/gpu-test-tools