# GPU type: auto-detect or override to a100/a800/h100/h200/b200/b300
gpu_type: auto

# Benchmark parameters, split into a memory and a compute subsection.
benchmark:
  memory:
    size_mb: 4096
    iterations: 10
    nvbandwidth_buffer_mb: 512
    nvbandwidth_samples: 3
  compute:
    dtypes:
      - fp32
      - tf32
      - fp16
      - bf16
      - fp8
      - fp64
      - int8
    matrix_size: 8192
    warmup: 50
    iterations: 500
    use_compile: true

# Health thresholds (temperature warning/critical levels and power limit).
health:
  temp_warning: 75
  temp_critical: 85
  power_limit: null  # null = auto-detect from GPU TDP per gpu_specs.py

# NCCL settings: per-collective test toggles, message sizes, and repeat /
# variance limits.
nccl:
  min_bandwidth_gbps: null  # null = auto-detect (40% of GPU NVLink BW)
  test_allreduce: true
  test_alltoall: true
  test_broadcast: true
  test_reduce_scatter: true
  test_allgather: true
  test_sendrecv: true
  message_sizes:
    - 1M
    - 256M
    - 2G
  repeats: 3
  max_stddev_pct: 3

# Multi-node NCCL settings (disabled unless `enabled` is set to true).
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file. Keys following `hosts` and `topologies` are assumed to belong to
# `multinode_nccl` itself, not to the preceding list item - confirm against
# the consumer's schema.
multinode_nccl:
  enabled: false
  mode: sweep
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null  # null = tools.install_dir/nccl-tests/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
  begin_size: 1k
  end_size: 16g
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  socket_ifname: bond0
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: 4
  min_nchannels: 4
  net_plugin: none
  nvls_enable: 1
  split_data_on_qps: 1
  extra_env: {}
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75

# Stress-test settings: durations, workload shape, telemetry sampling, and
# pass/fail limits.
stress:
  duration_sec: 1800
  production_duration_sec: 1800
  use_gpu_burn: false
  use_doubles: false
  use_tensor_cores: true
  memory_pct: 90
  gpus: all
  dtype: bf16
  matrix_size: 24576
  telemetry_interval_sec: 1
  warmup_sec: 60
  min_steady_samples: 10
  max_temp_c: 80
  max_temp_delta_c: 5
  min_power_watts: 630
  max_tflops_jitter_pct: 5
  require_tflops_jitter: true

# RDMA settings: bandwidth/latency limits, test message sizes, and device /
# endpoint selection (null-valued keys are left unset here).
rdma:
  min_bandwidth_gbps: 47
  min_port_rate_gbps: 400
  max_latency_us: 3.5
  max_write_latency_us: 2.0
  max_read_latency_us: 3.5
  ib_iterations: 1000
  msg_size: 4194304
  latency_msg_size: 8
  ib_device: null
  ib_port: 1
  server_addr: null
  ibping_target: null
  ibping_count: 5
  role: auto
  pfc_ecn_counters: true

# NVLink expectations: link count per GPU, link speed, and error policy.
nvlink:
  expected_links_per_gpu: 18
  expected_link_speed_gbps: 25
  require_zero_errors: true

# DCGM settings: diagnostic level, timeout, expected GPU count, and output
# options.
dcgm:
  diag_level: 3
  timeout_sec: 3600
  expected_num_gpus: 8
  json_output: true
  require_subtests: true

# Training settings: model and run shape, followed by acceptance limits.
training:
  model: synthetic_1.5b
  batch_size: 8
  seq_length: 2048
  num_steps: 50
  warmup_steps: 5
  dtype: bf16
  mode: ddp
  synthetic_params_b: 1.5
  min_tokens_per_sec: 45000
  max_step_jitter_pct: 3
  max_peak_memory_gb: 70
  require_distributed: true

# Report output directory and format.
report:
  output_dir: ./reports
  format: json

# Tool installation directory.
tools:
  install_dir: /opt/gpu-test-tools