# test_gpu_scripts/configs/multinode_nccl_nccl227_pdf_matrix.yaml
# (file-viewer header: 92 lines, 2.2 KiB, YAML)

# Tool settings. `install_dir` must be nested under `tools`; without the
# indentation `tools` parses as null and `install_dir` becomes a top-level key.
tools:
  install_dir: /opt/gpu-test-tools
# Report settings: output directory and report format. Both keys are nested
# under `report`; unindented they would become unrelated top-level keys.
report:
  output_dir: ./reports
  format: md
# Multi-node NCCL test settings.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost; confirm it against the schema of the tool
# that loads this config before relying on it.
multinode_nccl:
  enabled: true
  mode: cross-leaf-pdf-matrix-nccl-2.27.7

  # Hosts taking part in the run; each entry has a name, an address and a
  # slot count.
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true

  # MPI launcher and library search paths.
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null

  # Test binaries to run.
  tests:
    - all_reduce_perf
    - alltoall_perf

  # One entry per node/GPU layout. Each entry carries its own
  # `min_peak_busbw_gbps` values and, for some layouts, per-operation
  # environment overrides under `op_env`.
  # NOTE(review): presumably the per-topology values override the
  # `min_peak_busbw_gbps` mapping at the end of this section -- confirm.
  topologies:
    - nodes: 2
      gpus_per_node: 1
      label: 2 nodes x 1 GPU (PDF 2 machines 2 GPUs)
      min_peak_busbw_gbps:
        allreduce: 48.90
        alltoall: 27.25
    - nodes: 2
      gpus_per_node: 2
      label: 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs)
      min_peak_busbw_gbps:
        allreduce: 136.93
        alltoall: 54.41
    - nodes: 2
      gpus_per_node: 4
      label: 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs)
      # Quoted so every YAML loader reads the device list as a string; some
      # loaders accept commas inside integers and would otherwise coerce it.
      cuda_visible_devices: "0,1,4,5"
      op_env:
        alltoall:
          NCCL_IB_QPS_PER_CONNECTION: 4
          NCCL_MIN_NCHANNELS: 4
          NCCL_IB_SPLIT_DATA_ON_QPS: 1
      min_peak_busbw_gbps:
        allreduce: 335.48
        alltoall: 73.73
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs)
      op_env:
        alltoall:
          NCCL_PXN_DISABLE: 1
      min_peak_busbw_gbps:
        allreduce: 491.84
        alltoall: 76.54

  # Message-size sweep and run limits. begin_size equals end_size, so only a
  # single size (16G) is listed.
  # NOTE(review): these keys are placed at the `multinode_nccl` level rather
  # than inside the last topology entry -- confirm against the loader.
  begin_size: 16G
  end_size: 16G
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  debug: INFO

  # Network interface and SSH launcher options.
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"

  # InfiniBand / NCCL tuning. Keys set to null carry no value here.
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  # `none` is a plain string here, not YAML null.
  net_plugin: none
  nvls_enable: 1
  split_data_on_qps: null

  # Additional variables, keyed by NCCL environment variable name.
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: 5
    NCCL_NET_GDR_READ: 1
    NCCL_DMABUF_ENABLE: 0

  # Section-level bus-bandwidth floors, both 0.
  # NOTE(review): presumably the default used when a topology does not set
  # its own `min_peak_busbw_gbps` -- confirm.
  min_peak_busbw_gbps:
    allreduce: 0
    alltoall: 0