---
# GPU test tool configuration: tool install location, report output, and the
# multi-node NCCL test matrix.
#
# NOTE(review): this file was reconstructed from a copy whose line breaks and
# indentation had been lost. The nesting below is inferred from key order and
# repetition -- confirm it against the consuming tool's schema.

tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: cross-leaf-pdf-matrix-nccl-2.27.7

  # Hosts taking part in the run; each entry carries a name, an address and a
  # slot count.
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8

  ssh_user: root
  ssh_preflight: true

  # MPI launcher and library search paths.
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null

  tests:
    - all_reduce_perf
    - alltoall_perf

  # One entry per node/GPU layout. Each entry sets its own
  # min_peak_busbw_gbps values (presumably minimum acceptable peak bus
  # bandwidth per operation -- TODO confirm units and semantics).
  topologies:
    - nodes: 2
      gpus_per_node: 1
      label: 2 nodes x 1 GPU (PDF 2 machines 2 GPUs)
      min_peak_busbw_gbps:
        allreduce: 48.90
        alltoall: 27.25

    - nodes: 2
      gpus_per_node: 2
      label: 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs)
      min_peak_busbw_gbps:
        allreduce: 136.93
        alltoall: 54.41

    - nodes: 2
      gpus_per_node: 4
      label: 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs)
      # Quoted so the comma-separated device list is always read as a string.
      cuda_visible_devices: '0,1,4,5'
      # Environment variables keyed by operation name (here: alltoall only).
      op_env:
        alltoall:
          NCCL_IB_QPS_PER_CONNECTION: 4
          NCCL_MIN_NCHANNELS: 4
          NCCL_IB_SPLIT_DATA_ON_QPS: 1
      min_peak_busbw_gbps:
        allreduce: 335.48
        alltoall: 73.73

    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs)
      op_env:
        alltoall:
          NCCL_PXN_DISABLE: 1
      min_peak_busbw_gbps:
        allreduce: 491.84
        alltoall: 76.54

  # NOTE(review): the keys from here down are assumed to sit at the
  # multinode_nccl level (shared by every topology) rather than inside the
  # last topology entry -- verify against the consumer.

  # Message size sweep and iteration settings.
  begin_size: 16G
  end_size: 16G
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  debug: INFO

  # Network interface and SSH launch settings.
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"

  # InfiniBand / NCCL transport settings.
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  net_plugin: none
  nvls_enable: 1
  split_data_on_qps: null

  # Additional environment variables.
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: 5
    NCCL_NET_GDR_READ: 1
    NCCL_DMABUF_ENABLE: 0

  # Section-level values; each topology above sets its own. Presumably the
  # fallback when a topology omits them -- TODO confirm.
  min_peak_busbw_gbps:
    allreduce: 0
    alltoall: 0