# test_gpu_scripts/configs/multinode_nccl_nccl227_pdf_matrix.yaml

# Installation prefix for the GPU test tools.
tools:
  install_dir: "/opt/gpu-test-tools"

# Report output: destination directory and file format.
report:
  # Relative path; presumably resolved against the runner's working
  # directory — confirm.
  output_dir: "./reports"
  format: "md"

# Multi-node NCCL benchmark settings for the NCCL 2.27.7 "PDF matrix" mode.
multinode_nccl:
  enabled: true
  mode: "cross-leaf-pdf-matrix-nccl-2.27.7"

  # Participating hosts; each entry declares 8 slots.
  hosts:
    - name: "nccl-gpu-1"
      addr: "172.72.8.12"
      slots: 8
    - name: "nccl-gpu-2"
      addr: "172.72.8.16"
      slots: 8

  # SSH access to the hosts.
  ssh_user: "root"
  ssh_preflight: true

  # Open MPI 4.1.9a1 launcher; mpi_ld_preload is left unset.
  mpirun_path: "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun"
  mpi_ld_preload: null

  # Extra library directories: Open MPI, the NCCL 2.27.7 (CUDA 12.4) build,
  # and the CUDA 12.4 libraries.
  # NOTE(review): the NCCL build lives under /tmp — confirm it is staged on
  # every host before a run.
  extra_ld_library_path:
    - "/usr/mpi/gcc/openmpi-4.1.9a1/lib"
    - "/tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu"
    - "/usr/local/cuda-12.4/targets/x86_64-linux/lib"

  # Unset here; presumably the runner falls back to a default location —
  # confirm.
  nccl_tests_dir: null

  # Test binaries to run.
  tests:
    - "all_reduce_perf"
    - "alltoall_perf"
  # Benchmark matrix: one entry per (nodes x GPUs-per-node) shape. Each
  # entry carries its own minimum peak bus-bandwidth thresholds (the key
  # name says gbps — confirm the unit against the runner), and each label
  # names the matching PDF case.
  # Values under op_env are quoted so they load as strings, which is what
  # an environment variable value is.
  topologies:
    - nodes: 2
      gpus_per_node: 1
      label: "2 nodes x 1 GPU (PDF 2 machines 2 GPUs)"
      min_peak_busbw_gbps:
        allreduce: 48.90
        alltoall: 27.25
    - nodes: 2
      gpus_per_node: 2
      label: "2 nodes x 2 GPUs (PDF 2 machines 4 GPUs)"
      min_peak_busbw_gbps:
        allreduce: 136.93
        alltoall: 54.41
    - nodes: 2
      gpus_per_node: 4
      label: "2 nodes x 4 GPUs (PDF 2 machines 8 GPUs)"
      # Quoted: a comma-separated device list, not a number.
      cuda_visible_devices: "0,1,4,5"
      # Extra NCCL settings nested under the alltoall key; presumably
      # applied to that test only — confirm against the runner.
      op_env:
        alltoall:
          NCCL_IB_QPS_PER_CONNECTION: "4"
          NCCL_MIN_NCHANNELS: "4"
          NCCL_IB_SPLIT_DATA_ON_QPS: "1"
      min_peak_busbw_gbps:
        allreduce: 335.48
        alltoall: 73.73
    - nodes: 2
      gpus_per_node: 8
      label: "2 nodes x 8 GPUs (PDF 2 machines 16 GPUs)"
      op_env:
        alltoall:
          NCCL_PXN_DISABLE: "1"
      min_peak_busbw_gbps:
        allreduce: 491.84
        alltoall: 76.54
  # Message-size sweep; begin and end are both 16G.
  begin_size: "16G"
  end_size: "16G"
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  debug: "INFO"

  # Interface selection: bond0 for both the socket and OOB TCP settings.
  socket_ifname: "bond0"
  oob_tcp_ifname: "bond0"
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"

  # InfiniBand/RoCE parameters; presumably mapped to the matching NCCL_IB_*
  # variables by the runner — confirm.
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: "mlx5_0,mlx5_1,mlx5_6,mlx5_7"
  ib_timeout: 22

  # null entries are left unset at this level.
  qps_per_connection: null
  min_nchannels: null
  # The literal string "none", not a YAML null.
  net_plugin: "none"
  nvls_enable: 1
  split_data_on_qps: null
  # Additional environment variables. Every value is quoted so the whole
  # map loads as strings (NCCL_DEBUG_SUBSYS already did); environment
  # variable values are strings, so this avoids relying on the consumer to
  # convert integers.
  extra_env:
    NCCL_DEBUG_SUBSYS: "INIT,NET"
    NCCL_NET_GDR_LEVEL: "5"
    NCCL_NET_GDR_READ: "1"
    NCCL_DMABUF_ENABLE: "0"
  # Section-level minimum peak bus-bandwidth thresholds, both 0. Every
  # topology entry in this file sets its own values; presumably these are
  # only the fallback — confirm against the runner.
  min_peak_busbw_gbps: {allreduce: 0, alltoall: 0}