h100-acceptance-current #3
8
.gitignore
vendored
8
.gitignore
vendored
@ -6,6 +6,12 @@ __pycache__/
|
||||
dist/
|
||||
build/
|
||||
reports/
|
||||
reports_*
|
||||
H100*.md
|
||||
test_all*.md
|
||||
docs/h100_test_all_metrics_guide_cn.md
|
||||
docs/multinode_nccl_concepts.md
|
||||
docs/multinode_nccl_deep_diagnose_runbook.md
|
||||
*.egg
|
||||
.eggs/
|
||||
*.log
|
||||
@ -14,4 +20,6 @@ reports/
|
||||
.venv/
|
||||
venv/
|
||||
.qoder/*
|
||||
.playwright-mcp/
|
||||
.claude/settings.local.json
|
||||
.omx/
|
||||
|
||||
@ -48,6 +48,55 @@ nccl:
|
||||
test_allgather: false
|
||||
test_sendrecv: false
|
||||
|
||||
multinode_nccl:
|
||||
enabled: false
|
||||
mode: sweep
|
||||
hosts:
|
||||
- name: nccl-gpu-1
|
||||
addr: 172.72.8.12
|
||||
slots: 8
|
||||
- name: nccl-gpu-2
|
||||
addr: 172.72.8.16
|
||||
slots: 8
|
||||
ssh_user: root
|
||||
ssh_preflight: true
|
||||
mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
|
||||
mpi_ld_preload: null
|
||||
extra_ld_library_path:
|
||||
- /usr/mpi/gcc/openmpi-4.1.9a1/lib
|
||||
- /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
|
||||
- /usr/local/cuda-12.4/targets/x86_64-linux/lib
|
||||
nccl_tests_dir: null # null = tools.install_dir/nccl-tests/build
|
||||
tests:
|
||||
- all_reduce_perf
|
||||
- alltoall_perf
|
||||
topologies:
|
||||
- nodes: 2
|
||||
gpus_per_node: 8
|
||||
begin_size: 1k
|
||||
end_size: 16g
|
||||
step_factor: 2
|
||||
warmup_iters: 10
|
||||
gpus_per_rank: 1
|
||||
timeout_sec: 1800
|
||||
socket_ifname: bond0
|
||||
oob_tcp_ifname: bond0
|
||||
plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
|
||||
ib_gid_index: 3
|
||||
ib_sl: 5
|
||||
ib_tc: 136
|
||||
ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
|
||||
ib_timeout: 22
|
||||
qps_per_connection: null
|
||||
min_nchannels: null
|
||||
net_plugin: none
|
||||
nvls_enable: 1
|
||||
split_data_on_qps: null
|
||||
extra_env: {}
|
||||
min_peak_busbw_gbps:
|
||||
allreduce: 480
|
||||
alltoall: 75
|
||||
|
||||
stress:
|
||||
duration_sec: 600 # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup
|
||||
use_doubles: false
|
||||
|
||||
62
configs/multinode_nccl_diagnostic.yaml
Normal file
62
configs/multinode_nccl_diagnostic.yaml
Normal file
@ -0,0 +1,62 @@
|
||||
---
# Multi-node NCCL "diagnostic" profile.
# Two hosts with 8 slots each, one 2 x 8 topology, and a single message size
# (begin_size == end_size == 256M) with 1 warmup iteration and 3 iterations.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: diagnostic
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs diagnostic
  begin_size: 256M
  end_size: 256M
  step_factor: 2
  warmup_iters: 1
  iters: 3
  gpus_per_rank: 1
  timeout_sec: 600
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  # The literal string "none", not a YAML null.
  net_plugin: "none"
  nvls_enable: 1
  split_data_on_qps: null
  # Environment-variable values are quoted so they load as strings, not ints.
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: "5"
    NCCL_NET_GDR_READ: "1"
    NCCL_DMABUF_ENABLE: "0"
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75
|
||||
62
configs/multinode_nccl_nccl227_16g.yaml
Normal file
62
configs/multinode_nccl_nccl227_16g.yaml
Normal file
@ -0,0 +1,62 @@
|
||||
---
# Multi-node NCCL "large-message-nccl-2.27.7" profile.
# Two hosts with 8 slots each, one 2 x 8 topology, and a single message size
# (begin_size == end_size == 16G) with 1 warmup iteration and 3 iterations.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: large-message-nccl-2.27.7
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp; confirm it is
    # present on every host before each run.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs NCCL 2.27.7 16G
  begin_size: 16G
  end_size: 16G
  step_factor: 2
  warmup_iters: 1
  iters: 3
  gpus_per_rank: 1
  timeout_sec: 1200
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  # The literal string "none", not a YAML null.
  net_plugin: "none"
  nvls_enable: 1
  split_data_on_qps: null
  # Environment-variable values are quoted so they load as strings, not ints.
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: "5"
    NCCL_NET_GDR_READ: "1"
    NCCL_DMABUF_ENABLE: "0"
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75
|
||||
72
configs/multinode_nccl_nccl227_all_collectives_2x8.yaml
Normal file
72
configs/multinode_nccl_nccl227_all_collectives_2x8.yaml
Normal file
@ -0,0 +1,72 @@
|
||||
---
# Multi-node NCCL "cross-leaf-all-collectives-nccl-2.27.7" profile.
# Runs six nccl-tests binaries on one 2 x 8 topology at a single 16G message
# size. Only allreduce and alltoall carry non-zero bus-bandwidth thresholds.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: cross-leaf-all-collectives-nccl-2.27.7
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp; confirm it is
    # present on every host before each run.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
    - broadcast_perf
    - reduce_scatter_perf
    - all_gather_perf
    - sendrecv_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs (all collectives evidence run)
      # NOTE(review): op_env is assumed to belong to this topology entry;
      # confirm against the loader.
      op_env:
        alltoall:
          NCCL_PXN_DISABLE: "1"
  begin_size: 16G
  end_size: 16G
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  # The literal string "none", not a YAML null.
  net_plugin: "none"
  nvls_enable: 1
  split_data_on_qps: null
  # Environment-variable values are quoted so they load as strings, not ints.
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: "5"
    NCCL_NET_GDR_READ: "1"
    NCCL_DMABUF_ENABLE: "0"
  min_peak_busbw_gbps:
    allreduce: 491.84
    alltoall: 76.54
    broadcast: 0
    reducescatter: 0
    allgather: 0
    sendrecv: 0
|
||||
62
configs/multinode_nccl_nccl227_auto_16g.yaml
Normal file
62
configs/multinode_nccl_nccl227_auto_16g.yaml
Normal file
@ -0,0 +1,62 @@
|
||||
---
# Multi-node NCCL "large-message-nccl-2.27.7-auto" profile.
# Two hosts with 8 slots each, one 2 x 8 topology, and a single message size
# (begin_size == end_size == 16G) with 1 warmup iteration and 3 iterations.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: large-message-nccl-2.27.7-auto
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp; confirm it is
    # present on every host before each run.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G
  begin_size: 16G
  end_size: 16G
  step_factor: 2
  warmup_iters: 1
  iters: 3
  gpus_per_rank: 1
  timeout_sec: 1200
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  # The literal string "none", not a YAML null.
  net_plugin: "none"
  nvls_enable: 1
  split_data_on_qps: null
  # Environment-variable values are quoted so they load as strings, not ints.
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: "5"
    NCCL_NET_GDR_READ: "1"
    NCCL_DMABUF_ENABLE: "0"
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75
|
||||
62
configs/multinode_nccl_nccl227_diagnostic.yaml
Normal file
62
configs/multinode_nccl_nccl227_diagnostic.yaml
Normal file
@ -0,0 +1,62 @@
|
||||
---
# Multi-node NCCL "diagnostic-nccl-2.27.7" profile.
# Two hosts with 8 slots each, one 2 x 8 topology, and a single message size
# (begin_size == end_size == 256M) with 1 warmup iteration and 3 iterations.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: diagnostic-nccl-2.27.7
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp; confirm it is
    # present on every host before each run.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs NCCL 2.27.7
  begin_size: 256M
  end_size: 256M
  step_factor: 2
  warmup_iters: 1
  iters: 3
  gpus_per_rank: 1
  timeout_sec: 600
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  # The literal string "none", not a YAML null.
  net_plugin: "none"
  nvls_enable: 1
  split_data_on_qps: null
  # Environment-variable values are quoted so they load as strings, not ints.
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: "5"
    NCCL_NET_GDR_READ: "1"
    NCCL_DMABUF_ENABLE: "0"
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75
|
||||
91
configs/multinode_nccl_nccl227_pdf_matrix.yaml
Normal file
91
configs/multinode_nccl_nccl227_pdf_matrix.yaml
Normal file
@ -0,0 +1,91 @@
|
||||
---
# Multi-node NCCL "cross-leaf-pdf-matrix-nccl-2.27.7" profile.
# Four topologies (2 nodes x 1, 2, 4 and 8 GPUs), each with its own
# min_peak_busbw_gbps thresholds; the profile-level thresholds are 0.
# All topologies use a single 16G message size.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: cross-leaf-pdf-matrix-nccl-2.27.7
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp; confirm it is
    # present on every host before each run.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 1
      label: 2 nodes x 1 GPU (PDF 2 machines 2 GPUs)
      min_peak_busbw_gbps:
        allreduce: 48.90
        alltoall: 27.25
    - nodes: 2
      gpus_per_node: 2
      label: 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs)
      min_peak_busbw_gbps:
        allreduce: 136.93
        alltoall: 54.41
    - nodes: 2
      gpus_per_node: 4
      label: 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs)
      # Quoted: a comma-separated digit list is not resolved the same way by
      # every YAML loader.
      cuda_visible_devices: "0,1,4,5"
      op_env:
        alltoall:
          NCCL_IB_QPS_PER_CONNECTION: "4"
          NCCL_MIN_NCHANNELS: "4"
          NCCL_IB_SPLIT_DATA_ON_QPS: "1"
      min_peak_busbw_gbps:
        allreduce: 335.48
        alltoall: 73.73
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs)
      op_env:
        alltoall:
          NCCL_PXN_DISABLE: "1"
      min_peak_busbw_gbps:
        allreduce: 491.84
        alltoall: 76.54
  begin_size: 16G
  end_size: 16G
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  # The literal string "none", not a YAML null.
  net_plugin: "none"
  nvls_enable: 1
  split_data_on_qps: null
  # Environment-variable values are quoted so they load as strings, not ints.
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: "5"
    NCCL_NET_GDR_READ: "1"
    NCCL_DMABUF_ENABLE: "0"
  min_peak_busbw_gbps:
    allreduce: 0
    alltoall: 0
|
||||
62
configs/multinode_nccl_nccl227_sweep.yaml
Normal file
62
configs/multinode_nccl_nccl227_sweep.yaml
Normal file
@ -0,0 +1,62 @@
|
||||
---
# Multi-node NCCL "sweep-nccl-2.27.7" profile.
# Two hosts with 8 slots each, one 2 x 8 topology, message sizes from 1M to
# 4G with step_factor 4, 2 warmup iterations and 5 iterations.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: sweep-nccl-2.27.7
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp; confirm it is
    # present on every host before each run.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs NCCL 2.27.7 sweep
  begin_size: 1M
  end_size: 4G
  step_factor: 4
  warmup_iters: 2
  iters: 5
  gpus_per_rank: 1
  timeout_sec: 1200
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  # The literal string "none", not a YAML null.
  net_plugin: "none"
  nvls_enable: 1
  split_data_on_qps: null
  # Environment-variable values are quoted so they load as strings, not ints.
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: "5"
    NCCL_NET_GDR_READ: "1"
    NCCL_DMABUF_ENABLE: "0"
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75
|
||||
220
gpu_tester.py
220
gpu_tester.py
@ -5,6 +5,7 @@ import argparse
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
@ -25,6 +26,9 @@ from modules.nccl_test import NCCLTest
|
||||
from modules.training_sim import TrainingSim
|
||||
from modules.stress_test import StressTest
|
||||
from modules.rdma_test import RDMATest
|
||||
from modules.nvlink_test import NVLinkTest
|
||||
from modules.dcgm_test import DCGMTest
|
||||
from modules.multinode_nccl_test import MultiNodeNCCLTest
|
||||
from modules.report import ReportGenerator
|
||||
from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility
|
||||
|
||||
@ -32,43 +36,125 @@ DEFAULT_CONFIG = {
|
||||
"benchmark": {
|
||||
"memory": {"size_mb": 4096, "iterations": 10, "nvbandwidth_buffer_mb": 512, "nvbandwidth_samples": 3},
|
||||
"compute": {
|
||||
"dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8"],
|
||||
"matrix_size": 4096,
|
||||
"warmup": 10,
|
||||
"iterations": 100,
|
||||
"dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"],
|
||||
"matrix_size": 8192,
|
||||
"warmup": 50,
|
||||
"iterations": 500,
|
||||
"use_compile": True,
|
||||
},
|
||||
},
|
||||
"health": {"temp_warning": 80, "temp_critical": 90, "power_limit": None},
|
||||
"health": {"temp_warning": 75, "temp_critical": 85, "power_limit": None},
|
||||
"nccl": {
|
||||
"min_bandwidth_gbps": None,
|
||||
"test_allreduce": True,
|
||||
"test_alltoall": True,
|
||||
"test_broadcast": True,
|
||||
"test_reduce_scatter": False,
|
||||
"test_allgather": False,
|
||||
"test_sendrecv": False,
|
||||
"test_reduce_scatter": True,
|
||||
"test_allgather": True,
|
||||
"test_sendrecv": True,
|
||||
"message_sizes": ["1M", "256M", "2G"],
|
||||
"repeats": 3,
|
||||
"max_stddev_pct": 3,
|
||||
},
|
||||
"multinode_nccl": {
|
||||
"enabled": False,
|
||||
"mode": "sweep",
|
||||
"hosts": [
|
||||
{"name": "nccl-gpu-1", "addr": "172.72.8.12", "slots": 8},
|
||||
{"name": "nccl-gpu-2", "addr": "172.72.8.16", "slots": 8},
|
||||
],
|
||||
"ssh_user": "root",
|
||||
"ssh_preflight": True,
|
||||
"mpirun_path": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun",
|
||||
"mpi_ld_preload": None,
|
||||
"extra_ld_library_path": [
|
||||
"/usr/mpi/gcc/openmpi-4.1.9a1/lib",
|
||||
"/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib",
|
||||
"/usr/local/cuda-12.4/targets/x86_64-linux/lib",
|
||||
],
|
||||
"nccl_tests_dir": None,
|
||||
"tests": ["all_reduce_perf", "alltoall_perf"],
|
||||
"topologies": [{"nodes": 2, "gpus_per_node": 8}],
|
||||
"begin_size": "1k",
|
||||
"end_size": "16g",
|
||||
"step_factor": 2,
|
||||
"warmup_iters": 10,
|
||||
"gpus_per_rank": 1,
|
||||
"timeout_sec": 1800,
|
||||
"socket_ifname": "bond0",
|
||||
"ib_gid_index": 3,
|
||||
"ib_sl": 5,
|
||||
"ib_tc": 136,
|
||||
"ib_hca": "mlx5_0,mlx5_1,mlx5_6,mlx5_7",
|
||||
"ib_timeout": 22,
|
||||
"qps_per_connection": 4,
|
||||
"min_nchannels": 4,
|
||||
"net_plugin": "none",
|
||||
"nvls_enable": 1,
|
||||
"split_data_on_qps": 1,
|
||||
"min_peak_busbw_gbps": {"allreduce": 480, "alltoall": 75},
|
||||
},
|
||||
"stress": {
|
||||
"duration_sec": 60,
|
||||
"duration_sec": 1800,
|
||||
"production_duration_sec": 1800,
|
||||
"use_gpu_burn": False,
|
||||
"use_doubles": False,
|
||||
"use_tensor_cores": True,
|
||||
"memory_pct": 90,
|
||||
"gpus": "all",
|
||||
"dtype": "bf16",
|
||||
"matrix_size": 24576,
|
||||
"telemetry_interval_sec": 1,
|
||||
"warmup_sec": 60,
|
||||
"min_steady_samples": 10,
|
||||
"max_temp_c": 80,
|
||||
"max_temp_delta_c": 5,
|
||||
"min_power_watts": 630,
|
||||
"max_tflops_jitter_pct": 5,
|
||||
"require_tflops_jitter": True,
|
||||
},
|
||||
"rdma": {
|
||||
"min_bandwidth_gbps": 50,
|
||||
"max_latency_us": 10,
|
||||
"min_bandwidth_gbps": 47,
|
||||
"min_port_rate_gbps": 400,
|
||||
"max_latency_us": 3.5,
|
||||
"max_write_latency_us": 2.0,
|
||||
"max_read_latency_us": 3.5,
|
||||
"ib_iterations": 1000,
|
||||
"msg_size": 65536,
|
||||
"msg_size": 4194304,
|
||||
"latency_msg_size": 8,
|
||||
"ib_device": None,
|
||||
"ib_port": 1,
|
||||
"server_addr": None,
|
||||
"ibping_target": None,
|
||||
"ibping_count": 5,
|
||||
"role": "auto",
|
||||
"pfc_ecn_counters": True,
|
||||
},
|
||||
"nvlink": {
|
||||
"expected_links_per_gpu": 18,
|
||||
"expected_link_speed_gbps": 25,
|
||||
"require_zero_errors": True,
|
||||
},
|
||||
"dcgm": {
|
||||
"diag_level": 3,
|
||||
"timeout_sec": 1200,
|
||||
"expected_num_gpus": 8,
|
||||
"json_output": True,
|
||||
"require_subtests": True,
|
||||
},
|
||||
"training": {
|
||||
"model": "gpt2",
|
||||
"model": "synthetic_1.5b",
|
||||
"batch_size": 8,
|
||||
"seq_length": 2048,
|
||||
"num_steps": 50,
|
||||
"warmup_steps": 5,
|
||||
"dtype": "bf16",
|
||||
"mode": "ddp",
|
||||
"synthetic_params_b": 1.5,
|
||||
"min_tokens_per_sec": 45000,
|
||||
"max_step_jitter_pct": 3,
|
||||
"max_peak_memory_gb": 70,
|
||||
"require_distributed": True,
|
||||
},
|
||||
"report": {"output_dir": "./reports", "format": "json"},
|
||||
"tools": {"install_dir": "/opt/gpu-test-tools"},
|
||||
@ -131,7 +217,7 @@ def interactive_menu(config: dict):
|
||||
if not check_prerequisites(console):
|
||||
return
|
||||
|
||||
results_store: dict = {"timestamp": datetime.now().isoformat(), "tests": {}}
|
||||
results_store: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname(), "tests": {}}
|
||||
|
||||
menu_items = [
|
||||
("1", "GPU Information", "gpu_info"),
|
||||
@ -139,10 +225,13 @@ def interactive_menu(config: dict):
|
||||
("3", "Memory Benchmark (nvbandwidth)", "memory_bench"),
|
||||
("4", "Compute Benchmark", "compute_bench"),
|
||||
("5", "NCCL Multi-GPU Test", "nccl"),
|
||||
("6", "GPU Stress Test (gpu-burn)", "stress"),
|
||||
("6", "GPU Stress Test (PyTorch/gpu-burn)", "stress"),
|
||||
("7", "RDMA/IB Test", "rdma"),
|
||||
("8", "Training Simulation", "training"),
|
||||
("9", "Full Test Suite (All Tests)", "all"),
|
||||
("8", "NVLink/NVSwitch Test", "nvlink"),
|
||||
("9", "DCGM Diagnostic", "dcgm"),
|
||||
("10", "Training Simulation", "training"),
|
||||
("11", "Multi-node NCCL Test", "multinode_nccl"),
|
||||
("12", "Full Test Suite (All Tests)", "all"),
|
||||
("0", "Generate Report", "report"),
|
||||
]
|
||||
|
||||
@ -164,9 +253,12 @@ def interactive_menu(config: dict):
|
||||
"memory_bench": "HBM bandwidth via nvbandwidth",
|
||||
"compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8",
|
||||
"nccl": "AllReduce, AllToAll, Broadcast via nccl-tests",
|
||||
"stress": "Long-running GPU stress via gpu-burn",
|
||||
"stress": "Long-running high-power GEMM stress with telemetry",
|
||||
"rdma": "InfiniBand bandwidth & latency (ib_write_bw)",
|
||||
"nvlink": "NVLink links, speed, and error counters",
|
||||
"dcgm": "DCGM diag -r 3 production diagnostic",
|
||||
"training": "Simulate LLM training with PyTorch",
|
||||
"multinode_nccl": "Cross-node NCCL via mpirun/nccl-tests",
|
||||
"all": "Run all tests sequentially",
|
||||
"report": "Export results to JSON/HTML",
|
||||
}
|
||||
@ -257,12 +349,30 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
|
||||
m.print_results(result)
|
||||
return result
|
||||
|
||||
elif test_name == "nvlink":
|
||||
m = NVLinkTest(config)
|
||||
result = m.run()
|
||||
m.print_results(result)
|
||||
return result
|
||||
|
||||
elif test_name == "dcgm":
|
||||
m = DCGMTest(config)
|
||||
result = m.run()
|
||||
m.print_results(result)
|
||||
return result
|
||||
|
||||
elif test_name == "training":
|
||||
m = TrainingSim(config)
|
||||
result = m.run()
|
||||
m.print_results(result)
|
||||
return result
|
||||
|
||||
elif test_name == "multinode_nccl":
|
||||
m = MultiNodeNCCLTest(config)
|
||||
result = m.run()
|
||||
m.print_results(result)
|
||||
return result
|
||||
|
||||
elif test_name == "all":
|
||||
return _run_full_suite(config, console)
|
||||
|
||||
@ -280,17 +390,21 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
|
||||
def _run_full_suite(config: dict, console: Console) -> dict:
|
||||
"""Run all tests sequentially."""
|
||||
console.print(Panel("[bold cyan]Running Full Test Suite[/bold cyan]", box=box.DOUBLE))
|
||||
all_results: dict = {"timestamp": datetime.now().isoformat()}
|
||||
all_results: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname()}
|
||||
tests = [
|
||||
("gpu_info", "GPU Information", GPUInfo),
|
||||
("health", "Health Check", HealthCheck),
|
||||
("memory_bench", "Memory Benchmark", lambda c: Benchmark(c)),
|
||||
("compute_bench", "Compute Benchmark", lambda c: Benchmark(c)),
|
||||
("nvlink", "NVLink/NVSwitch Test", NVLinkTest),
|
||||
("nccl", "NCCL Test", NCCLTest),
|
||||
("stress", "GPU Stress Test", StressTest),
|
||||
("rdma", "RDMA/IB Test", RDMATest),
|
||||
("dcgm", "DCGM Diagnostic", DCGMTest),
|
||||
("training", "Training Simulation", TrainingSim),
|
||||
]
|
||||
if (config.get("multinode_nccl", {}) or {}).get("enabled"):
|
||||
tests.append(("multinode_nccl", "Multi-node NCCL Test", MultiNodeNCCLTest))
|
||||
|
||||
for i, (key, name, mod_cls) in enumerate(tests, 1):
|
||||
console.print(f"\n[bold cyan][{i}/{len(tests)}] {name}[/bold cyan]")
|
||||
@ -313,14 +427,49 @@ def _run_full_suite(config: dict, console: Console) -> dict:
|
||||
# Summary
|
||||
console.print("\n" + "=" * 60)
|
||||
# Only count test results, exclude metadata like timestamp
|
||||
test_results = {k: v for k, v in all_results.items() if k != "timestamp"}
|
||||
passed = sum(1 for v in test_results.values() if not isinstance(v, dict) or "error" not in v)
|
||||
test_results = {k: v for k, v in all_results.items() if k not in ("timestamp", "hostname")}
|
||||
passed = sum(1 for v in test_results.values() if _test_result_passed(v))
|
||||
total = len(test_results)
|
||||
color = "green" if passed == total else ("yellow" if passed > 0 else "red")
|
||||
console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]")
|
||||
return all_results
|
||||
|
||||
|
||||
def _test_result_passed(result) -> bool:
|
||||
"""Strict production verdict helper for full-suite exit status."""
|
||||
if not isinstance(result, dict):
|
||||
return True
|
||||
if result.get("error"):
|
||||
return False
|
||||
if result.get("skipped") or result.get("status") == "SKIP":
|
||||
return False
|
||||
if result.get("source") == "torchrun_fallback":
|
||||
return False
|
||||
if "passed" in result:
|
||||
return bool(result.get("passed"))
|
||||
if "memory" in result:
|
||||
mem = result["memory"]
|
||||
if isinstance(mem, dict) and "passed" in mem:
|
||||
return bool(mem.get("passed"))
|
||||
if mem.get("error") or mem.get("source") == "pytorch":
|
||||
return False
|
||||
eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0
|
||||
return eff >= 80
|
||||
if "compute" in result:
|
||||
comp = result["compute"]
|
||||
if isinstance(comp, dict) and "passed" in comp:
|
||||
return bool(comp.get("passed"))
|
||||
thresholds = comp.get("pass_thresholds_tflops", {}) or {}
|
||||
per_dtype = comp.get("per_dtype_tflops", {})
|
||||
for dt, threshold in thresholds.items():
|
||||
val = per_dtype.get(dt)
|
||||
if not isinstance(val, (int, float)) or val < threshold:
|
||||
return False
|
||||
consistency = comp.get("consistency", {})
|
||||
return not any(not c.get("passed", False) for c in consistency.values())
|
||||
return True
|
||||
|
||||
|
||||
def main():
|
||||
gpu_list_str = " / ".join(g.upper() for g in get_supported_gpus())
|
||||
parser = argparse.ArgumentParser(
|
||||
@ -335,15 +484,18 @@ Examples:
|
||||
python gpu_tester.py --test benchmark --type memory
|
||||
python gpu_tester.py --test benchmark --type compute --dtype fp16
|
||||
python gpu_tester.py --test nccl # NCCL test
|
||||
python gpu_tester.py --test multinode-nccl # Cross-node NCCL test
|
||||
python gpu_tester.py --test nvlink # NVLink/NVSwitch test
|
||||
python gpu_tester.py --test dcgm # DCGM diagnostic
|
||||
python gpu_tester.py --test training # Training sim
|
||||
python gpu_tester.py --test all # Full suite
|
||||
python gpu_tester.py --report --format json --output report.json
|
||||
""",
|
||||
)
|
||||
parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"],
|
||||
parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "multinode-nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"],
|
||||
help="Run a specific test")
|
||||
parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)")
|
||||
parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8"],
|
||||
parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"],
|
||||
help="Compute benchmark dtype (with --test benchmark --type compute)")
|
||||
parser.add_argument("--interactive", action="store_true", help="Force interactive mode")
|
||||
parser.add_argument("--report", action="store_true", help="Generate report from last results")
|
||||
@ -397,8 +549,11 @@ Examples:
|
||||
"health": "health",
|
||||
"benchmark": None,
|
||||
"nccl": "nccl",
|
||||
"multinode-nccl": "multinode_nccl",
|
||||
"stress": "stress",
|
||||
"rdma": "rdma",
|
||||
"nvlink": "nvlink",
|
||||
"dcgm": "dcgm",
|
||||
"training": "training",
|
||||
"all": "all",
|
||||
}
|
||||
@ -415,19 +570,30 @@ Examples:
|
||||
result = bench.run()
|
||||
Benchmark.print_results(result)
|
||||
if args.report:
|
||||
ReportGenerator(config).generate({"benchmark": result, "timestamp": datetime.now().isoformat()},
|
||||
ReportGenerator(config).generate({
|
||||
"benchmark": result,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"hostname": socket.gethostname(),
|
||||
},
|
||||
fmt=args.format, output=args.output)
|
||||
sys.exit(0 if _test_result_passed(result) else 1)
|
||||
elif args.test == "all":
|
||||
results = _run_full_suite(config, console)
|
||||
if args.report:
|
||||
ReportGenerator(config).generate(results, fmt=args.format, output=args.output)
|
||||
has_errors = any("error" in v for v in results.values() if isinstance(v, dict))
|
||||
sys.exit(1 if has_errors else 0)
|
||||
failed = any(not _test_result_passed(v) for k, v in results.items() if k not in ("timestamp", "hostname"))
|
||||
sys.exit(1 if failed else 0)
|
||||
else:
|
||||
result = _run_test(test_map[args.test], config, console)
|
||||
if args.report and result:
|
||||
ReportGenerator(config).generate({args.test: result, "timestamp": datetime.now().isoformat()},
|
||||
report_key = test_map[args.test] or args.test
|
||||
ReportGenerator(config).generate({
|
||||
report_key: result,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"hostname": socket.gethostname(),
|
||||
},
|
||||
fmt=args.format, output=args.output)
|
||||
sys.exit(0 if _test_result_passed(result) else 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
231
modules/dcgm_test.py
Normal file
231
modules/dcgm_test.py
Normal file
@ -0,0 +1,231 @@
|
||||
"""DCGM diagnostic acceptance wrapper."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import signal
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
|
||||
class DCGMTest:
|
||||
def __init__(self, config: dict):
    """Store the full config and cache its ``dcgm`` section.

    Args:
        config: Tester configuration; the ``dcgm`` key is optional and
            defaults to an empty dict.
    """
    # Section-specific settings read by run().
    self.cfg = config.get("dcgm", {})
    # Rich console instance kept on the object.
    self.console = Console()
    # Full configuration, kept as passed in.
    self.config = config
|
||||
|
||||
def run(self) -> dict:
|
||||
dcgmi = shutil.which("dcgmi")
|
||||
if not dcgmi:
|
||||
return {
|
||||
"passed": False,
|
||||
"error": "dcgmi not found",
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
level = str(self.cfg.get("diag_level", 3))
|
||||
timeout = int(self.cfg.get("timeout_sec", 1200))
|
||||
cmd = [dcgmi, "diag", "-r", level]
|
||||
expected_gpus = self.cfg.get("expected_num_gpus")
|
||||
if expected_gpus:
|
||||
cmd.extend(["-n", f"gpu:{int(expected_gpus)}"])
|
||||
if self.cfg.get("json_output", True):
|
||||
cmd.append("-j")
|
||||
|
||||
try:
|
||||
r = self._run_with_process_group_timeout(cmd, timeout)
|
||||
except subprocess.TimeoutExpired as e:
|
||||
output = ((e.output or "") + "\n" + (e.stderr or "")).strip()
|
||||
return {
|
||||
"passed": False,
|
||||
"error": f"dcgmi diag -r {level} timeout after {timeout}s",
|
||||
"command": cmd,
|
||||
"raw_output_tail": output[-8000:],
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
output = r.stdout + "\n" + r.stderr
|
||||
subtests = self._parse_json_output(output) or self._parse_output(output)
|
||||
strict_statuses = {"PASS"}
|
||||
failed = [s for s in subtests if s["status"] not in strict_statuses]
|
||||
require_subtests = bool(self.cfg.get("require_subtests", True))
|
||||
passed = r.returncode == 0 and not failed and (bool(subtests) or not require_subtests)
|
||||
return {
|
||||
"passed": passed,
|
||||
"returncode": r.returncode,
|
||||
"level": int(level),
|
||||
"command": cmd,
|
||||
"expected_num_gpus": int(expected_gpus) if expected_gpus else None,
|
||||
"subtests": subtests,
|
||||
"raw_output_tail": output[-8000:],
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _run_with_process_group_timeout(cmd: list[str], timeout: int) -> subprocess.CompletedProcess:
|
||||
proc = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
start_new_session=True,
|
||||
)
|
||||
try:
|
||||
stdout, stderr = proc.communicate(timeout=timeout)
|
||||
except subprocess.TimeoutExpired as e:
|
||||
try:
|
||||
os.killpg(proc.pid, signal.SIGTERM)
|
||||
stdout, stderr = proc.communicate(timeout=10)
|
||||
except subprocess.TimeoutExpired:
|
||||
os.killpg(proc.pid, signal.SIGKILL)
|
||||
stdout, stderr = proc.communicate(timeout=10)
|
||||
raise subprocess.TimeoutExpired(cmd, timeout, output=stdout, stderr=stderr) from e
|
||||
return subprocess.CompletedProcess(cmd, proc.returncode, stdout, stderr)
|
||||
|
||||
@classmethod
|
||||
def _parse_json_output(cls, output: str) -> list[dict]:
|
||||
text = output.strip()
|
||||
if not text:
|
||||
return []
|
||||
try:
|
||||
payload = json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
m = re.search(r"(\{.*\})", text, re.S)
|
||||
if not m:
|
||||
return []
|
||||
try:
|
||||
payload = json.loads(m.group(1))
|
||||
except json.JSONDecodeError:
|
||||
return []
|
||||
|
||||
dcgm_payload = payload.get("DCGM Diagnostic") if isinstance(payload, dict) else None
|
||||
if isinstance(dcgm_payload, dict):
|
||||
parsed = cls._parse_dcgm_diagnostic_json(dcgm_payload)
|
||||
if parsed:
|
||||
return parsed
|
||||
|
||||
subtests = []
|
||||
|
||||
def walk(node, path: list[str]):
|
||||
if isinstance(node, dict):
|
||||
node_name = (
|
||||
node.get("name")
|
||||
or node.get("testName")
|
||||
or node.get("test_name")
|
||||
or node.get("category")
|
||||
or node.get("category_name")
|
||||
)
|
||||
child_path = [*path, str(node_name)] if node_name else path
|
||||
status = node.get("status") or node.get("result") or node.get("Result")
|
||||
if isinstance(status, str):
|
||||
name = (
|
||||
node_name
|
||||
or " / ".join(path[-3:])
|
||||
)
|
||||
normalized = cls._normalize_status(status)
|
||||
if normalized:
|
||||
subtests.append({
|
||||
"name": str(name)[:160],
|
||||
"status": normalized,
|
||||
"raw": json.dumps(node, default=str)[:1000],
|
||||
})
|
||||
for key, value in node.items():
|
||||
walk(value, [*child_path, str(key)])
|
||||
elif isinstance(node, list):
|
||||
for idx, item in enumerate(node):
|
||||
walk(item, [*path, str(idx)])
|
||||
|
||||
walk(payload, [])
|
||||
return subtests
|
||||
|
||||
@classmethod
|
||||
def _parse_dcgm_diagnostic_json(cls, payload: dict) -> list[dict]:
|
||||
subtests = []
|
||||
for category in payload.get("test_categories", []) or []:
|
||||
category_name = str(category.get("category") or "DCGM")
|
||||
for test in category.get("tests", []) or []:
|
||||
test_name = str(test.get("name") or "unnamed")
|
||||
for result in test.get("results", []) or []:
|
||||
status = cls._normalize_status(str(result.get("status", "")))
|
||||
if not status:
|
||||
continue
|
||||
entity_group = result.get("entity_group") or "entity"
|
||||
entity_id = result.get("entity_id", "unknown")
|
||||
name = f"{category_name}/{test_name}/{entity_group}{entity_id}"
|
||||
subtests.append({
|
||||
"name": name[:160],
|
||||
"status": status,
|
||||
"raw": json.dumps(result, default=str)[:1000],
|
||||
})
|
||||
summary = test.get("test_summary") or {}
|
||||
status = cls._normalize_status(str(summary.get("status", "")))
|
||||
if status:
|
||||
subtests.append({
|
||||
"name": f"{category_name}/{test_name}/summary"[:160],
|
||||
"status": status,
|
||||
"raw": json.dumps(summary, default=str)[:1000],
|
||||
})
|
||||
return subtests
|
||||
|
||||
@staticmethod
|
||||
def _normalize_status(status: str) -> str:
|
||||
s = status.strip().upper()
|
||||
aliases = {
|
||||
"PASS": "PASS",
|
||||
"PASSED": "PASS",
|
||||
"OK": "PASS",
|
||||
"FAIL": "FAIL",
|
||||
"FAILED": "FAIL",
|
||||
"ERROR": "ERROR",
|
||||
"WARN": "WARN",
|
||||
"WARNING": "WARN",
|
||||
"SKIP": "SKIP",
|
||||
"SKIPPED": "SKIP",
|
||||
"NOT_RUN": "SKIP",
|
||||
"NOT RUN": "SKIP",
|
||||
}
|
||||
return aliases.get(s, s if s in {"PASS", "FAIL", "ERROR", "WARN", "SKIP"} else "")
|
||||
|
||||
@staticmethod
|
||||
def _parse_output(output: str) -> list[dict]:
|
||||
subtests = []
|
||||
for line in output.splitlines():
|
||||
stripped = line.strip()
|
||||
if not stripped:
|
||||
continue
|
||||
m = re.search(r"(.+?)\s*[:|]\s*(PASS|FAIL|WARN|ERROR|SKIP)\b", stripped, re.I)
|
||||
if not m:
|
||||
m = re.search(r"\b(PASS|FAIL|WARN|ERROR|SKIP)\b\s*[-:|]\s*(.+)", stripped, re.I)
|
||||
if m:
|
||||
status = DCGMTest._normalize_status(m.group(1))
|
||||
name = m.group(2).strip()
|
||||
else:
|
||||
continue
|
||||
else:
|
||||
name = m.group(1).strip(" .|-")
|
||||
status = DCGMTest._normalize_status(m.group(2))
|
||||
if name and len(name) < 160:
|
||||
subtests.append({"name": name, "status": status, "raw": stripped})
|
||||
return subtests
|
||||
|
||||
@staticmethod
|
||||
def print_results(results: dict, console: Optional[Console] = None):
    """Render a DCGM result dict on a rich console.

    Args:
        results: Dict as returned by ``run``.
        console: Console to print to; a new ``Console`` is created when
            none (or a falsy value) is given.

    An ``error`` entry is printed on its own and ends the output.
    Otherwise a pass/fail banner is printed, followed by a two-column
    table of subtests when any are present.
    """
    out = console if console else Console()

    if results.get("error"):
        out.print(f"[bold red]DCGM error: {results['error']}[/bold red]")
        return

    if results.get("passed", False):
        out.print("[bold green]✓ DCGM diag PASSED[/bold green]")
    else:
        out.print("[bold red]✗ DCGM diag FAILED[/bold red]")

    rows = results.get("subtests", [])
    if not rows:
        return

    grid = Table(box=None, padding=(0, 1))
    grid.add_column("Subtest")
    grid.add_column("Status", style="bold")
    for entry in rows:
        grid.add_row(entry.get("name", ""), entry.get("status", ""))
    out.print(grid)
|
||||
@ -171,6 +171,10 @@ class HealthCheck:
|
||||
gpu_health.append({"index": i, "status": worst, "checks": checks})
|
||||
|
||||
system_health = self._check_system()
|
||||
for key in ("fabricmanager", "retired_pages", "kernel_errors"):
|
||||
item = system_health.get(key, {})
|
||||
if isinstance(item, dict) and item.get("status") == "FAIL":
|
||||
overall_pass = False
|
||||
|
||||
return {
|
||||
"passed": overall_pass,
|
||||
@ -228,6 +232,9 @@ class HealthCheck:
|
||||
rdma_devs = os.listdir("/sys/class/infiniband_verbs")
|
||||
|
||||
nccl_env = {k: v for k, v in os.environ.items() if k.startswith("NCCL_")}
|
||||
fabric = self._check_fabricmanager()
|
||||
retired = self._check_retired_pages()
|
||||
kernel_errors = self._check_kernel_errors()
|
||||
|
||||
return {
|
||||
"nvidia_persistenced": {"installed": persistd, "running": persistd_running},
|
||||
@ -238,6 +245,41 @@ class HealthCheck:
|
||||
"infiniband_devices": ib_devs,
|
||||
"rdma_devices": rdma_devs,
|
||||
"nccl_env_vars": nccl_env,
|
||||
"fabricmanager": fabric,
|
||||
"retired_pages": retired,
|
||||
"kernel_errors": kernel_errors,
|
||||
}
|
||||
|
||||
def _check_fabricmanager(self) -> dict:
|
||||
r = self._run_cmd(["systemctl", "is-active", "nvidia-fabricmanager"], timeout=5)
|
||||
active = r == "active"
|
||||
logs = self._run_cmd(["journalctl", "-u", "nvidia-fabricmanager", "-n", "200", "--no-pager"], timeout=10) or ""
|
||||
has_error = "ERROR" in logs.upper() or "FAILED" in logs.upper()
|
||||
return {
|
||||
"active": active,
|
||||
"has_error_logs": has_error,
|
||||
"status": "PASS" if active and not has_error else "FAIL",
|
||||
}
|
||||
|
||||
def _check_retired_pages(self) -> dict:
|
||||
raw = self._run_cmd(["nvidia-smi", "-q", "-d", "PAGE_RETIREMENT"], timeout=30) or ""
|
||||
nums = [int(x) for x in __import__("re").findall(r"Retired Pages.*?:\s*(\d+)", raw, flags=__import__("re").I)]
|
||||
pending = "Pending Page Blacklist" in raw and "Yes" in raw
|
||||
total = sum(nums)
|
||||
return {
|
||||
"retired_pages": total,
|
||||
"pending_blacklist": pending,
|
||||
"status": "PASS" if total == 0 and not pending else "FAIL",
|
||||
}
|
||||
|
||||
def _check_kernel_errors(self) -> dict:
|
||||
raw = self._run_cmd(["dmesg", "--ctime", "--level=err,crit,alert,emerg"], timeout=10) or ""
|
||||
upper = raw.upper()
|
||||
hits = [line for line in raw.splitlines() if any(k in line.upper() for k in ("XID", "AER", "PCIE", "NVRM"))]
|
||||
return {
|
||||
"count": len(hits),
|
||||
"tail": hits[-20:],
|
||||
"status": "PASS" if not hits else "FAIL",
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
|
||||
@ -5,6 +5,8 @@ import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import statistics
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
@ -70,6 +72,38 @@ class NCCLTest:
|
||||
return p
|
||||
return None
|
||||
|
||||
def _message_sizes(self) -> list[str]:
|
||||
return list(self.nccl_cfg.get("message_sizes") or ["1M", "256M", "2G"])
|
||||
|
||||
def _repeats(self) -> int:
|
||||
return int(self.nccl_cfg.get("repeats", 3))
|
||||
|
||||
def _max_stddev_pct(self) -> float:
|
||||
return float(self.nccl_cfg.get("max_stddev_pct", 3))
|
||||
|
||||
def _runtime_env(self) -> dict:
|
||||
env = {**os.environ, "NCCL_DEBUG": "WARN"}
|
||||
lib_dirs = []
|
||||
|
||||
nccl_home = env.get("NCCL_HOME") or self.nccl_cfg.get("nccl_home")
|
||||
if nccl_home:
|
||||
lib_dirs.append(os.path.join(str(nccl_home), "lib"))
|
||||
|
||||
for path in sys.path:
|
||||
lib_dirs.append(os.path.join(path, "nvidia", "nccl", "lib"))
|
||||
|
||||
venv_root = os.path.dirname(os.path.dirname(sys.executable))
|
||||
lib_dirs.extend(glob.glob(os.path.join(venv_root, "lib", "python*", "site-packages", "nvidia", "nccl", "lib")))
|
||||
|
||||
existing = env.get("LD_LIBRARY_PATH", "")
|
||||
valid_dirs = []
|
||||
for d in lib_dirs:
|
||||
if d and os.path.isdir(d) and d not in valid_dirs:
|
||||
valid_dirs.append(d)
|
||||
if valid_dirs:
|
||||
env["LD_LIBRARY_PATH"] = ":".join(valid_dirs + ([existing] if existing else []))
|
||||
return env
|
||||
|
||||
def run(self) -> dict:
|
||||
gpu_count = 0
|
||||
if TORCH_AVAILABLE:
|
||||
@ -89,7 +123,7 @@ class NCCLTest:
|
||||
if self.nccl_cfg.get("test_reduce_scatter", False):
|
||||
tests.append(("reduce_scatter_perf", "ReduceScatter"))
|
||||
if self.nccl_cfg.get("test_allgather", False):
|
||||
tests.append(("allgather_perf", "AllGather"))
|
||||
tests.append(("all_gather_perf", "AllGather"))
|
||||
if self.nccl_cfg.get("test_sendrecv", False):
|
||||
tests.append(("sendrecv_perf", "SendRecv"))
|
||||
|
||||
@ -170,39 +204,7 @@ class NCCLTest:
|
||||
if not binary:
|
||||
return {"status": "SKIP", "error": f"{binary_name} not found"}
|
||||
|
||||
cmd = [
|
||||
binary,
|
||||
"-b", "8M",
|
||||
"-e", "8G",
|
||||
"-f", "2",
|
||||
"-g", str(gpu_count),
|
||||
"-w", "5",
|
||||
"-n", "20",
|
||||
]
|
||||
|
||||
try:
|
||||
env = os.environ.copy()
|
||||
env["NCCL_DEBUG"] = "WARN"
|
||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env)
|
||||
|
||||
combined = r.stdout + r.stderr
|
||||
# Check for NCCL/CUDA compatibility errors
|
||||
if "CUDA driver version is insufficient" in combined or \
|
||||
"Test NCCL failure" in combined:
|
||||
error_msg = "NCCL/CUDA driver version mismatch" \
|
||||
if "CUDA driver version" in combined \
|
||||
else "NCCL test failure (library incompatibility)"
|
||||
return {"status": "FAIL", "error": error_msg}
|
||||
|
||||
if r.returncode != 0:
|
||||
return {"status": "FAIL", "error": r.stderr[:300]}
|
||||
|
||||
return self._parse_nccl_output(r.stdout, min_bw)
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return {"status": "FAIL", "error": "timeout"}
|
||||
except Exception as e:
|
||||
return {"status": "FAIL", "error": str(e)}
|
||||
return self._run_nccl_matrix([binary, "-g", str(gpu_count)], min_bw)
|
||||
|
||||
def _run_one_nccl_test_mpirun(self, binary_name: str, label: str,
|
||||
gpu_count: int, mpirun: str, min_bw: float) -> dict:
|
||||
@ -218,37 +220,64 @@ class NCCLTest:
|
||||
"-x", "NCCL_DEBUG=WARN",
|
||||
"-x", "CUDA_VISIBLE_DEVICES=" + ",".join(str(i) for i in range(gpu_count)),
|
||||
binary,
|
||||
"-b", "8",
|
||||
"-e", "256M",
|
||||
"-f", "2",
|
||||
"-g", "1",
|
||||
"-w", "5",
|
||||
"-n", "20",
|
||||
]
|
||||
|
||||
return self._run_nccl_matrix(cmd, min_bw)
|
||||
|
||||
def _run_nccl_matrix(self, base_cmd: list[str], min_bw: float) -> dict:
|
||||
size_results = []
|
||||
failures = []
|
||||
env = self._runtime_env()
|
||||
|
||||
try:
|
||||
env = os.environ.copy()
|
||||
env["NCCL_DEBUG"] = "WARN"
|
||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env)
|
||||
|
||||
combined = r.stdout + r.stderr
|
||||
if "CUDA driver version is insufficient" in combined or \
|
||||
"Test NCCL failure" in combined:
|
||||
error_msg = "NCCL/CUDA driver version mismatch" \
|
||||
if "CUDA driver version" in combined \
|
||||
else "NCCL test failure (library incompatibility)"
|
||||
return {"status": "FAIL", "error": error_msg}
|
||||
|
||||
if r.returncode != 0:
|
||||
return {"status": "FAIL", "error": r.stderr[:300]}
|
||||
|
||||
return self._parse_nccl_output(r.stdout, min_bw)
|
||||
for size in self._message_sizes():
|
||||
runs = []
|
||||
for _ in range(self._repeats()):
|
||||
cmd = [*base_cmd, "-b", size, "-e", size, "-f", "2", "-w", "5", "-n", "20"]
|
||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=300, env=env)
|
||||
combined = r.stdout + r.stderr
|
||||
if "CUDA driver version is insufficient" in combined or "Test NCCL failure" in combined:
|
||||
failures.append({"size": size, "error": "NCCL/CUDA/library failure"})
|
||||
continue
|
||||
if r.returncode != 0:
|
||||
failures.append({"size": size, "error": r.stderr[:300]})
|
||||
continue
|
||||
parsed = self._parse_nccl_output(r.stdout, min_bw)
|
||||
runs.append(parsed.get("best_busbw_gbps", 0))
|
||||
if runs:
|
||||
worst = min(runs)
|
||||
mean = sum(runs) / len(runs)
|
||||
std_pct = (statistics.pstdev(runs) / mean * 100) if len(runs) > 1 and mean else 0
|
||||
size_results.append({
|
||||
"size": size,
|
||||
"runs_busbw_gbps": [round(v, 1) for v in runs],
|
||||
"worst_busbw_gbps": round(worst, 1),
|
||||
"mean_busbw_gbps": round(mean, 1),
|
||||
"stddev_pct": round(std_pct, 2),
|
||||
"status": "PASS" if worst >= min_bw and std_pct <= self._max_stddev_pct() else "FAIL",
|
||||
})
|
||||
else:
|
||||
size_results.append({"size": size, "status": "FAIL", "runs_busbw_gbps": []})
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return {"status": "FAIL", "error": "timeout"}
|
||||
except Exception as e:
|
||||
return {"status": "FAIL", "error": str(e)}
|
||||
|
||||
best_bus = max((r.get("mean_busbw_gbps", 0) for r in size_results), default=0)
|
||||
worst_bus = min((r.get("worst_busbw_gbps", 0) for r in size_results if r.get("runs_busbw_gbps")), default=0)
|
||||
passed = bool(size_results) and all(r.get("status") == "PASS" for r in size_results) and not failures
|
||||
return {
|
||||
"status": "PASS" if passed else "FAIL",
|
||||
"best_busbw_gbps": round(best_bus, 1),
|
||||
"worst_busbw_gbps": round(worst_bus, 1),
|
||||
"min_required_gbps": min_bw,
|
||||
"max_stddev_pct": self._max_stddev_pct(),
|
||||
"by_size": size_results,
|
||||
"failures": failures,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _parse_nccl_output(stdout: str, min_bw: float) -> dict:
|
||||
"""Parse nccl-tests tabular output and extract bandwidth results."""
|
||||
@ -363,7 +392,7 @@ dist.destroy_process_group()
|
||||
r = subprocess.run(
|
||||
[torchrun_cmd, f"--nproc_per_node={gpu_count}", tmp.name],
|
||||
capture_output=True, text=True, timeout=120,
|
||||
env={**os.environ, "NCCL_DEBUG": "WARN"},
|
||||
env=self._runtime_env(),
|
||||
)
|
||||
os.unlink(tmp.name)
|
||||
|
||||
@ -390,10 +419,15 @@ dist.destroy_process_group()
|
||||
}
|
||||
|
||||
return {
|
||||
"passed": all_passed,
|
||||
# torchrun fallback is a functional smoke only. It never proves
|
||||
# production bus bandwidth, so it must not satisfy acceptance.
|
||||
"passed": False,
|
||||
"functional_passed": all_passed,
|
||||
"source": "torchrun_fallback",
|
||||
"tests": tests,
|
||||
"gpu_count": gpu_count,
|
||||
"error": None if all_passed else "torchrun functional NCCL smoke failed",
|
||||
"acceptance_gap": "nccl-tests bus bandwidth was not measured",
|
||||
}
|
||||
except Exception as e:
|
||||
return {"passed": False, "source": "torchrun_fallback", "error": str(e)}
|
||||
@ -410,7 +444,8 @@ dist.destroy_process_group()
|
||||
|
||||
if source == "torchrun_fallback":
|
||||
# Connectivity check mode
|
||||
verdict = "[bold green]✓ NCCL Connectivity OK[/bold green]" if passed else "[bold red]✗ NCCL Connectivity FAILED[/bold red]"
|
||||
functional = results.get("functional_passed", passed)
|
||||
verdict = "[bold yellow]⚠ NCCL bus BW NOT VERIFIED[/bold yellow]" if functional else "[bold red]✗ NCCL Connectivity FAILED[/bold red]"
|
||||
c.print(f"{verdict} [dim](basic check via torchrun)[/dim]")
|
||||
|
||||
tests = results.get("tests", {})
|
||||
@ -427,7 +462,7 @@ dist.destroy_process_group()
|
||||
else:
|
||||
c.print(f" [{s_color}]{op_name}[/{s_color}]")
|
||||
|
||||
c.print("\n[yellow]Note: functional connectivity test only (no performance data)[/yellow]")
|
||||
c.print("\n[yellow]Note: functional connectivity test only (no bus bandwidth data; acceptance FAIL)[/yellow]")
|
||||
else:
|
||||
# nccl-tests mode
|
||||
verdict = "[bold green]✓ NCCL tests PASSED[/bold green]" if passed else "[bold yellow]⚠ NCCL tests WARNING[/bold yellow]"
|
||||
@ -448,12 +483,16 @@ dist.destroy_process_group()
|
||||
if by_size:
|
||||
t = Table(box=None, padding=(0, 1))
|
||||
t.add_column("Size", style="bold", justify="right")
|
||||
t.add_column("Time (us)", justify="right")
|
||||
t.add_column("Alg BW (GB/s)", justify="right")
|
||||
t.add_column("Bus BW (GB/s)", justify="right")
|
||||
t.add_column("Worst Bus BW", justify="right")
|
||||
t.add_column("Mean Bus BW", justify="right")
|
||||
t.add_column("StdDev", justify="right")
|
||||
t.add_column("Status", justify="right")
|
||||
for r in by_size:
|
||||
sz = r.get("size", 0)
|
||||
sz_str = f"{sz/1024:.0f}K" if sz < 1048576 else f"{sz/1048576:.0f}M"
|
||||
t.add_row(sz_str, f"{r.get('time_us',0):.1f}",
|
||||
f"{r.get('algbw_gbps',0):.1f}", f"{r.get('busbw_gbps',0):.1f}")
|
||||
t.add_row(
|
||||
str(r.get("size", "")),
|
||||
f"{r.get('worst_busbw_gbps', 0):.1f}",
|
||||
f"{r.get('mean_busbw_gbps', 0):.1f}",
|
||||
f"{r.get('stddev_pct', 0):.2f}%",
|
||||
r.get("status", "?"),
|
||||
)
|
||||
c.print(t)
|
||||
|
||||
188
modules/nvlink_test.py
Normal file
188
modules/nvlink_test.py
Normal file
@ -0,0 +1,188 @@
|
||||
"""NVLink / NVSwitch production acceptance checks."""
|
||||
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
|
||||
class NVLinkTest:
|
||||
def __init__(self, config: dict):
    """Store the configuration and cache its ``nvlink`` section.

    Args:
        config: Parsed tool configuration. The optional ``nvlink`` mapping
            holds the thresholds read by ``run``; a missing section is
            treated as an empty mapping.
    """
    nvlink_section = config.get("nvlink", {})
    self.config = config
    self.console = Console()
    self.cfg = nvlink_section
|
||||
|
||||
def _run(self, args: list[str], timeout: int = 60) -> tuple[int, str, str]:
|
||||
if not shutil.which("nvidia-smi"):
|
||||
return 127, "", "nvidia-smi not found"
|
||||
r = subprocess.run(["nvidia-smi", *args], capture_output=True, text=True, timeout=timeout)
|
||||
return r.returncode, r.stdout, r.stderr
|
||||
|
||||
def run(self) -> dict:
|
||||
expected_links = int(self.cfg.get("expected_links_per_gpu", 18))
|
||||
expected_speed = float(self.cfg.get("expected_link_speed_gbps", 25))
|
||||
require_zero_errors = bool(self.cfg.get("require_zero_errors", True))
|
||||
|
||||
rc_s, out_s, err_s = self._run(["nvlink", "-s"])
|
||||
rc_c, out_c, err_c = self._run(["nvlink", "-c"])
|
||||
rc_e, out_e, err_e = self._run(["nvlink", "-e"])
|
||||
|
||||
if rc_s != 0:
|
||||
return {
|
||||
"passed": False,
|
||||
"error": (err_s or out_s or "nvidia-smi nvlink -s failed")[:1000],
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
links = self._parse_status(out_s)
|
||||
if not links:
|
||||
return {
|
||||
"passed": False,
|
||||
"error": "no NVLink status entries parsed from nvidia-smi nvlink -s",
|
||||
"raw_status": out_s[-4000:],
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
speeds = self._parse_speeds(out_c) if rc_c == 0 else {}
|
||||
status_speeds = self._parse_speeds(out_s)
|
||||
for gpu, gpu_speeds in status_speeds.items():
|
||||
speeds.setdefault(gpu, {}).update({k: v for k, v in gpu_speeds.items() if k not in speeds.get(gpu, {})})
|
||||
errors = self._parse_errors(out_e) if rc_e == 0 else {}
|
||||
|
||||
gpu_results = []
|
||||
overall = True
|
||||
for gpu, gpu_links in sorted(links.items(), key=lambda x: int(x[0])):
|
||||
active = sum(1 for l in gpu_links.values() if l.get("active"))
|
||||
inactive = [lid for lid, l in gpu_links.items() if not l.get("active")]
|
||||
speed_bad = []
|
||||
for lid in gpu_links:
|
||||
speed = speeds.get(gpu, {}).get(lid)
|
||||
if speed is not None and speed < expected_speed:
|
||||
speed_bad.append({"link": lid, "speed_gbps": speed})
|
||||
err_bad = []
|
||||
if require_zero_errors:
|
||||
for lid, counters in errors.get(gpu, {}).items():
|
||||
total = sum(v for v in counters.values() if isinstance(v, int))
|
||||
if total:
|
||||
err_bad.append({"link": lid, "counters": counters})
|
||||
|
||||
passed = active == expected_links and not inactive and not speed_bad and not err_bad
|
||||
if not passed:
|
||||
overall = False
|
||||
gpu_results.append({
|
||||
"gpu": int(gpu),
|
||||
"active_links": active,
|
||||
"expected_links": expected_links,
|
||||
"inactive_links": inactive,
|
||||
"speed_issues": speed_bad,
|
||||
"error_issues": err_bad,
|
||||
"passed": passed,
|
||||
})
|
||||
|
||||
return {
|
||||
"passed": overall,
|
||||
"expected_links_per_gpu": expected_links,
|
||||
"expected_link_speed_gbps": expected_speed,
|
||||
"require_zero_errors": require_zero_errors,
|
||||
"gpus": gpu_results,
|
||||
"raw_status": out_s[-4000:],
|
||||
"raw_speed": out_c[-4000:] if out_c else "",
|
||||
"raw_errors": out_e[-4000:] if out_e else "",
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _parse_status(text: str) -> dict[str, dict[str, dict]]:
|
||||
result: dict[str, dict[str, dict]] = {}
|
||||
gpu = None
|
||||
for line in text.splitlines():
|
||||
m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
|
||||
if m_gpu:
|
||||
gpu = m_gpu.group(1)
|
||||
result.setdefault(gpu, {})
|
||||
continue
|
||||
if gpu is None:
|
||||
continue
|
||||
m_link = re.search(r"Link\s+(\d+).*?(Active|Inactive|Disabled|Off|Down)", line, re.I)
|
||||
if m_link:
|
||||
state = m_link.group(2)
|
||||
result[gpu][m_link.group(1)] = {
|
||||
"state": state,
|
||||
"active": state.lower() == "active",
|
||||
"raw": line.strip(),
|
||||
}
|
||||
continue
|
||||
m_speed = re.search(r"Link\s+(\d+).*?([0-9.]+)\s*GB/s", line, re.I)
|
||||
if m_speed:
|
||||
result[gpu][m_speed.group(1)] = {
|
||||
"state": "Active",
|
||||
"active": True,
|
||||
"raw": line.strip(),
|
||||
}
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def _parse_speeds(text: str) -> dict[str, dict[str, float]]:
|
||||
result: dict[str, dict[str, float]] = {}
|
||||
gpu = None
|
||||
for line in text.splitlines():
|
||||
m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
|
||||
if m_gpu:
|
||||
gpu = m_gpu.group(1)
|
||||
result.setdefault(gpu, {})
|
||||
continue
|
||||
if gpu is None:
|
||||
continue
|
||||
m_link = re.search(r"Link\s+(\d+).*?([0-9.]+)\s*GB/s", line, re.I)
|
||||
if m_link:
|
||||
result[gpu][m_link.group(1)] = float(m_link.group(2))
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def _parse_errors(text: str) -> dict[str, dict[str, dict[str, int]]]:
|
||||
result: dict[str, dict[str, dict[str, int]]] = {}
|
||||
gpu = None
|
||||
link = None
|
||||
for line in text.splitlines():
|
||||
m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
|
||||
if m_gpu:
|
||||
gpu = m_gpu.group(1)
|
||||
result.setdefault(gpu, {})
|
||||
continue
|
||||
m_link = re.search(r"Link\s+(\d+)", line, re.I)
|
||||
if m_link and gpu is not None:
|
||||
link = m_link.group(1)
|
||||
result[gpu].setdefault(link, {})
|
||||
if gpu is None or link is None:
|
||||
continue
|
||||
for name in ("CRC", "Replay", "Recovery"):
|
||||
m = re.search(rf"{name}[^0-9]*(\d+)", line, re.I)
|
||||
if m:
|
||||
result[gpu][link][name.lower()] = int(m.group(1))
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def print_results(results: dict, console: Optional[Console] = None):
    """Render an NVLink result dict on a rich console.

    Args:
        results: Dict as returned by ``run``.
        console: Console to print to; a new ``Console`` is created when
            none (or a falsy value) is given.

    An ``error`` entry is printed on its own and ends the output.
    Otherwise a pass/fail banner is printed, followed by a table with one
    row per GPU listing the active link count and a summary of issues.
    """
    out = console if console else Console()

    if results.get("error"):
        out.print(f"[bold red]NVLink error: {results['error']}[/bold red]")
        return

    if results.get("passed", False):
        out.print("[bold green]✓ NVLink PASSED[/bold green]")
    else:
        out.print("[bold red]✗ NVLink FAILED[/bold red]")

    grid = Table(box=None, padding=(0, 1))
    grid.add_column("GPU", style="bold")
    grid.add_column("Active Links", justify="right")
    grid.add_column("Issues")

    for gpu in results.get("gpus", []):
        notes = []
        if gpu.get("inactive_links"):
            notes.append("inactive=" + ",".join(gpu["inactive_links"]))
        # Speed and error problems are summarised as counts only.
        if gpu.get("speed_issues"):
            notes.append(f"speed={len(gpu['speed_issues'])}")
        if gpu.get("error_issues"):
            notes.append(f"errors={len(gpu['error_issues'])}")
        grid.add_row(
            str(gpu["gpu"]),
            f"{gpu['active_links']}/{gpu['expected_links']}",
            "; ".join(notes) or "OK",
        )
    out.print(grid)
|
||||
@ -93,8 +93,8 @@ class ReportGenerator:
|
||||
|
||||
def _generate_html(self, results: dict, output: str) -> str:
|
||||
import socket
|
||||
hostname = socket.gethostname()
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
hostname = results.get("hostname") or socket.gethostname()
|
||||
timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
sections = []
|
||||
|
||||
@ -178,8 +178,8 @@ class ReportGenerator:
|
||||
|
||||
def _generate_markdown(self, results: dict, output: str) -> str:
|
||||
import socket
|
||||
hostname = socket.gethostname()
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
hostname = results.get("hostname") or socket.gethostname()
|
||||
timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
lines: list[str] = []
|
||||
|
||||
@ -201,6 +201,21 @@ class ReportGenerator:
|
||||
# --- Summary table ---
|
||||
summary_items = self._build_summary(results)
|
||||
if summary_items:
|
||||
verdict, failures, missing = self._overall_acceptance_verdict(summary_items)
|
||||
lines.append("## Overall Acceptance Verdict\n")
|
||||
lines.append(f"**Result: {verdict}**")
|
||||
lines.append("")
|
||||
if failures:
|
||||
lines.append("Failed or unverified items:")
|
||||
for name, status in failures:
|
||||
lines.append(f"- {name}: {status}")
|
||||
lines.append("")
|
||||
if missing:
|
||||
lines.append("Missing required evidence:")
|
||||
for name in missing:
|
||||
lines.append(f"- {name}")
|
||||
lines.append("")
|
||||
|
||||
lines.append("## Summary\n")
|
||||
lines.append("| Test | Result |")
|
||||
lines.append("|------|--------|")
|
||||
@ -319,8 +334,6 @@ class ReportGenerator:
|
||||
if use_abs and thr:
|
||||
if val >= thr:
|
||||
status = "PASS"
|
||||
elif val >= thr * 0.9:
|
||||
status = "WARN"
|
||||
else:
|
||||
status = "FAIL"
|
||||
lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |")
|
||||
@ -331,33 +344,194 @@ class ReportGenerator:
|
||||
overall_status = status
|
||||
lines.append("")
|
||||
if use_abs:
|
||||
if any(not row.get("passed", False) for row in (comp_data.get("consistency", {}) or {}).values()):
|
||||
overall_status = "FAIL"
|
||||
lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n")
|
||||
else:
|
||||
overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
|
||||
lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n")
|
||||
|
||||
consistency = comp_data.get("consistency", {}) or {}
|
||||
if consistency:
|
||||
lines.append("### Compute Consistency\n")
|
||||
lines.append("| DType | Min | Mean | Max | Spread | Limit | Status |")
|
||||
lines.append("|-------|-----|------|-----|--------|-------|--------|")
|
||||
for dt, row in consistency.items():
|
||||
status = "PASS" if row.get("passed") else "FAIL"
|
||||
lines.append(
|
||||
f"| {dt.upper()} | {row.get('min_tflops', 0):.1f} | "
|
||||
f"{row.get('mean_tflops', 0):.1f} | {row.get('max_tflops', 0):.1f} | "
|
||||
f"{row.get('spread_pct', 0):.2f}% | <= {row.get('max_allowed_pct', 3)}% | {status} |"
|
||||
)
|
||||
lines.append("")
|
||||
|
||||
per_gpu = comp_data.get("per_gpu", []) or []
|
||||
dtype_order = [dt for dt in per_dtype.keys() if not isinstance(per_dtype.get(dt), str)]
|
||||
if per_gpu and dtype_order:
|
||||
lines.append("### Compute Per-GPU TFLOPS\n")
|
||||
headers = ["GPU", *[dt.upper() for dt in dtype_order]]
|
||||
lines.append("| " + " | ".join(headers) + " |")
|
||||
lines.append("|" + "|".join(["---"] * len(headers)) + "|")
|
||||
for row in per_gpu:
|
||||
cells = [str(row.get("index", ""))]
|
||||
for dt in dtype_order:
|
||||
val = row.get(dt, "")
|
||||
cells.append(f"{val:.1f}" if isinstance(val, (int, float)) else str(val))
|
||||
lines.append("| " + " | ".join(cells) + " |")
|
||||
lines.append("")
|
||||
|
||||
# --- NCCL ---
|
||||
nvlink = results.get("nvlink")
|
||||
if nvlink and not nvlink.get("error"):
|
||||
lines.append("## NVLink/NVSwitch\n")
|
||||
lines.append(f"**Overall: {'PASS' if nvlink.get('passed') else 'FAIL'}**\n")
|
||||
lines.append("| GPU | Active Links | Issues |")
|
||||
lines.append("|-----|--------------|--------|")
|
||||
for g in nvlink.get("gpus", []):
|
||||
issues = []
|
||||
if g.get("inactive_links"):
|
||||
issues.append("inactive=" + ",".join(g["inactive_links"]))
|
||||
if g.get("speed_issues"):
|
||||
issues.append(f"speed issues={len(g['speed_issues'])}")
|
||||
if g.get("error_issues"):
|
||||
issues.append(f"errors={len(g['error_issues'])}")
|
||||
lines.append(f"| {g.get('gpu')} | {g.get('active_links')}/{g.get('expected_links')} | {', '.join(issues) or 'OK'} |")
|
||||
lines.append("")
|
||||
elif nvlink and nvlink.get("error"):
|
||||
lines.append("## NVLink/NVSwitch\n")
|
||||
lines.append(f"**Overall: FAIL** ({nvlink.get('error')})\n")
|
||||
|
||||
dcgm = results.get("dcgm")
|
||||
if dcgm and not dcgm.get("error"):
|
||||
lines.append("## DCGM Diagnostic\n")
|
||||
lines.append(f"**Overall: {'PASS' if dcgm.get('passed') else 'FAIL'}**\n")
|
||||
if dcgm.get("subtests"):
|
||||
lines.append("| Subtest | Status |")
|
||||
lines.append("|---------|--------|")
|
||||
for s in dcgm.get("subtests", []):
|
||||
lines.append(f"| {s.get('name', '')} | {s.get('status', '')} |")
|
||||
lines.append("")
|
||||
elif dcgm and dcgm.get("error"):
|
||||
lines.append("## DCGM Diagnostic\n")
|
||||
lines.append(f"**Overall: FAIL** ({dcgm.get('error')})\n")
|
||||
|
||||
# --- NCCL ---
|
||||
nccl = results.get("nccl")
|
||||
if nccl and not nccl.get("error"):
|
||||
lines.append("## NCCL Multi-GPU\n")
|
||||
lines.append(f"Source: {nccl.get('source', 'unknown')} | "
|
||||
f"GPUs: {nccl.get('gpu_count', '?')}\n")
|
||||
if nccl.get("source") == "torchrun_fallback":
|
||||
lines.append("> Functional NCCL smoke only: nccl-tests bus bandwidth was not measured, so this does not satisfy production acceptance.\n")
|
||||
tests = nccl.get("tests", {})
|
||||
if tests:
|
||||
lines.append("| Operation | Bus BW (GB/s) | Threshold | Status |")
|
||||
lines.append("|-----------|---------------|-----------|--------|")
|
||||
lines.append("> Summary reports the best Bus BW observed for each operation. PASS/FAIL is evaluated across every tested message size and repeat run shown in the detail table below.\n")
|
||||
lines.append("| Operation | Best Bus BW (GB/s) | Failed Sizes | Threshold | Status |")
|
||||
lines.append("|-----------|--------------------|--------------|-----------|--------|")
|
||||
for op, data in tests.items():
|
||||
if isinstance(data, dict) and not data.get("error"):
|
||||
bw = data.get("best_busbw_gbps", 0)
|
||||
req = data.get("min_required_gbps", 0)
|
||||
status = data.get("status", "?")
|
||||
lines.append(f"| {op} | {bw:.1f} | >= {req:.0f} | {status} |")
|
||||
failed_sizes = [
|
||||
str(row.get("size", "?"))
|
||||
for row in data.get("by_size", [])
|
||||
if row.get("status") != "PASS"
|
||||
]
|
||||
failed_sizes_text = ", ".join(failed_sizes) if failed_sizes else "-"
|
||||
lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {_format_gbps(req)} | {status} |")
|
||||
elif isinstance(data, dict) and data.get("error"):
|
||||
lines.append(f"| {op} | - | - | ERROR: {data['error']} |")
|
||||
lines.append(f"| {op} | - | - | - | ERROR: {data['error']} |")
|
||||
lines.append("")
|
||||
for op, data in tests.items():
|
||||
by_size = data.get("by_size", []) if isinstance(data, dict) else []
|
||||
if not by_size:
|
||||
continue
|
||||
lines.append(f"### NCCL {op} by size\n")
|
||||
lines.append("| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |")
|
||||
lines.append("|------|---------------------|-------|------|--------|-----------|--------|")
|
||||
for row in by_size:
|
||||
runs = ", ".join(str(v) for v in row.get("runs_busbw_gbps", []))
|
||||
lines.append(
|
||||
f"| {row.get('size', '')} | {runs} | "
|
||||
f"{row.get('worst_busbw_gbps', 0):.1f} | "
|
||||
f"{row.get('mean_busbw_gbps', 0):.1f} | "
|
||||
f"{row.get('stddev_pct', 0):.2f}% | "
|
||||
f">= {_format_gbps(data.get('min_required_gbps', 0))} | "
|
||||
f"{row.get('status', '?')} |"
|
||||
)
|
||||
lines.append("")
|
||||
passed = nccl.get("passed", False)
|
||||
lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")
|
||||
|
||||
multinode = results.get("multinode_nccl")
|
||||
if multinode and not multinode.get("error"):
|
||||
lines.append("## Multi-node NCCL / Cross Leaf\n")
|
||||
lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n")
|
||||
if multinode.get("artifact_dir"):
|
||||
lines.append(f"- **Artifacts:** `{multinode.get('artifact_dir')}`")
|
||||
hosts = multinode.get("hosts", [])
|
||||
if hosts:
|
||||
host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts)
|
||||
lines.append(f"- **Hosts:** {host_text}")
|
||||
preflight = multinode.get("preflight", {})
|
||||
if preflight.get("checks"):
|
||||
failed_checks = [c for c in preflight["checks"] if c.get("status") == "FAIL"]
|
||||
warn_checks = [c for c in preflight["checks"] if c.get("status") == "WARN"]
|
||||
lines.append(f"- **Preflight:** {'PASS' if not failed_checks else 'FAIL'}"
|
||||
f"{f' ({len(warn_checks)} warnings)' if warn_checks else ''}")
|
||||
lines.append("")
|
||||
for op, data in (multinode.get("tests") or {}).items():
|
||||
lines.append(f"### Multi-node NCCL {op}\n")
|
||||
lines.append("| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |")
|
||||
lines.append("|----------|----------------------|-------------|-----------|------------|-----------|--------|")
|
||||
for topo in data.get("topologies", []):
|
||||
threshold = topo.get("min_required_gbps", 0) or 0
|
||||
threshold_text = f">= {_format_gbps(threshold)} GB/s" if threshold else "-"
|
||||
cuda_visible = topo.get("cuda_visible_devices") or "-"
|
||||
lines.append(
|
||||
f"| {topo.get('label', '')} | {cuda_visible} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
|
||||
f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | "
|
||||
f"{threshold_text} | {topo.get('status', '?')} |"
|
||||
)
|
||||
lines.append("")
|
||||
diag_rows = []
|
||||
for topo in data.get("topologies", []):
|
||||
net = topo.get("network") or {}
|
||||
if net:
|
||||
diag_rows.append((topo, net))
|
||||
if diag_rows:
|
||||
lines.append("| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |")
|
||||
lines.append("|----------|--------------|-----------------|------------------|-------------------|")
|
||||
for topo, net in diag_rows:
|
||||
networks = ", ".join(net.get("networks") or []) or "unknown"
|
||||
gdr = net.get("gpu_direct_rdma", "UNKNOWN")
|
||||
enabled = ", ".join(net.get("gdr_enabled_hcas") or []) or "-"
|
||||
disabled = ", ".join(net.get("gdr_disabled_hcas") or []) or "-"
|
||||
lines.append(f"| {topo.get('label', '')} | {networks} | {gdr} | {enabled} | {disabled} |")
|
||||
lines.append("")
|
||||
failed_topos = [topo for topo in data.get("topologies", []) if topo.get("status") == "FAIL"]
|
||||
if failed_topos:
|
||||
lines.append("| Topology | Return Code | Error / Output Tail |")
|
||||
lines.append("|----------|-------------|---------------------|")
|
||||
for topo in failed_topos:
|
||||
tail = topo.get("error") or topo.get("stderr_tail") or topo.get("stdout_tail") or ""
|
||||
tail = str(tail).replace("\n", " ").replace("|", "\\|")[-240:]
|
||||
lines.append(f"| {topo.get('label', '')} | {topo.get('returncode', '')} | {tail} |")
|
||||
lines.append("")
|
||||
lines.append(f"**Overall: {'PASS' if multinode.get('passed') else 'FAIL'}**\n")
|
||||
elif multinode and multinode.get("error"):
|
||||
lines.append("## Multi-node NCCL / Cross Leaf\n")
|
||||
lines.append(f"**Overall: FAIL** ({multinode.get('error')})\n")
|
||||
preflight = multinode.get("preflight", {})
|
||||
if preflight.get("checks"):
|
||||
lines.append("| Check | Status | Detail |")
|
||||
lines.append("|-------|--------|--------|")
|
||||
for check in preflight["checks"]:
|
||||
detail = str(check.get("detail", "")).replace("\n", " ")
|
||||
lines.append(f"| {check.get('name', '')} | {check.get('status', '')} | {detail} |")
|
||||
lines.append("")
|
||||
|
||||
# --- Stress Test ---
|
||||
stress = results.get("stress")
|
||||
if stress and not stress.get("error"):
|
||||
@ -368,6 +542,21 @@ class ReportGenerator:
|
||||
source = stress.get("source", "unknown")
|
||||
lines.append(f"- **Source:** {source}")
|
||||
lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)")
|
||||
telemetry = stress.get("telemetry") or {}
|
||||
if telemetry:
|
||||
lines.append(f"- **Telemetry samples:** {telemetry.get('samples', 0)}")
|
||||
lines.append(f"- **Max temp:** {telemetry.get('max_temp_c', {})}")
|
||||
lines.append(f"- **Avg power:** {telemetry.get('avg_power_w', {})}")
|
||||
lines.append(f"- **Temp delta:** {telemetry.get('temp_delta_c', 'N/A')} C")
|
||||
lines.append(f"- **TFLOPS jitter:** {telemetry.get('tflops_jitter_pct', 'N/A')}%")
|
||||
lines.append(f"- **Steady TFLOPS samples:** {telemetry.get('steady_tflops_samples', 0)}")
|
||||
lines.append(f"- **Throttle events:** {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}")
|
||||
lines.append(f"- **XID events:** {len(telemetry.get('xid_events', []))}")
|
||||
failures = telemetry.get("failures") or []
|
||||
if failures:
|
||||
lines.append("- **Failure reasons:**")
|
||||
for reason in failures:
|
||||
lines.append(f" - {reason}")
|
||||
lines.append(f"- **Result: {'PASS' if passed else 'FAIL'}**")
|
||||
lines.append("")
|
||||
|
||||
@ -378,26 +567,70 @@ class ReportGenerator:
|
||||
lines.append(f"**Overall: SKIP** [{rdma.get('reason', 'no IB hardware detected')}]\n")
|
||||
elif rdma and not rdma.get("error"):
|
||||
lines.append("## RDMA/InfiniBand\n")
|
||||
rdma_legacy_note = self._rdma_legacy_note(rdma)
|
||||
if rdma_legacy_note:
|
||||
lines.append(f"> {rdma_legacy_note}\n")
|
||||
port_checks = rdma.get("port_checks", [])
|
||||
if port_checks:
|
||||
lines.append("### RDMA Port Checks\n")
|
||||
lines.append("| Device | Port | State | Rate | Required | Status |")
|
||||
lines.append("|--------|------|-------|------|----------|--------|")
|
||||
for p in port_checks:
|
||||
lines.append(
|
||||
f"| {p.get('device', '')} | {p.get('port', '')} | "
|
||||
f"{p.get('state', '')} | {p.get('rate', '')} | "
|
||||
f">= {p.get('min_rate_gbps', 400):.0f}Gbps ACTIVE | {p.get('status', '?')} |"
|
||||
)
|
||||
lines.append("")
|
||||
bw_tests = rdma.get("bandwidth_tests", [])
|
||||
lat_tests = rdma.get("latency_tests", [])
|
||||
if bw_tests or lat_tests:
|
||||
ibping_tests = rdma.get("ibping_tests", [])
|
||||
if bw_tests or lat_tests or ibping_tests:
|
||||
lines.append("| Test | Value | Threshold | Status |")
|
||||
lines.append("|------|-------|-----------|--------|")
|
||||
for bt in bw_tests:
|
||||
if not bt.get("error"):
|
||||
if bt.get("error"):
|
||||
lines.append(f"| {bt.get('test', 'ib_bw')} | {bt.get('error')} | required runnable test | {bt.get('status', 'FAIL')} |")
|
||||
else:
|
||||
threshold, status = self._rdma_bandwidth_verdict(bt)
|
||||
lines.append(f"| {bt['test']} | {bt.get('bandwidth_gbps', 0):.1f} GB/s | "
|
||||
f">= {bt.get('min_required_gbps', 0)} GB/s | {bt.get('status', '?')} |")
|
||||
f">= {threshold:g} GB/s | {status} |")
|
||||
for lt in lat_tests:
|
||||
if not lt.get("error"):
|
||||
if lt.get("error"):
|
||||
lines.append(f"| {lt.get('test', 'ib_lat')} | {lt.get('error')} | required runnable test | {lt.get('status', 'FAIL')} |")
|
||||
else:
|
||||
threshold, status = self._rdma_latency_verdict(lt)
|
||||
lines.append(f"| {lt['test']} | {lt.get('latency_us', 0):.2f} us | "
|
||||
f"<= {lt.get('max_allowed_us', 0)} us | {lt.get('status', '?')} |")
|
||||
f"<= {threshold:g} us | {status} |")
|
||||
for it in ibping_tests:
|
||||
direction = it.get("direction") or it.get("role", "N/A")
|
||||
if it.get("error"):
|
||||
lines.append(f"| {it.get('test', 'ibping')} | {it.get('error')} | bidirectional peer evidence | {it.get('status', 'FAIL')} |")
|
||||
else:
|
||||
lines.append(f"| {it['test']} | {direction} target={it.get('target', 'N/A')} count={it.get('count', 'N/A')} | "
|
||||
f"0% packet loss | {it.get('status', '?')} |")
|
||||
lines.append("")
|
||||
fabric = rdma.get("fabric_counters") or {}
|
||||
if fabric:
|
||||
counters = fabric.get("counters", {})
|
||||
lines.append(f"- **PFC/ECN/CNP/congestion counters checked:** {len(counters)}")
|
||||
lines.append(f"- **PFC/ECN/CNP/congestion non-zero:** {'yes' if fabric.get('failed') else 'no'}")
|
||||
if not counters:
|
||||
lines.append("- **PFC/ECN/CNP/congestion evidence:** missing")
|
||||
failures = rdma.get("failures") or []
|
||||
if not failures:
|
||||
failures = self._rdma_failure_reasons(rdma)
|
||||
if failures:
|
||||
lines.append("- **Failure reasons:**")
|
||||
for reason in failures:
|
||||
lines.append(f" - {reason}")
|
||||
passed = rdma.get("passed", False)
|
||||
lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")
|
||||
|
||||
# --- Training ---
|
||||
training = results.get("training")
|
||||
if training and not training.get("error"):
|
||||
training_status, training_detail, training_missing = self._training_verdict(training)
|
||||
lines.append("## Training Simulation\n")
|
||||
lines.append("| Metric | Value |")
|
||||
lines.append("|--------|-------|")
|
||||
@ -405,8 +638,14 @@ class ReportGenerator:
|
||||
lines.append(f"| Params | {training.get('total_params_m', 0):.1f}M |")
|
||||
lines.append(f"| Throughput | {training.get('throughput_tokens_per_sec', 0):.0f} tokens/sec |")
|
||||
lines.append(f"| Avg Step Time | {training.get('avg_step_time_ms', 0):.1f} ms |")
|
||||
lines.append(f"| Warmup Steps | {training.get('warmup_steps', 'N/A')} |")
|
||||
lines.append(f"| Peak Memory | {training.get('peak_memory_gb', 0):.1f} GB |")
|
||||
lines.append(f"| Final Loss | {training.get('final_loss', 'N/A')} |")
|
||||
lines.append(f"| Step Jitter | {training.get('step_jitter_pct', 'N/A')}% |")
|
||||
lines.append(f"| Distributed Mode | {training.get('distributed_mode', 'N/A')} |")
|
||||
if training_missing:
|
||||
lines.append(f"| Acceptance Gaps | missing {', '.join(training_missing)} |")
|
||||
lines.append(f"| Verdict | {training_status} ({training_detail}) |")
|
||||
lines.append("")
|
||||
|
||||
# --- Footer ---
|
||||
@ -441,6 +680,114 @@ class ReportGenerator:
|
||||
return bench["compute"]
|
||||
return {}
|
||||
|
||||
@staticmethod
|
||||
def _training_verdict(training: dict) -> tuple[str, str, list[str]]:
|
||||
"""Return report status for both current and legacy training result schemas."""
|
||||
tps = float(training.get("throughput_tokens_per_sec", 0) or 0)
|
||||
if "passed" in training:
|
||||
status = "PASS" if training.get("passed") else "FAIL"
|
||||
return status, f"{tps:.0f} tokens/sec", []
|
||||
|
||||
required = ["passed", "step_jitter_pct", "distributed_mode", "loss_finite"]
|
||||
missing = [k for k in required if k not in training]
|
||||
return "UNVERIFIED", f"{tps:.0f} tokens/sec; legacy result lacks explicit acceptance verdict", missing
|
||||
|
||||
def _rdma_cfg_value(self, key: str, default: float) -> float:
|
||||
try:
|
||||
return float((self.config.get("rdma", {}) or {}).get(key, default))
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
def _rdma_bandwidth_verdict(self, row: dict) -> tuple[float, str]:
|
||||
threshold = self._rdma_cfg_value("min_bandwidth_gbps", 47.0)
|
||||
value = float(row.get("bandwidth_gbps", 0) or 0)
|
||||
return threshold, "PASS" if value >= threshold else "FAIL"
|
||||
|
||||
def _rdma_latency_verdict(self, row: dict) -> tuple[float, str]:
|
||||
name = row.get("test", "")
|
||||
if name == "ib_write_lat":
|
||||
threshold = self._rdma_cfg_value("max_write_latency_us", 2.0)
|
||||
elif name == "ib_read_lat":
|
||||
threshold = self._rdma_cfg_value("max_read_latency_us", 3.5)
|
||||
else:
|
||||
threshold = self._rdma_cfg_value("max_latency_us", 3.5)
|
||||
value = float(row.get("latency_us", 0) or 0)
|
||||
return threshold, "PASS" if 0 < value <= threshold else "FAIL"
|
||||
|
||||
def _rdma_legacy_note(self, rdma: dict) -> str:
|
||||
"""Flag old RDMA result schemas whose embedded thresholds were looser."""
|
||||
for row in rdma.get("bandwidth_tests", []) or []:
|
||||
if row.get("min_required_gbps") != self._rdma_cfg_value("min_bandwidth_gbps", 47.0):
|
||||
return (
|
||||
"Legacy RDMA result re-evaluated with current PDF acceptance thresholds; "
|
||||
"old WARN statuses and old 50GB/s/10us limits are not used for verdict."
|
||||
)
|
||||
for row in rdma.get("latency_tests", []) or []:
|
||||
threshold, _ = self._rdma_latency_verdict(row)
|
||||
if row.get("max_allowed_us") != threshold:
|
||||
return (
|
||||
"Legacy RDMA result re-evaluated with current PDF acceptance thresholds; "
|
||||
"old WARN statuses and old 50GB/s/10us limits are not used for verdict."
|
||||
)
|
||||
return ""
|
||||
|
||||
def _rdma_failure_reasons(self, rdma: dict) -> list[str]:
|
||||
failures = []
|
||||
for row in rdma.get("bandwidth_tests", []) or []:
|
||||
threshold, status = self._rdma_bandwidth_verdict(row)
|
||||
if status != "PASS":
|
||||
failures.append(
|
||||
f"{row.get('test')} bandwidth {row.get('bandwidth_gbps', 0)}GB/s < {threshold:g}GB/s"
|
||||
)
|
||||
for row in rdma.get("latency_tests", []) or []:
|
||||
threshold, status = self._rdma_latency_verdict(row)
|
||||
if status != "PASS":
|
||||
failures.append(
|
||||
f"{row.get('test')} latency {row.get('latency_us', 0)}us > {threshold:g}us"
|
||||
)
|
||||
for row in rdma.get("ibping_tests", []) or []:
|
||||
if row.get("status") != "PASS":
|
||||
failures.append(f"{row.get('test')} failed")
|
||||
return failures
|
||||
|
||||
@staticmethod
|
||||
def _overall_acceptance_verdict(summary_items: list[tuple[str, str]]) -> tuple[str, list[tuple[str, str]], list[str]]:
|
||||
"""PDF-style verdict for the report scope.
|
||||
|
||||
Full-suite reports require every single-node acceptance item. Standalone
|
||||
reports, such as `--test multinode-nccl`, should only judge the items
|
||||
that were actually requested instead of reporting unrelated evidence as
|
||||
missing.
|
||||
"""
|
||||
single_node_required = [
|
||||
"GPU Info",
|
||||
"Health Check",
|
||||
"Memory Bandwidth",
|
||||
"Compute Throughput",
|
||||
"NVLink/NVSwitch",
|
||||
"NCCL",
|
||||
"Stress Test",
|
||||
"RDMA",
|
||||
"DCGM",
|
||||
"Training",
|
||||
]
|
||||
status_by_name = dict(summary_items)
|
||||
present_single_node = [name for name in single_node_required if name in status_by_name]
|
||||
if len(present_single_node) >= 3:
|
||||
required = list(single_node_required)
|
||||
if "Multi-node NCCL" in status_by_name:
|
||||
required.append("Multi-node NCCL")
|
||||
else:
|
||||
required = list(status_by_name)
|
||||
missing = [name for name in required if name not in status_by_name]
|
||||
failures = [
|
||||
(name, status)
|
||||
for name, status in summary_items
|
||||
if name in required and not str(status).startswith("PASS")
|
||||
]
|
||||
verdict = "PASS" if not missing and not failures else "FAIL"
|
||||
return verdict, failures, missing
|
||||
|
||||
def _build_summary(self, results: dict) -> list[tuple[str, str]]:
|
||||
"""Build summary verdict list from results."""
|
||||
items = []
|
||||
@ -473,7 +820,7 @@ class ReportGenerator:
|
||||
d2d = mem.get("d2d_bandwidth_gbps") or 0
|
||||
items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
|
||||
else:
|
||||
eff = mem.get("efficiency_pct") or 0
|
||||
eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0
|
||||
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL")
|
||||
items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))
|
||||
|
||||
@ -491,25 +838,43 @@ class ReportGenerator:
|
||||
rank = {"PASS": 0, "WARN": 1, "FAIL": 2}
|
||||
worst_status = "PASS"
|
||||
worst_dt = None
|
||||
lowest_margin = None
|
||||
for dt, thr in pass_thresholds.items():
|
||||
val = per_dtype.get(dt)
|
||||
if not isinstance(val, (int, float)):
|
||||
continue
|
||||
if val >= thr:
|
||||
st = "PASS"
|
||||
elif val >= thr * 0.9:
|
||||
st = "WARN"
|
||||
else:
|
||||
st = "FAIL"
|
||||
margin = val / thr if thr else 0
|
||||
if lowest_margin is None or margin < lowest_margin:
|
||||
lowest_margin = margin
|
||||
worst_dt = dt
|
||||
if rank[st] > rank[worst_status]:
|
||||
worst_status = st
|
||||
worst_dt = dt
|
||||
if worst_dt:
|
||||
items.append((
|
||||
"Compute Throughput",
|
||||
f"{worst_status} (worst {worst_dt.upper()} "
|
||||
f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
|
||||
))
|
||||
consistency = comp.get("consistency", {}) or {}
|
||||
failed_consistency = [
|
||||
(dt, row)
|
||||
for dt, row in consistency.items()
|
||||
if not row.get("passed", False)
|
||||
]
|
||||
if failed_consistency:
|
||||
worst_status = "FAIL"
|
||||
fail_dt, fail_row = failed_consistency[0]
|
||||
items.append((
|
||||
"Compute Throughput",
|
||||
f"FAIL ({fail_dt.upper()} spread "
|
||||
f"{fail_row.get('spread_pct', 0):.2f}% > "
|
||||
f"{fail_row.get('max_allowed_pct', 3)}%)"
|
||||
))
|
||||
else:
|
||||
items.append((
|
||||
"Compute Throughput",
|
||||
f"{worst_status} (worst {worst_dt.upper()} "
|
||||
f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
|
||||
))
|
||||
else:
|
||||
items.append(("Compute Throughput", f"{worst_status}"))
|
||||
else:
|
||||
@ -521,16 +886,46 @@ class ReportGenerator:
|
||||
else:
|
||||
items.append(("Compute Throughput", "N/A"))
|
||||
|
||||
# NCCL
|
||||
if "nvlink" in results:
|
||||
nvl = results["nvlink"]
|
||||
if nvl.get("error"):
|
||||
items.append(("NVLink/NVSwitch", f"ERROR: {nvl['error']}"))
|
||||
elif nvl.get("passed"):
|
||||
items.append(("NVLink/NVSwitch", "PASS"))
|
||||
else:
|
||||
items.append(("NVLink/NVSwitch", "FAIL"))
|
||||
|
||||
if "dcgm" in results:
|
||||
d = results["dcgm"]
|
||||
if d.get("error"):
|
||||
items.append(("DCGM", f"ERROR: {d['error']}"))
|
||||
elif d.get("passed"):
|
||||
items.append(("DCGM", "PASS"))
|
||||
else:
|
||||
items.append(("DCGM", "FAIL"))
|
||||
|
||||
# NCCL
|
||||
if "nccl" in results:
|
||||
n = results["nccl"]
|
||||
if n.get("error"):
|
||||
items.append(("NCCL", f"ERROR: {n['error']}"))
|
||||
elif n.get("source") == "torchrun_fallback":
|
||||
items.append(("NCCL", "FAIL (no nccl-tests bus BW)"))
|
||||
elif n.get("passed"):
|
||||
items.append(("NCCL", "PASS"))
|
||||
else:
|
||||
items.append(("NCCL", "FAIL"))
|
||||
|
||||
if "multinode_nccl" in results:
|
||||
mn = results["multinode_nccl"]
|
||||
if mn.get("error"):
|
||||
items.append(("Multi-node NCCL", f"ERROR: {mn['error']}"))
|
||||
elif mn.get("passed"):
|
||||
items.append(("Multi-node NCCL", "PASS"))
|
||||
else:
|
||||
items.append(("Multi-node NCCL", "FAIL"))
|
||||
|
||||
# Stress
|
||||
if "stress" in results:
|
||||
s = results["stress"]
|
||||
@ -559,7 +954,17 @@ class ReportGenerator:
|
||||
if t.get("error"):
|
||||
items.append(("Training", f"ERROR: {t['error']}"))
|
||||
else:
|
||||
tps = t.get("throughput_tokens_per_sec", 0)
|
||||
items.append(("Training", f"PASS ({tps:.0f} tokens/sec)"))
|
||||
status, detail, _missing = self._training_verdict(t)
|
||||
items.append(("Training", f"{status} ({detail})"))
|
||||
|
||||
return items
|
||||
|
||||
|
||||
def _format_gbps(value) -> str:
|
||||
try:
|
||||
numeric = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return str(value)
|
||||
if numeric.is_integer():
|
||||
return f"{numeric:.0f}"
|
||||
return f"{numeric:.2f}"
|
||||
|
||||
@ -1,9 +1,10 @@
|
||||
"""GPU stress test module — wraps gpu-burn for long-running stability tests."""
|
||||
"""GPU stress test module — gpu-burn or PyTorch GEMM with telemetry."""
|
||||
|
||||
import glob
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
@ -46,7 +47,7 @@ class StressTest:
|
||||
memory_pct = cfg.get("memory_pct", 90)
|
||||
target_gpus = cfg.get("gpus", "all")
|
||||
|
||||
gpu_burn = self._find_gpu_burn()
|
||||
gpu_burn = self._find_gpu_burn() if cfg.get("use_gpu_burn", False) else ""
|
||||
|
||||
if gpu_burn:
|
||||
# Try gpu-burn first
|
||||
@ -60,7 +61,7 @@ class StressTest:
|
||||
|
||||
return result
|
||||
|
||||
self.console.print("[yellow]gpu_burn not found, using PyTorch stress test[/yellow]")
|
||||
self.console.print("[yellow]Using PyTorch stress test[/yellow]")
|
||||
return self._run_pytorch_stress(duration_sec, memory_pct)
|
||||
|
||||
def _run_gpu_burn(self, gpu_burn: str, duration: int,
|
||||
@ -77,12 +78,26 @@ class StressTest:
|
||||
cmd.append(str(duration))
|
||||
|
||||
t0 = time.time()
|
||||
xid_before = self._collect_xid_events()
|
||||
interval = int(self.stress_cfg.get("telemetry_interval_sec", 1))
|
||||
telemetry = []
|
||||
stop_sampling = threading.Event()
|
||||
sampler = threading.Thread(
|
||||
target=self._sample_telemetry,
|
||||
args=(telemetry, stop_sampling, interval),
|
||||
daemon=True,
|
||||
)
|
||||
sampler.start()
|
||||
try:
|
||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=duration + 120)
|
||||
elapsed = round(time.time() - t0, 1)
|
||||
stop_sampling.set()
|
||||
sampler.join(timeout=interval + 1)
|
||||
|
||||
output = r.stdout + r.stderr
|
||||
passed = r.returncode == 0
|
||||
xid_events = self._new_xid_events(xid_before, self._collect_xid_events())
|
||||
telemetry_summary = self._evaluate_telemetry(telemetry, [], xid_events)
|
||||
passed = r.returncode == 0 and telemetry_summary.get("passed", False)
|
||||
|
||||
gpu_results = []
|
||||
for line in output.split("\n"):
|
||||
@ -96,25 +111,36 @@ class StressTest:
|
||||
"duration_sec": duration,
|
||||
"elapsed_sec": elapsed,
|
||||
"gpu_results": gpu_results,
|
||||
"telemetry": telemetry_summary,
|
||||
"raw_output_tail": output[-500:] if output else "",
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
stop_sampling.set()
|
||||
return {
|
||||
"source": "gpu-burn",
|
||||
"passed": False,
|
||||
"duration_sec": duration,
|
||||
"error": "timeout",
|
||||
"telemetry": self._evaluate_telemetry(
|
||||
telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events())
|
||||
),
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
except Exception as e:
|
||||
stop_sampling.set()
|
||||
return {
|
||||
"source": "gpu-burn",
|
||||
"passed": False,
|
||||
"error": str(e),
|
||||
"telemetry": self._evaluate_telemetry(
|
||||
telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events())
|
||||
),
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
finally:
|
||||
stop_sampling.set()
|
||||
|
||||
def _run_pytorch_stress(self, duration: int, memory_pct: int = 90) -> dict:
|
||||
try:
|
||||
@ -127,58 +153,79 @@ class StressTest:
|
||||
gpu_count = torch.cuda.device_count()
|
||||
self.console.print(f"[cyan]PyTorch Stress Test ({duration}s, {gpu_count} GPUs, target {memory_pct}% memory)[/cyan]")
|
||||
|
||||
dtype_name = self.stress_cfg.get("dtype", "bf16")
|
||||
matrix_size = int(self.stress_cfg.get("matrix_size", 8192))
|
||||
interval = int(self.stress_cfg.get("telemetry_interval_sec", 1))
|
||||
dtype_map = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
|
||||
dtype = dtype_map.get(dtype_name, torch.bfloat16)
|
||||
|
||||
gpu_status = {}
|
||||
telemetry = []
|
||||
stop_sampling = threading.Event()
|
||||
t0 = time.time()
|
||||
xid_before = self._collect_xid_events()
|
||||
|
||||
try:
|
||||
sampler = threading.Thread(
|
||||
target=self._sample_telemetry,
|
||||
args=(telemetry, stop_sampling, interval),
|
||||
daemon=True,
|
||||
)
|
||||
sampler.start()
|
||||
tensors = {}
|
||||
ballast = {}
|
||||
pass_tflops = []
|
||||
for i in range(gpu_count):
|
||||
with torch.cuda.device(i):
|
||||
# Get actual free memory (accounting for other processes)
|
||||
free_mem, total_mem = torch.cuda.mem_get_info(i)
|
||||
|
||||
# Calculate allocation from configured memory_pct
|
||||
target_mem = int(total_mem * memory_pct / 100)
|
||||
|
||||
# Cap at actual free memory with 5% safety margin
|
||||
alloc_bytes = min(target_mem, int(free_mem * 0.95))
|
||||
|
||||
# matmul(A, A.T) needs 2x input memory (input + output)
|
||||
mem_side = int((alloc_bytes / 4 / 2) ** 0.5)
|
||||
# Cap compute matrix so a single matmul completes in ~2s on H100/H200
|
||||
# (FP32 ≈ 67 TFLOPS → 2*4096³/67e12 ≈ 2s). Without this cap, a 141GB
|
||||
# HBM yields side ≈ 131K → single matmul ~68s × 8 GPUs serial → loop
|
||||
# overshoots a 60s duration request by 10×+.
|
||||
MAX_COMPUTE_SIDE = 4096
|
||||
side = min(mem_side, MAX_COMPUTE_SIDE)
|
||||
|
||||
actual_mem_mb = side * side * 4 / 1024 / 1024
|
||||
side = matrix_size
|
||||
elem = torch.tensor([], dtype=dtype).element_size()
|
||||
compute_bytes = side * side * elem * 3
|
||||
target_mem = min(int(total_mem * memory_pct / 100), int(free_mem * 0.90))
|
||||
ballast_bytes = max(0, target_mem - compute_bytes)
|
||||
if ballast_bytes:
|
||||
ballast_elems = ballast_bytes // 2
|
||||
ballast[i] = torch.empty(ballast_elems, device=f"cuda:{i}", dtype=torch.float16)
|
||||
actual_mem_mb = (compute_bytes + ballast_bytes) / 1024 / 1024
|
||||
total_mem_mb = total_mem / 1024 / 1024
|
||||
free_mem_mb = free_mem / 1024 / 1024
|
||||
|
||||
|
||||
self.console.print(
|
||||
f" [dim]GPU {i}: total {total_mem_mb:.0f}MB, free {free_mem_mb:.0f}MB, "
|
||||
f"alloc {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - "
|
||||
f"matrix {side}x{side}[/dim]"
|
||||
f"{dtype_name} matrix {side}x{side}[/dim]"
|
||||
)
|
||||
tensors[i] = (
|
||||
torch.randn(side, side, device=f"cuda:{i}", dtype=dtype),
|
||||
torch.randn(side, side, device=f"cuda:{i}", dtype=dtype),
|
||||
torch.empty(side, side, device=f"cuda:{i}", dtype=dtype),
|
||||
)
|
||||
tensors[i] = torch.randn(side, side, device=f"cuda:{i}", dtype=torch.float32)
|
||||
|
||||
self.console.print(f"\n[cyan]Starting stress test for {duration} seconds...[/cyan]")
|
||||
|
||||
elapsed_check = 0
|
||||
while time.time() - t0 < duration:
|
||||
loop_start = time.perf_counter()
|
||||
# Dispatch matmul on all GPUs in parallel — do NOT synchronize between
|
||||
# GPUs, otherwise the 8 GPUs run serially and overshoot the duration.
|
||||
for i in range(gpu_count):
|
||||
with torch.cuda.device(i):
|
||||
tensors[i] = torch.matmul(tensors[i], tensors[i].T)
|
||||
a, b, out = tensors[i]
|
||||
torch.matmul(a, b, out=out)
|
||||
# Single sync per pass — waits for all 8 streams concurrently
|
||||
for i in range(gpu_count):
|
||||
with torch.cuda.device(i):
|
||||
torch.cuda.synchronize()
|
||||
loop_elapsed = time.perf_counter() - loop_start
|
||||
current_elapsed = time.time() - t0
|
||||
if loop_elapsed > 0:
|
||||
flops = gpu_count * 2 * (matrix_size ** 3)
|
||||
pass_tflops.append({
|
||||
"elapsed_sec": current_elapsed,
|
||||
"tflops": flops / loop_elapsed / 1e12,
|
||||
})
|
||||
|
||||
# Show progress every 10 seconds
|
||||
current_elapsed = time.time() - t0
|
||||
if int(current_elapsed) != int(elapsed_check) and int(current_elapsed) % 10 == 0:
|
||||
self.console.print(f" [dim]Running {int(current_elapsed)}s / {duration}s[/dim]")
|
||||
elapsed_check = current_elapsed
|
||||
@ -198,21 +245,196 @@ class StressTest:
|
||||
"duration_sec": duration,
|
||||
"error": error_msg,
|
||||
"gpu_status": gpu_status,
|
||||
"telemetry": self._evaluate_telemetry(
|
||||
telemetry, pass_tflops if "pass_tflops" in locals() else [],
|
||||
self._new_xid_events(xid_before, self._collect_xid_events()),
|
||||
),
|
||||
}
|
||||
finally:
|
||||
stop_sampling.set()
|
||||
tensors.clear()
|
||||
ballast.clear()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
elapsed = round(time.time() - t0, 1)
|
||||
xid_events = self._new_xid_events(xid_before, self._collect_xid_events())
|
||||
telemetry_summary = self._evaluate_telemetry(telemetry, pass_tflops, xid_events)
|
||||
passed = all(v == "PASS" for v in gpu_status.values()) and telemetry_summary.get("passed", False)
|
||||
return {
|
||||
"source": "pytorch",
|
||||
"passed": True,
|
||||
"passed": passed,
|
||||
"duration_sec": duration,
|
||||
"elapsed_sec": elapsed,
|
||||
"gpu_status": gpu_status,
|
||||
"telemetry": telemetry_summary,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
def _sample_telemetry(self, telemetry: list, stop_event: threading.Event, interval: int):
|
||||
query = "index,temperature.gpu,power.draw,clocks_throttle_reasons.active"
|
||||
while not stop_event.is_set():
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["nvidia-smi", f"--query-gpu={query}", "--format=csv,noheader,nounits"],
|
||||
capture_output=True, text=True, timeout=10,
|
||||
)
|
||||
if r.returncode == 0:
|
||||
sample = {"time": time.time(), "gpus": []}
|
||||
for line in r.stdout.splitlines():
|
||||
parts = [p.strip() for p in line.split(",")]
|
||||
if len(parts) >= 4:
|
||||
sample["gpus"].append({
|
||||
"index": int(parts[0]),
|
||||
"temp_c": float(parts[1]),
|
||||
"power_w": float(parts[2]),
|
||||
"throttle": parts[3],
|
||||
})
|
||||
telemetry.append(sample)
|
||||
except Exception:
|
||||
pass
|
||||
stop_event.wait(interval)
|
||||
|
||||
def _collect_xid_events(self) -> list[str]:
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["dmesg", "--color=never"],
|
||||
capture_output=True, text=True, timeout=10,
|
||||
)
|
||||
if r.returncode != 0:
|
||||
return []
|
||||
return [
|
||||
line.strip()
|
||||
for line in r.stdout.splitlines()
|
||||
if any(token in line.upper() for token in ("XID", "NVRM: XID"))
|
||||
]
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
@staticmethod
|
||||
def _new_xid_events(before: list[str], after: list[str]) -> list[str]:
|
||||
seen = set(before)
|
||||
return [line for line in after if line not in seen]
|
||||
|
||||
def _evaluate_telemetry(self, telemetry: list, pass_tflops: list, xid_events: list[str] | None = None) -> dict:
|
||||
cfg = self.stress_cfg
|
||||
max_temp = float(cfg.get("max_temp_c", 80))
|
||||
max_delta = float(cfg.get("max_temp_delta_c", 5))
|
||||
min_power = float(cfg.get("min_power_watts", 630))
|
||||
max_jitter = float(cfg.get("max_tflops_jitter_pct", 5))
|
||||
require_jitter = bool(cfg.get("require_tflops_jitter", True))
|
||||
duration = float(cfg.get("duration_sec", 60))
|
||||
requested_warmup = float(cfg.get("warmup_sec", 60))
|
||||
warmup_sec = min(requested_warmup, max(0.0, duration * 0.2))
|
||||
min_steady_samples = int(cfg.get("min_steady_samples", 10))
|
||||
temps = {}
|
||||
powers = {}
|
||||
throttle_bad = []
|
||||
xid_events = xid_events or []
|
||||
steady_telemetry = [
|
||||
sample for sample in telemetry
|
||||
if sample.get("time", 0) - telemetry[0].get("time", 0) >= warmup_sec
|
||||
] if telemetry else []
|
||||
evaluation_samples = steady_telemetry if len(steady_telemetry) >= min_steady_samples else telemetry
|
||||
for sample in evaluation_samples:
|
||||
for g in sample.get("gpus", []):
|
||||
idx = g["index"]
|
||||
temps.setdefault(idx, []).append(g["temp_c"])
|
||||
powers.setdefault(idx, []).append(g["power_w"])
|
||||
try:
|
||||
bitmask = int(str(g["throttle"]), 16)
|
||||
except ValueError:
|
||||
bitmask = 0
|
||||
real_throttle = bitmask & ~0x1
|
||||
if real_throttle:
|
||||
throttle_bad.append({
|
||||
"gpu": idx,
|
||||
"throttle": g["throttle"],
|
||||
"real_throttle": f"0x{real_throttle:x}",
|
||||
})
|
||||
max_temps = {idx: max(vals) for idx, vals in temps.items() if vals}
|
||||
avg_powers = {idx: sum(vals) / len(vals) for idx, vals in powers.items() if vals}
|
||||
temp_delta = (max(max_temps.values()) - min(max_temps.values())) if len(max_temps) >= 2 else 0
|
||||
jitter = 0
|
||||
steady_tflops = []
|
||||
for item in pass_tflops:
|
||||
if isinstance(item, dict):
|
||||
if float(item.get("elapsed_sec", 0)) >= warmup_sec:
|
||||
steady_tflops.append(float(item.get("tflops", 0)))
|
||||
else:
|
||||
steady_tflops.append(float(item))
|
||||
if len(steady_tflops) < 2 and pass_tflops:
|
||||
steady_tflops = [
|
||||
float(item.get("tflops", 0)) if isinstance(item, dict) else float(item)
|
||||
for item in pass_tflops
|
||||
]
|
||||
if steady_tflops:
|
||||
mean = sum(steady_tflops) / len(steady_tflops)
|
||||
jitter = max(abs(v - mean) / mean * 100 for v in steady_tflops) if mean else 0
|
||||
failures = []
|
||||
temp_failures = {idx: v for idx, v in max_temps.items() if v > max_temp}
|
||||
power_failures = {idx: v for idx, v in avg_powers.items() if v < min_power}
|
||||
if not evaluation_samples:
|
||||
failures.append("no telemetry samples available for evaluation")
|
||||
if temp_failures:
|
||||
failures.append(
|
||||
"max temperature above threshold: "
|
||||
+ ", ".join(f"GPU {idx} {val:.1f}C" for idx, val in sorted(temp_failures.items()))
|
||||
)
|
||||
if temp_delta > max_delta:
|
||||
failures.append(f"GPU temperature delta {temp_delta:.1f}C exceeds {max_delta:.1f}C")
|
||||
if power_failures:
|
||||
failures.append(
|
||||
"average steady-state power below threshold: "
|
||||
+ ", ".join(f"GPU {idx} {val:.1f}W" for idx, val in sorted(power_failures.items()))
|
||||
)
|
||||
if throttle_bad:
|
||||
failures.append(
|
||||
f"non-idle throttle reasons observed in {len(throttle_bad)} samples "
|
||||
f"(first: GPU {throttle_bad[0]['gpu']} {throttle_bad[0]['real_throttle']})"
|
||||
)
|
||||
if xid_events:
|
||||
failures.append(f"{len(xid_events)} new XID/NVRM XID events observed")
|
||||
if require_jitter and len(steady_tflops) < 2:
|
||||
failures.append(
|
||||
f"insufficient steady TFLOPS samples for jitter evaluation: {len(steady_tflops)} < 2"
|
||||
)
|
||||
if jitter > max_jitter:
|
||||
failures.append(f"TFLOPS jitter {jitter:.2f}% exceeds {max_jitter:.2f}%")
|
||||
passed = (
|
||||
bool(evaluation_samples)
|
||||
and all(v <= max_temp for v in max_temps.values())
|
||||
and temp_delta <= max_delta
|
||||
and all(v >= min_power for v in avg_powers.values())
|
||||
and not throttle_bad
|
||||
and not xid_events
|
||||
and (not require_jitter or len(steady_tflops) >= 2)
|
||||
and jitter <= max_jitter
|
||||
)
|
||||
return {
|
||||
"passed": passed,
|
||||
"samples": len(telemetry),
|
||||
"steady_samples": len(evaluation_samples),
|
||||
"warmup_sec": round(warmup_sec, 1),
|
||||
"max_temp_c": {k: round(v, 1) for k, v in max_temps.items()},
|
||||
"avg_power_w": {k: round(v, 1) for k, v in avg_powers.items()},
|
||||
"temp_delta_c": round(temp_delta, 1),
|
||||
"throttle_events": throttle_bad[:20],
|
||||
"throttle_event_count": len(throttle_bad),
|
||||
"xid_events": xid_events[-20:],
|
||||
"tflops_jitter_pct": round(jitter, 2),
|
||||
"steady_tflops_samples": len(steady_tflops),
|
||||
"failures": failures,
|
||||
"thresholds": {
|
||||
"max_temp_c": max_temp,
|
||||
"max_temp_delta_c": max_delta,
|
||||
"min_power_w": min_power,
|
||||
"max_tflops_jitter_pct": max_jitter,
|
||||
"require_tflops_jitter": require_jitter,
|
||||
"warmup_sec": requested_warmup,
|
||||
"min_steady_samples": min_steady_samples,
|
||||
},
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def print_results(results: dict, console: Console = None):
|
||||
c = console or Console()
|
||||
@ -245,5 +467,21 @@ class StressTest:
|
||||
color = "green" if status == "PASS" else "red"
|
||||
c.print(f" GPU {gid}: [{color}]{status}[/{color}]")
|
||||
|
||||
telemetry = results.get("telemetry") or {}
|
||||
if telemetry:
|
||||
c.print("\n Telemetry:")
|
||||
c.print(f" Samples: {telemetry.get('samples', 0)} total, {telemetry.get('steady_samples', 0)} evaluated after {telemetry.get('warmup_sec', 0)}s warmup")
|
||||
c.print(f" Avg steady power: {telemetry.get('avg_power_w', {})}")
|
||||
c.print(f" Max steady temp: {telemetry.get('max_temp_c', {})}")
|
||||
c.print(f" Temp delta: {telemetry.get('temp_delta_c', 'N/A')} C")
|
||||
c.print(f" TFLOPS jitter: {telemetry.get('tflops_jitter_pct', 'N/A')}%")
|
||||
c.print(f" Throttle events: {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}")
|
||||
c.print(f" XID events: {len(telemetry.get('xid_events', []))}")
|
||||
failures = telemetry.get("failures", [])
|
||||
if failures:
|
||||
c.print(" [red]Failure reasons:[/red]")
|
||||
for reason in failures:
|
||||
c.print(f" [red]- {reason}[/red]")
|
||||
|
||||
if results.get("error"):
|
||||
c.print(f" [red]Error: {results['error']}[/red]")
|
||||
|
||||
@ -1,8 +1,13 @@
|
||||
"""Training simulation module - LLM training workload with PyTorch."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import subprocess
|
||||
import shutil
|
||||
import math
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
@ -36,6 +41,7 @@ class TrainingSim:
|
||||
batch_size = self.train_cfg.get("batch_size", 8)
|
||||
seq_length = self.train_cfg.get("seq_length", 2048)
|
||||
num_steps = self.train_cfg.get("num_steps", 50)
|
||||
warmup_steps = int(self.train_cfg.get("warmup_steps", 5))
|
||||
dtype_str = self.train_cfg.get("dtype", "bf16")
|
||||
|
||||
dtype_map = {
|
||||
@ -47,7 +53,13 @@ class TrainingSim:
|
||||
|
||||
self.console.print(f"[cyan]Training Simulation[/cyan]")
|
||||
self.console.print(f" Model: {model_name} | Batch: {batch_size} | Seq: {seq_length} | "
|
||||
f"DType: {dtype_str} | Steps: {num_steps} | GPUs: {gpu_count}")
|
||||
f"DType: {dtype_str} | Steps: {num_steps} | Warmup: {warmup_steps} | GPUs: {gpu_count}")
|
||||
|
||||
if self.train_cfg.get("mode", "ddp") == "ddp" and gpu_count > 1:
|
||||
ddp_result = self._run_synthetic_ddp(gpu_count, batch_size, seq_length, num_steps, dtype_str)
|
||||
if ddp_result.get("passed") or not self.train_cfg.get("allow_fallback", False):
|
||||
return ddp_result
|
||||
self.console.print("[yellow]DDP synthetic training failed, falling back to single-process synthetic path[/yellow]")
|
||||
|
||||
try:
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
@ -87,9 +99,10 @@ class TrainingSim:
|
||||
BarColumn(), TextColumn("{task.completed}/{task.total}"),
|
||||
TimeElapsedColumn(), console=self.console,
|
||||
) as progress:
|
||||
task = progress.add_task("Training steps...", total=num_steps)
|
||||
total_steps = num_steps + warmup_steps
|
||||
task = progress.add_task("Training steps...", total=total_steps)
|
||||
|
||||
for step in range(num_steps):
|
||||
for step in range(total_steps):
|
||||
torch.cuda.synchronize()
|
||||
t0 = time.perf_counter()
|
||||
|
||||
@ -119,8 +132,15 @@ class TrainingSim:
|
||||
|
||||
progress.advance(task)
|
||||
|
||||
avg_step_time = sum(step_times) / len(step_times)
|
||||
measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
|
||||
avg_step_time = sum(measured_steps) / len(measured_steps)
|
||||
throughput = batch_size * seq_length / avg_step_time
|
||||
jitter = self._jitter_pct(measured_steps)
|
||||
peak_mem = round(max(mem_usage) if mem_usage else 0, 2)
|
||||
final_loss = float(loss.item()) if hasattr(loss, "item") else float("nan")
|
||||
passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss)
|
||||
if self.train_cfg.get("require_distributed", True):
|
||||
passed = False
|
||||
|
||||
return {
|
||||
"model": model_name,
|
||||
@ -130,11 +150,18 @@ class TrainingSim:
|
||||
"batch_size": batch_size,
|
||||
"seq_length": seq_length,
|
||||
"num_steps": num_steps,
|
||||
"warmup_steps": warmup_steps,
|
||||
"total_steps": total_steps,
|
||||
"avg_step_time_ms": round(avg_step_time * 1000, 1),
|
||||
"throughput_tokens_per_sec": round(throughput, 0),
|
||||
"throughput_samples_per_sec": round(batch_size / avg_step_time, 2),
|
||||
"peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2),
|
||||
"final_loss": round(loss.item(), 4) if hasattr(loss, 'item') else None,
|
||||
"peak_memory_gb": peak_mem,
|
||||
"final_loss": round(final_loss, 4),
|
||||
"step_jitter_pct": round(jitter, 2),
|
||||
"distributed_mode": "device_map",
|
||||
"loss_finite": math.isfinite(final_loss),
|
||||
"passed": passed,
|
||||
"acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "",
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
@ -142,6 +169,196 @@ class TrainingSim:
|
||||
self.console.print(f"[yellow]Model loading failed: {e}[/yellow]")
|
||||
return self._run_synthetic(gpu_count, batch_size, seq_length, num_steps, dtype)
|
||||
|
||||
def _run_synthetic_ddp(self, gpu_count: int, batch_size: int, seq_length: int,
|
||||
num_steps: int, dtype_str: str) -> dict:
|
||||
"""Run the 1.5B synthetic Transformer with one process per GPU."""
|
||||
torchrun = os.path.join(os.path.dirname(sys.executable), "torchrun")
|
||||
if not os.path.isfile(torchrun):
|
||||
torchrun = shutil.which("torchrun") or ""
|
||||
if not torchrun:
|
||||
return {
|
||||
"model": "synthetic_transformer_1.5b",
|
||||
"gpu_count": gpu_count,
|
||||
"distributed_mode": "ddp",
|
||||
"passed": False,
|
||||
"error": "torchrun not found",
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
script = r'''
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import time
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
|
||||
def main():
|
||||
local_rank = int(os.environ["LOCAL_RANK"])
|
||||
world_size = int(os.environ["WORLD_SIZE"])
|
||||
torch.cuda.set_device(local_rank)
|
||||
dist.init_process_group("nccl")
|
||||
|
||||
global_batch = int(os.environ["TRAIN_BATCH_SIZE"])
|
||||
local_batch = max(1, global_batch // world_size)
|
||||
seq_length = int(os.environ["TRAIN_SEQ_LENGTH"])
|
||||
num_steps = int(os.environ["TRAIN_NUM_STEPS"])
|
||||
warmup_steps = int(os.environ.get("TRAIN_WARMUP_STEPS", "5"))
|
||||
total_steps = num_steps + warmup_steps
|
||||
dtype_name = os.environ.get("TRAIN_DTYPE", "bf16")
|
||||
dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}.get(dtype_name, torch.bfloat16)
|
||||
|
||||
hidden_size = 4096
|
||||
num_layers = 6
|
||||
num_heads = 32
|
||||
vocab_size = 32000
|
||||
|
||||
class SyntheticTransformer(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.embed = torch.nn.Embedding(vocab_size, hidden_size)
|
||||
self.layers = torch.nn.ModuleList([
|
||||
torch.nn.TransformerEncoderLayer(
|
||||
d_model=hidden_size,
|
||||
nhead=num_heads,
|
||||
dim_feedforward=hidden_size * 4,
|
||||
batch_first=True,
|
||||
dtype=dtype,
|
||||
) for _ in range(num_layers)
|
||||
])
|
||||
self.head = torch.nn.Linear(hidden_size, vocab_size, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
h = self.embed(x).to(dtype)
|
||||
for layer in self.layers:
|
||||
h = layer(h)
|
||||
return self.head(h)
|
||||
|
||||
model = SyntheticTransformer().cuda()
|
||||
total_params = sum(p.numel() for p in model.parameters())
|
||||
model = DDP(model, device_ids=[local_rank], output_device=local_rank)
|
||||
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
|
||||
input_ids = torch.randint(0, vocab_size, (local_batch, seq_length), device="cuda")
|
||||
step_times = []
|
||||
last_loss = torch.tensor(float("nan"), device="cuda")
|
||||
torch.cuda.reset_peak_memory_stats(local_rank)
|
||||
|
||||
for _ in range(total_steps):
|
||||
torch.cuda.synchronize()
|
||||
t0 = time.perf_counter()
|
||||
with torch.amp.autocast("cuda", dtype=dtype, enabled=dtype in (torch.float16, torch.bfloat16)):
|
||||
logits = model(input_ids)
|
||||
loss = torch.nn.functional.cross_entropy(logits.reshape(-1, vocab_size), input_ids.reshape(-1))
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
optimizer.zero_grad(set_to_none=True)
|
||||
torch.cuda.synchronize()
|
||||
step_times.append(time.perf_counter() - t0)
|
||||
last_loss = loss.detach()
|
||||
|
||||
peak_mem = torch.tensor(torch.cuda.max_memory_allocated(local_rank) / 1024**3, device="cuda")
|
||||
dist.all_reduce(peak_mem, op=dist.ReduceOp.MAX)
|
||||
finite = torch.tensor(1 if math.isfinite(float(last_loss.item())) else 0, device="cuda")
|
||||
dist.all_reduce(finite, op=dist.ReduceOp.MIN)
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
|
||||
avg_step = sum(measured_steps) / len(measured_steps)
|
||||
mean = avg_step
|
||||
jitter = max(abs(v - mean) / mean * 100 for v in measured_steps) if mean else 0.0
|
||||
throughput = global_batch * seq_length / avg_step if avg_step else 0.0
|
||||
print("TRAINING_DDP_JSON=" + json.dumps({
|
||||
"model": "synthetic_transformer_1.5b",
|
||||
"total_params_m": round(total_params / 1e6, 1),
|
||||
"num_layers": num_layers,
|
||||
"hidden_size": hidden_size,
|
||||
"gpu_count": world_size,
|
||||
"dtype": dtype_name,
|
||||
"batch_size": global_batch,
|
||||
"local_batch_size": local_batch,
|
||||
"seq_length": seq_length,
|
||||
"num_steps": num_steps,
|
||||
"warmup_steps": warmup_steps,
|
||||
"total_steps": total_steps,
|
||||
"avg_step_time_ms": round(avg_step * 1000, 1),
|
||||
"throughput_tokens_per_sec": round(throughput, 0),
|
||||
"throughput_samples_per_sec": round(global_batch / avg_step, 2) if avg_step else 0,
|
||||
"peak_memory_gb": round(float(peak_mem.item()), 2),
|
||||
"final_loss": round(float(last_loss.item()), 4),
|
||||
"step_jitter_pct": round(jitter, 2),
|
||||
"distributed_mode": "ddp",
|
||||
"loss_finite": bool(int(finite.item())),
|
||||
}), flush=True)
|
||||
dist.destroy_process_group()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
'''
|
||||
tmp = tempfile.NamedTemporaryFile("w", suffix="_training_ddp.py", delete=False)
|
||||
tmp.write(script)
|
||||
tmp.close()
|
||||
|
||||
env = {
|
||||
**os.environ,
|
||||
"TRAIN_BATCH_SIZE": str(batch_size),
|
||||
"TRAIN_SEQ_LENGTH": str(seq_length),
|
||||
"TRAIN_NUM_STEPS": str(num_steps),
|
||||
"TRAIN_WARMUP_STEPS": str(int(self.train_cfg.get("warmup_steps", 5))),
|
||||
"TRAIN_DTYPE": dtype_str,
|
||||
"NCCL_DEBUG": os.environ.get("NCCL_DEBUG", "WARN"),
|
||||
}
|
||||
cmd = [torchrun, f"--nproc_per_node={gpu_count}", tmp.name]
|
||||
self.console.print(f" Running synthetic 1.5B DDP via torchrun ({gpu_count} processes)...")
|
||||
try:
|
||||
timeout = int(self.train_cfg.get("timeout_sec", max(600, num_steps * 180)))
|
||||
r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=env)
|
||||
except subprocess.TimeoutExpired:
|
||||
os.unlink(tmp.name)
|
||||
return {
|
||||
"model": "synthetic_transformer_1.5b",
|
||||
"gpu_count": gpu_count,
|
||||
"distributed_mode": "ddp",
|
||||
"passed": False,
|
||||
"error": "training_ddp_timeout",
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
finally:
|
||||
if os.path.exists(tmp.name):
|
||||
try:
|
||||
os.unlink(tmp.name)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
marker = "TRAINING_DDP_JSON="
|
||||
payload = None
|
||||
for line in (r.stdout + "\n" + r.stderr).splitlines():
|
||||
if marker in line:
|
||||
payload = line.split(marker, 1)[1].strip()
|
||||
if r.returncode != 0 or not payload:
|
||||
return {
|
||||
"model": "synthetic_transformer_1.5b",
|
||||
"gpu_count": gpu_count,
|
||||
"distributed_mode": "ddp",
|
||||
"passed": False,
|
||||
"error": (r.stderr or r.stdout or "training_ddp_failed")[-1000:],
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
result = json.loads(payload)
|
||||
loss_value = float(result.get("final_loss", "nan"))
|
||||
passed = self._acceptance_pass(
|
||||
float(result.get("throughput_tokens_per_sec", 0)),
|
||||
float(result.get("step_jitter_pct", 999)),
|
||||
float(result.get("peak_memory_gb", 999)),
|
||||
loss_value,
|
||||
) and bool(result.get("loss_finite", False)) and result.get("gpu_count") == gpu_count
|
||||
result.update({
|
||||
"passed": passed,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
})
|
||||
return result
|
||||
|
||||
def _run_synthetic(self, gpu_count, batch_size, seq_length, num_steps, dtype) -> dict:
|
||||
self.console.print(" Running synthetic training benchmark...")
|
||||
|
||||
@ -170,11 +387,17 @@ class TrainingSim:
|
||||
h = layer(h)
|
||||
return self.head(h)
|
||||
|
||||
model = SyntheticTransformer().cuda()
|
||||
model = SyntheticTransformer()
|
||||
total_params = sum(p.numel() for p in model.parameters())
|
||||
|
||||
self.console.print(f" Synthetic params: {total_params / 1e6:.1f}M")
|
||||
|
||||
distributed_mode = "single_gpu"
|
||||
if gpu_count > 1:
|
||||
model = torch.nn.DataParallel(model).cuda()
|
||||
distributed_mode = "data_parallel"
|
||||
else:
|
||||
model = model.cuda()
|
||||
model.train()
|
||||
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
|
||||
|
||||
@ -183,14 +406,17 @@ class TrainingSim:
|
||||
step_times = []
|
||||
mem_usage = []
|
||||
|
||||
warmup_steps = int(self.train_cfg.get("warmup_steps", 5))
|
||||
total_steps = num_steps + warmup_steps
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(), TextColumn("{task.completed}/{task.total}"),
|
||||
TimeElapsedColumn(), console=self.console,
|
||||
) as progress:
|
||||
task = progress.add_task("Synthetic training...", total=num_steps)
|
||||
task = progress.add_task("Synthetic training...", total=total_steps)
|
||||
|
||||
for step in range(num_steps):
|
||||
for step in range(total_steps):
|
||||
torch.cuda.synchronize()
|
||||
t0 = time.perf_counter()
|
||||
|
||||
@ -206,14 +432,22 @@ class TrainingSim:
|
||||
elapsed = time.perf_counter() - t0
|
||||
step_times.append(elapsed)
|
||||
|
||||
mem_used = torch.cuda.max_memory_allocated() / 1024**3
|
||||
mem_used = max(torch.cuda.max_memory_allocated(i) for i in range(gpu_count)) / 1024**3
|
||||
mem_usage.append(mem_used)
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
for i in range(gpu_count):
|
||||
torch.cuda.reset_peak_memory_stats(i)
|
||||
|
||||
progress.advance(task)
|
||||
|
||||
avg_step_time = sum(step_times) / len(step_times)
|
||||
measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
|
||||
avg_step_time = sum(measured_steps) / len(measured_steps)
|
||||
throughput = batch_size * seq_length / avg_step_time
|
||||
jitter = self._jitter_pct(measured_steps)
|
||||
peak_mem = round(max(mem_usage) if mem_usage else 0, 2)
|
||||
final_loss = float(loss.item())
|
||||
passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss)
|
||||
if self.train_cfg.get("require_distributed", True):
|
||||
passed = False
|
||||
|
||||
return {
|
||||
"model": "synthetic_transformer",
|
||||
@ -225,14 +459,36 @@ class TrainingSim:
|
||||
"batch_size": batch_size,
|
||||
"seq_length": seq_length,
|
||||
"num_steps": num_steps,
|
||||
"warmup_steps": warmup_steps,
|
||||
"total_steps": total_steps,
|
||||
"avg_step_time_ms": round(avg_step_time * 1000, 1),
|
||||
"throughput_tokens_per_sec": round(throughput, 0),
|
||||
"throughput_samples_per_sec": round(batch_size / avg_step_time, 2),
|
||||
"peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2),
|
||||
"final_loss": round(loss.item(), 4),
|
||||
"peak_memory_gb": peak_mem,
|
||||
"final_loss": round(final_loss, 4),
|
||||
"step_jitter_pct": round(jitter, 2),
|
||||
"distributed_mode": distributed_mode,
|
||||
"loss_finite": math.isfinite(final_loss),
|
||||
"passed": passed,
|
||||
"acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "",
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _jitter_pct(step_times: list[float]) -> float:
|
||||
if not step_times:
|
||||
return 0.0
|
||||
mean = sum(step_times) / len(step_times)
|
||||
return max(abs(v - mean) / mean * 100 for v in step_times) if mean else 0.0
|
||||
|
||||
def _acceptance_pass(self, throughput: float, jitter: float, peak_mem: float, loss_value: float) -> bool:
|
||||
return (
|
||||
throughput >= float(self.train_cfg.get("min_tokens_per_sec", 45000))
|
||||
and jitter <= float(self.train_cfg.get("max_step_jitter_pct", 3))
|
||||
and peak_mem <= float(self.train_cfg.get("max_peak_memory_gb", 70))
|
||||
and math.isfinite(loss_value)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def print_results(results: dict, console: Console = None):
|
||||
c = console or Console()
|
||||
@ -254,11 +510,15 @@ class TrainingSim:
|
||||
("Batch Size", str(results.get("batch_size", "N/A"))),
|
||||
("Seq Length", str(results.get("seq_length", "N/A"))),
|
||||
("Steps", str(results.get("num_steps", "N/A"))),
|
||||
("Warmup Steps", str(results.get("warmup_steps", "N/A"))),
|
||||
("Avg Step Time", f"{results.get('avg_step_time_ms', 'N/A')} ms"),
|
||||
("Throughput", f"{results.get('throughput_tokens_per_sec', 'N/A')} tokens/s"),
|
||||
("Samples/sec", f"{results.get('throughput_samples_per_sec', 'N/A')}"),
|
||||
("Peak Memory", f"{results.get('peak_memory_gb', 'N/A')} GB"),
|
||||
("Final Loss", str(results.get("final_loss", "N/A"))),
|
||||
("Step Jitter", f"{results.get('step_jitter_pct', 'N/A')}%"),
|
||||
("Distributed Mode", results.get("distributed_mode", "N/A")),
|
||||
("Verdict", "PASS" if results.get("passed") else "FAIL"),
|
||||
]
|
||||
for label, val in metrics:
|
||||
table.add_row(label, str(val))
|
||||
|
||||
291
scripts/cublaslt_fp8_gemm_bench.cu
Normal file
291
scripts/cublaslt_fp8_gemm_bench.cu
Normal file
@ -0,0 +1,291 @@
|
||||
#include <cublasLt.h>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp8.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <numeric>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Evaluates a CUDA runtime call exactly once. Any result other than
// cudaSuccess is reported to stderr with file, line and the runtime's error
// string, and the process exits with status 1.
#define CHECK_CUDA(call)                                                      \
  do {                                                                        \
    const cudaError_t cuda_rc_ = (call);                                      \
    if (cuda_rc_ == cudaSuccess) break;                                       \
    std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,        \
                 cudaGetErrorString(cuda_rc_));                               \
    std::exit(1);                                                             \
  } while (0)
|
||||
|
||||
// Evaluates a cuBLAS/cuBLASLt call exactly once. Any result other than
// CUBLAS_STATUS_SUCCESS is reported to stderr with file, line and the numeric
// status code, and the process exits with status 1.
#define CHECK_CUBLAS(call)                                                    \
  do {                                                                        \
    const cublasStatus_t blas_rc_ = (call);                                   \
    if (blas_rc_ == CUBLAS_STATUS_SUCCESS) break;                             \
    std::fprintf(stderr, "cuBLASLt error %s:%d: status=%d\n", __FILE__,       \
                 __LINE__, static_cast<int>(blas_rc_));                       \
    std::exit(1);                                                             \
  } while (0)
|
||||
|
||||
// Fills ptr[0..count) with `value` converted to FP8 E4M3.
// Each thread starts at its global index and advances by the total number of
// launched threads, so any grid size covers the whole buffer.
__global__ void fill_fp8(__nv_fp8_e4m3 *ptr, size_t count, float value) {
  // Widen before multiplying: blockIdx/blockDim/gridDim are 32-bit unsigned,
  // so the product would otherwise wrap for very large launches.
  const size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  // The converted value is loop-invariant; build it once per thread.
  const __nv_fp8_e4m3 packed(value);
  for (size_t i = tid; i < count; i += stride) {
    ptr[i] = packed;
  }
}
|
||||
|
||||
// Command-line options of the benchmark together with their defaults.
struct Args {
  int matrix_size{8192};      // --matrix-size: used as M, N and K of the GEMM
  int warmup{20};             // --warmup
  int iterations{200};        // --iterations
  int first_gpu{0};           // --first-gpu
  int gpu_count{-1};          // --gpu-count; NOTE(review): -1 presumably means "all GPUs" - confirm
  size_t workspace_mb{256};   // --workspace-mb: cuBLASLt workspace size in MiB
  int fast_accum{1};          // --fast-accum: non-zero enables CUBLASLT_MATMUL_DESC_FAST_ACCUM
};
|
||||
|
||||
// Parses the benchmark's command line into an Args value.
//
// Every option takes exactly one value. Unknown options, missing values and
// values that are not well-formed integers terminate the process with exit
// status 2 after printing a message to stderr; --help / -h prints the usage
// line and exits with status 0.
static Args parse_args(int argc, char **argv) {
  Args args;
  for (int i = 1; i < argc; ++i) {
    const char *flag = argv[i];

    // Reports a bad value for the current flag and exits.
    auto reject = [&](const char *text) {
      std::fprintf(stderr, "Invalid value for %s: %s\n", flag, text);
      std::exit(2);
    };
    // Consumes and returns the token that follows the current flag.
    auto need = [&]() -> const char * {
      if (i + 1 >= argc) {
        std::fprintf(stderr, "Missing value for %s\n", flag);
        std::exit(2);
      }
      return argv[++i];
    };
    // Strict decimal parse: the whole token must be consumed. atoi/atoll
    // would silently turn "abc" into 0 and "8x" into 8.
    auto need_ll = [&]() -> long long {
      const char *text = need();
      char *end = nullptr;
      const long long value = std::strtoll(text, &end, 10);
      if (end == text || *end != '\0') {
        reject(text);
      }
      return value;
    };
    // Same as need_ll, but additionally requires the value to fit in int.
    auto need_int = [&]() -> int {
      const long long wide = need_ll();
      const int narrow = static_cast<int>(wide);
      if (static_cast<long long>(narrow) != wide) {
        reject(argv[i]);
      }
      return narrow;
    };

    if (!std::strcmp(flag, "--matrix-size")) {
      args.matrix_size = need_int();
      if (args.matrix_size < 1) {
        reject(argv[i]);
      }
    } else if (!std::strcmp(flag, "--warmup")) {
      args.warmup = need_int();
    } else if (!std::strcmp(flag, "--iterations")) {
      args.iterations = need_int();
    } else if (!std::strcmp(flag, "--first-gpu")) {
      args.first_gpu = need_int();
    } else if (!std::strcmp(flag, "--gpu-count")) {
      args.gpu_count = need_int();
    } else if (!std::strcmp(flag, "--workspace-mb")) {
      const long long mb = need_ll();
      // A negative value would wrap to a huge size_t, and anything above
      // SIZE_MAX >> 20 would overflow when converted from MiB to bytes.
      const unsigned long long max_mb =
          static_cast<unsigned long long>(static_cast<size_t>(-1) >> 20);
      if (mb < 0 || static_cast<unsigned long long>(mb) > max_mb) {
        reject(argv[i]);
      }
      args.workspace_mb = static_cast<size_t>(mb);
    } else if (!std::strcmp(flag, "--fast-accum")) {
      args.fast_accum = need_int();
    } else if (!std::strcmp(flag, "--help") || !std::strcmp(flag, "-h")) {
      std::puts("Usage: cublaslt_fp8_gemm_bench [--matrix-size N] [--warmup N] "
                "[--iterations N] [--first-gpu N] [--gpu-count N] "
                "[--workspace-mb N] [--fast-accum 0|1]");
      std::exit(0);
    } else {
      std::fprintf(stderr, "Unknown argument: %s\n", flag);
      std::exit(2);
    }
  }
  return args;
}
|
||||
|
||||
static double run_one_gpu(int gpu, const Args &args) {
|
||||
CHECK_CUDA(cudaSetDevice(gpu));
|
||||
|
||||
const int64_t m = args.matrix_size;
|
||||
const int64_t n = args.matrix_size;
|
||||
const int64_t k = args.matrix_size;
|
||||
const size_t a_elems = static_cast<size_t>(m) * k;
|
||||
const size_t b_elems = static_cast<size_t>(k) * n;
|
||||
const size_t d_elems = static_cast<size_t>(m) * n;
|
||||
|
||||
__nv_fp8_e4m3 *d_a = nullptr;
|
||||
__nv_fp8_e4m3 *d_b = nullptr;
|
||||
__nv_bfloat16 *d_d = nullptr;
|
||||
void *workspace = nullptr;
|
||||
float *d_scale_a = nullptr;
|
||||
float *d_scale_b = nullptr;
|
||||
const float scale = 1.0f;
|
||||
const size_t workspace_bytes = args.workspace_mb * 1024ULL * 1024ULL;
|
||||
|
||||
CHECK_CUDA(cudaMalloc(&d_a, a_elems * sizeof(__nv_fp8_e4m3)));
|
||||
CHECK_CUDA(cudaMalloc(&d_b, b_elems * sizeof(__nv_fp8_e4m3)));
|
||||
CHECK_CUDA(cudaMalloc(&d_d, d_elems * sizeof(__nv_bfloat16)));
|
||||
CHECK_CUDA(cudaMalloc(&workspace, workspace_bytes));
|
||||
CHECK_CUDA(cudaMalloc(&d_scale_a, sizeof(float)));
|
||||
CHECK_CUDA(cudaMalloc(&d_scale_b, sizeof(float)));
|
||||
CHECK_CUDA(cudaMemcpy(d_scale_a, &scale, sizeof(scale), cudaMemcpyHostToDevice));
|
||||
CHECK_CUDA(cudaMemcpy(d_scale_b, &scale, sizeof(scale), cudaMemcpyHostToDevice));
|
||||
|
||||
const int threads = 256;
|
||||
const int blocks = 4096;
|
||||
fill_fp8<<<blocks, threads>>>(d_a, a_elems, 0.01f);
|
||||
fill_fp8<<<blocks, threads>>>(d_b, b_elems, 0.01f);
|
||||
CHECK_CUDA(cudaMemset(d_d, 0, d_elems * sizeof(__nv_bfloat16)));
|
||||
CHECK_CUDA(cudaGetLastError());
|
||||
CHECK_CUDA(cudaDeviceSynchronize());
|
||||
|
||||
cublasLtHandle_t lt;
|
||||
cublasLtMatmulDesc_t op_desc;
|
||||
cublasLtMatrixLayout_t a_desc, b_desc, d_desc;
|
||||
cublasLtMatmulPreference_t preference;
|
||||
CHECK_CUBLAS(cublasLtCreate(&lt));
|
||||
CHECK_CUBLAS(cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F));
|
||||
|
||||
// cuBLASLt FP8 kernels require TN format: A is transposed, B is non-transposed.
|
||||
// With square GEMMs this keeps the benchmark FLOP count identical to the PDF
|
||||
// acceptance shape while satisfying the library's FP8 kernel constraints.
|
||||
cublasOperation_t transa = CUBLAS_OP_T;
|
||||
cublasOperation_t transb = CUBLAS_OP_N;
|
||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
||||
op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
|
||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
||||
op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
|
||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
||||
op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a,
|
||||
sizeof(d_scale_a)));
|
||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
||||
op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b,
|
||||
sizeof(d_scale_b)));
|
||||
int8_t fast_accum = args.fast_accum ? 1 : 0;
|
||||
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
|
||||
op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fast_accum,
|
||||
sizeof(fast_accum)));
|
||||
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8F_E4M3, k, m, k));
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8F_E4M3, k, n, k));
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16BF, m, n, m));
|
||||
|
||||
CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference));
|
||||
CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute(
|
||||
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_bytes,
|
||||
sizeof(workspace_bytes)));
|
||||
|
||||
cublasLtMatmulHeuristicResult_t heuristic;
|
||||
int returned = 0;
|
||||
CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic(
|
||||
lt, op_desc, a_desc, b_desc, d_desc, d_desc, preference, 1, &heuristic,
|
||||
&returned));
|
||||
if (returned == 0) {
|
||||
std::fprintf(stderr, "No cuBLASLt heuristic returned for GPU %d\n", gpu);
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
auto get_algo_attr_i32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
||||
int32_t value = -1;
|
||||
size_t written = 0;
|
||||
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
||||
&heuristic.algo, attr, &value, sizeof(value), &written));
|
||||
return static_cast<int>(value);
|
||||
};
|
||||
auto get_algo_attr_u32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
||||
uint32_t value = 0;
|
||||
size_t written = 0;
|
||||
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
||||
&heuristic.algo, attr, &value, sizeof(value), &written));
|
||||
return static_cast<int>(value);
|
||||
};
|
||||
auto get_algo_attr_u16 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
|
||||
uint16_t value = 0;
|
||||
size_t written = 0;
|
||||
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
|
||||
&heuristic.algo, attr, &value, sizeof(value), &written));
|
||||
return static_cast<int>(value);
|
||||
};
|
||||
const int algo_id = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_ID);
|
||||
const int tile_id = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_TILE_ID);
|
||||
const int splitk = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_SPLITK_NUM);
|
||||
const int stages = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_STAGES_ID);
|
||||
const int inner_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID);
|
||||
const int cluster_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID);
|
||||
|
||||
const float alpha = 1.0f;
|
||||
const float beta = 0.0f;
|
||||
auto matmul = [&]() {
|
||||
CHECK_CUBLAS(cublasLtMatmul(lt, op_desc, &alpha, d_a, a_desc, d_b, b_desc,
|
||||
&beta, d_d, d_desc, d_d, d_desc,
|
||||
&heuristic.algo, workspace, workspace_bytes, 0));
|
||||
};
|
||||
|
||||
for (int i = 0; i < args.warmup; ++i) {
|
||||
matmul();
|
||||
}
|
||||
CHECK_CUDA(cudaDeviceSynchronize());
|
||||
|
||||
cudaEvent_t start, stop;
|
||||
CHECK_CUDA(cudaEventCreate(&start));
|
||||
CHECK_CUDA(cudaEventCreate(&stop));
|
||||
CHECK_CUDA(cudaEventRecord(start));
|
||||
for (int i = 0; i < args.iterations; ++i) {
|
||||
matmul();
|
||||
}
|
||||
CHECK_CUDA(cudaEventRecord(stop));
|
||||
CHECK_CUDA(cudaEventSynchronize(stop));
|
||||
float elapsed_ms = 0.0f;
|
||||
CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop));
|
||||
const double flops =
|
||||
2.0 * static_cast<double>(m) * static_cast<double>(n) *
|
||||
static_cast<double>(k) * static_cast<double>(args.iterations);
|
||||
const double tflops = flops / (static_cast<double>(elapsed_ms) / 1000.0) / 1e12;
|
||||
std::printf(
|
||||
" {\"index\": %d, \"fp8_tflops\": %.1f, \"algo_id\": %d, "
|
||||
"\"tile_id\": %d, \"splitk\": %d, \"stages_id\": %d, "
|
||||
"\"inner_shape_id\": %d, \"cluster_shape_id\": %d}%s\n",
|
||||
gpu, tflops, algo_id, tile_id, splitk, stages, inner_shape, cluster_shape,
|
||||
(gpu + 1 == args.first_gpu + args.gpu_count) ? "" : ",");
|
||||
std::fflush(stdout);
|
||||
|
||||
CHECK_CUDA(cudaEventDestroy(start));
|
||||
CHECK_CUDA(cudaEventDestroy(stop));
|
||||
CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference));
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(a_desc));
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(b_desc));
|
||||
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(d_desc));
|
||||
CHECK_CUBLAS(cublasLtMatmulDescDestroy(op_desc));
|
||||
CHECK_CUBLAS(cublasLtDestroy(lt));
|
||||
CHECK_CUDA(cudaFree(d_a));
|
||||
CHECK_CUDA(cudaFree(d_b));
|
||||
CHECK_CUDA(cudaFree(d_d));
|
||||
CHECK_CUDA(cudaFree(workspace));
|
||||
CHECK_CUDA(cudaFree(d_scale_a));
|
||||
CHECK_CUDA(cudaFree(d_scale_b));
|
||||
CHECK_CUDA(cudaDeviceSynchronize());
|
||||
|
||||
return tflops;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
Args args = parse_args(argc, argv);
|
||||
int device_count = 0;
|
||||
CHECK_CUDA(cudaGetDeviceCount(&device_count));
|
||||
if (args.gpu_count < 0) {
|
||||
args.gpu_count = device_count - args.first_gpu;
|
||||
}
|
||||
if (args.first_gpu < 0 || args.first_gpu + args.gpu_count > device_count) {
|
||||
std::fprintf(stderr, "Invalid GPU range first=%d count=%d device_count=%d\n",
|
||||
args.first_gpu, args.gpu_count, device_count);
|
||||
return 2;
|
||||
}
|
||||
|
||||
std::vector<double> values;
|
||||
std::printf("{\n");
|
||||
std::printf(" \"source\": \"cuBLASLt\",\n");
|
||||
std::printf(" \"dtype\": \"fp8_e4m3_inputs_bf16_output_fp32_accum\",\n");
|
||||
std::printf(" \"matrix_size\": %d,\n", args.matrix_size);
|
||||
std::printf(" \"warmup\": %d,\n", args.warmup);
|
||||
std::printf(" \"iterations\": %d,\n", args.iterations);
|
||||
std::printf(" \"fast_accum\": %d,\n", args.fast_accum ? 1 : 0);
|
||||
std::printf(" \"per_gpu\": [\n");
|
||||
for (int i = 0; i < args.gpu_count; ++i) {
|
||||
int gpu = args.first_gpu + i;
|
||||
double tflops = run_one_gpu(gpu, args);
|
||||
values.push_back(tflops);
|
||||
}
|
||||
double mean = std::accumulate(values.begin(), values.end(), 0.0) / values.size();
|
||||
auto minmax = std::minmax_element(values.begin(), values.end());
|
||||
double spread = ((*minmax.second - *minmax.first) / mean) * 100.0;
|
||||
std::printf(" ],\n");
|
||||
std::printf(" \"mean_tflops\": %.1f,\n", mean);
|
||||
std::printf(" \"min_tflops\": %.1f,\n", *minmax.first);
|
||||
std::printf(" \"max_tflops\": %.1f,\n", *minmax.second);
|
||||
std::printf(" \"spread_pct\": %.2f\n", spread);
|
||||
std::printf("}\n");
|
||||
return mean >= 1400.0 ? 0 : 1;
|
||||
}
|
||||
425
scripts/multinode_nccl_deep_diagnose.sh
Executable file
425
scripts/multinode_nccl_deep_diagnose.sh
Executable file
@ -0,0 +1,425 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Deep-diagnose multi-node NCCL behavior from the coordinator node.
# Every setting below keeps a non-empty value already present in the
# environment; otherwise it falls back to the default shown, which matches the
# current 2-node H100 cross-leaf investigation.

# First positional argument selects which diagnostic to run.
MODE="${1:-all}"

# Tooling locations and cluster layout.
: "${MPI_BIN:=/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun}"
: "${NCCL_TESTS_DIR:=/data/nccl-tests-latest/build}"
: "${HOSTS:=172.72.8.12:8,172.72.8.16:8}"
: "${PEER_HOST:=172.72.8.16}"
: "${SSH_USER:=root}"
# The same HCA list twice: space-separated for shell loops, CSV for NCCL_IB_HCA.
: "${HCAS:=mlx5_0 mlx5_1 mlx5_6 mlx5_7}"
: "${HCA_CSV:=mlx5_0,mlx5_1,mlx5_6,mlx5_7}"
: "${OUT_DIR:=/tmp/nccl_deep_diagnose_$(date +%Y%m%d_%H%M%S)}"

# Message-size range and iteration counts handed to the nccl-tests binaries.
: "${BEGIN_SIZE:=16G}"
: "${END_SIZE:=16G}"
: "${WARMUP_ITERS:=10}"
: "${ITERS:=10}"
: "${GRAPH_WARMUP_ITERS:=1}"
: "${GRAPH_ITERS:=1}"
: "${SWEEP_WARMUP_ITERS:=3}"
: "${SWEEP_ITERS:=5}"

: "${NCCL_LD_LIBRARY_PATH:=/usr/mpi/gcc/openmpi-4.1.9a1/lib:/tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu:/usr/local/cuda-12.4/targets/x86_64-linux/lib}"
# Remember the caller's NCCL_DEBUG so set_common_env can restore it per case.
DEFAULT_NCCL_DEBUG="${NCCL_DEBUG:-WARN}"

# sysfs counter names sampled before and after each counter case.
: "${COUNTERS:=port_xmit_data port_rcv_data port_xmit_packets port_rcv_packets port_xmit_wait port_xmit_discards port_rcv_errors port_rcv_remote_physical_errors port_rcv_switch_relay_errors port_xmit_constraint_errors port_rcv_constraint_errors symbol_error link_error_recovery link_downed local_link_integrity_errors excessive_buffer_overrun_errors VL15_dropped}"
: "${HW_COUNTERS:=roce_adp_retrans roce_adp_retrans_to roce_slow_restart roce_slow_restart_cnps roce_slow_restart_trans packet_seq_err out_of_sequence out_of_buffer duplicate_request implied_nak_seq_err local_ack_timeout_err req_transport_retries_exceeded rnr_nak_retry_err rx_write_requests rx_read_requests}"

mkdir -p "$OUT_DIR"
|
||||
|
||||
# Launch geometry and control-plane interface for mpirun. They used to be
# hard-coded (8 ranks per node, 16 ranks total, bond0), which made the HOSTS
# override unusable for any other layout; the defaults keep the old behavior.
RANKS_PER_NODE="${RANKS_PER_NODE:-8}"
TOTAL_RANKS="${TOTAL_RANKS:-16}"
MPI_IFNAME="${MPI_IFNAME:-bond0}"

# Fixed mpirun prefix shared by every NCCL test launch.
mpi_base=(
  "$MPI_BIN"
  --allow-run-as-root
  --mca btl_openib_warn_no_device_params_found 0
  --mca btl_tcp_if_include "$MPI_IFNAME"
  --mca oob_tcp_if_include "$MPI_IFNAME"
  --mca plm_rsh_args "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes -o ConnectTimeout=10"
  -H "$HOSTS"
  --map-by "ppr:${RANKS_PER_NODE}:node"
  -np "$TOTAL_RANKS"
)

# Environment variables forwarded to the ranks with `mpirun -x`; mpi_xargs
# only forwards the ones that are currently set.
base_exports=(
  LD_LIBRARY_PATH
  NCCL_IB_GID_INDEX NCCL_IB_SL NCCL_IB_TC NCCL_SOCKET_IFNAME
  NCCL_DEBUG NCCL_DEBUG_SUBSYS NCCL_IB_TIMEOUT NCCL_IB_HCA
  NCCL_NET_PLUGIN NCCL_NVLS_ENABLE NCCL_NET_GDR_LEVEL NCCL_NET_GDR_READ
  NCCL_DMABUF_ENABLE NCCL_PXN_DISABLE NCCL_IB_QPS_PER_CONNECTION
  NCCL_IB_SPLIT_DATA_ON_QPS NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS
  NCCL_BUFFSIZE NCCL_P2P_NET_CHUNKSIZE NCCL_NCHANNELS_PER_NET_PEER
  NCCL_IB_AR_THRESHOLD
)
|
||||
|
||||
# Reset the NCCL environment to the baseline shared by all diagnostic cases.
# The per-case tuning knobs are always cleared. The baseline variables that
# use a ":=" default keep any non-empty value already present in the
# environment, while LD_LIBRARY_PATH, NCCL_DEBUG and NCCL_IB_HCA are always
# overwritten from the script-level settings.
set_common_env() {
  local knob
  for knob in \
    NCCL_DEBUG_SUBSYS NCCL_PXN_DISABLE NCCL_IB_QPS_PER_CONNECTION \
    NCCL_IB_SPLIT_DATA_ON_QPS NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS \
    NCCL_BUFFSIZE NCCL_P2P_NET_CHUNKSIZE NCCL_NCHANNELS_PER_NET_PEER \
    NCCL_IB_AR_THRESHOLD; do
    unset "$knob"
  done

  # Always taken from the script configuration.
  LD_LIBRARY_PATH="$NCCL_LD_LIBRARY_PATH"
  NCCL_DEBUG="$DEFAULT_NCCL_DEBUG"
  NCCL_IB_HCA="$HCA_CSV"

  # Defaults that only apply when the variable is unset or empty.
  : "${NCCL_IB_GID_INDEX:=3}"
  : "${NCCL_IB_SL:=5}"
  : "${NCCL_IB_TC:=136}"
  : "${NCCL_SOCKET_IFNAME:=bond0}"
  : "${NCCL_IB_TIMEOUT:=22}"
  : "${NCCL_NET_PLUGIN:=none}"
  : "${NCCL_NVLS_ENABLE:=1}"
  : "${NCCL_NET_GDR_LEVEL:=5}"
  : "${NCCL_NET_GDR_READ:=1}"
  : "${NCCL_DMABUF_ENABLE:=0}"

  export LD_LIBRARY_PATH NCCL_IB_GID_INDEX NCCL_IB_SL NCCL_IB_TC \
    NCCL_SOCKET_IFNAME NCCL_DEBUG NCCL_IB_TIMEOUT NCCL_IB_HCA \
    NCCL_NET_PLUGIN NCCL_NVLS_ENABLE NCCL_NET_GDR_LEVEL NCCL_NET_GDR_READ \
    NCCL_DMABUF_ENABLE
}
|
||||
|
||||
# Print the `-x NAME` mpirun arguments, one token per line, for every variable
# in base_exports that is currently set (an empty value still counts as set).
mpi_xargs() {
  for name in "${base_exports[@]}"; do
    if [[ -z "${!name+x}" ]]; then
      continue
    fi
    printf '%s\n' -x "$name"
  done
}
|
||||
|
||||
# Launch one nccl-tests binary through mpirun and report its average bus
# bandwidth.
#   $1 op      label prefixed to the reported bandwidth line
#   $2 bin     path of the nccl-tests executable
#   $3 log     file receiving the combined stdout/stderr of the run
#   $4 warmup  warm-up iteration count (-w)
#   $5 iters   measured iteration count (-n)
run_nccl() {
  local op="$1" bin="$2" log="$3" warmup="$4" iters="$5"
  local nccl_args=(
    -b "$BEGIN_SIZE" -e "$END_SIZE"
    -g 1 -f 2
    -w "$warmup" -n "$iters"
  )

  # Forward every NCCL variable that is set in the current environment.
  mapfile -t xargs < <(mpi_xargs)
  "${mpi_base[@]}" "${xargs[@]}" "$bin" "${nccl_args[@]}" >"$log" 2>&1

  awk -v op="$op" '$0 ~ /Avg bus bandwidth/ { print op, $0 }' "$log"
}
|
||||
|
||||
# Dump the local port-1 counters of every HCA in $HCAS into a snapshot file.
#   $1 host_label  value written in the first column of each record
#   $2 out         snapshot file (truncated first)
# Each record is "<host> <hca> <counters|hw_counters> <name> <value>".
# Counter files that are not readable are skipped.
read_one_snapshot() {
  local host_label="$1"
  local out="$2"
  local kind names f

  : >"$out"
  for hca in $HCAS; do
    # Plain port counters first, then the hw_counters group.
    for kind in counters hw_counters; do
      if [[ "$kind" == counters ]]; then
        names="$COUNTERS"
      else
        names="$HW_COUNTERS"
      fi
      for c in $names; do
        f="/sys/class/infiniband/$hca/ports/1/$kind/$c"
        [[ -r "$f" ]] || continue
        printf '%s %s %s %s %s\n' \
          "$host_label" "$hca" "$kind" "$c" "$(cat "$f" 2>/dev/null || echo 0)" >>"$out"
      done
    done
  done
}
|
||||
|
||||
# Collect the same counter snapshot as read_one_snapshot from $PEER_HOST over
# ssh. The counter lists are passed in the remote command line and the
# collection script is fed to `bash -s` on stdin; records are labelled with
# the remote shell's $HOSTNAME.
#   $1 out  local file receiving the remote records
read_remote_snapshot() {
  local out="$1"
  local ssh_opts=(
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o BatchMode=yes
    -o ConnectTimeout=5
  )
  local remote_cmd="HCAS='$HCAS' COUNTERS='$COUNTERS' HW_COUNTERS='$HW_COUNTERS' bash -s"

  ssh "${ssh_opts[@]}" "${SSH_USER}@${PEER_HOST}" "$remote_cmd" >"$out" <<'EOS'
for hca in $HCAS; do
  for kind in counters hw_counters; do
    case "$kind" in
      counters) names="$COUNTERS" ;;
      *) names="$HW_COUNTERS" ;;
    esac
    for c in $names; do
      f="/sys/class/infiniband/$hca/ports/1/$kind/$c"
      [ -r "$f" ] || continue
      printf '%s %s %s %s %s\n' "$HOSTNAME" "$hca" "$kind" "$c" "$(cat "$f" 2>/dev/null || echo 0)"
    done
  done
done
EOS
}
|
||||
|
||||
# Diff two pairs of counter snapshots and write a delta report.
#   $1/$2  "before" snapshots (local, remote)
#   $3/$4  "after" snapshots (local, remote)
#   $5     report file
# The report lists every non-zero delta, then repeats the subset whose counter
# name is in the error/congestion list (or prints "none").
summarize_counter_delta() {
  local before_a="$1" before_b="$2" after_a="$3" after_b="$4"
  local out="$5"
  python3 - "$before_a" "$before_b" "$after_a" "$after_b" >"$out" <<'PY'
import sys
from pathlib import Path

# Counters whose delta is additionally printed converted to GiB.
DATA_COUNTERS = ("port_xmit_data", "port_rcv_data")

# Counters reported again in the error/congestion section.
interesting = frozenset((
    "port_xmit_wait", "port_xmit_discards", "port_rcv_errors",
    "port_rcv_remote_physical_errors", "port_rcv_switch_relay_errors",
    "port_xmit_constraint_errors", "port_rcv_constraint_errors",
    "symbol_error", "link_error_recovery", "link_downed",
    "local_link_integrity_errors", "excessive_buffer_overrun_errors",
    "VL15_dropped", "roce_adp_retrans", "roce_adp_retrans_to",
    "roce_slow_restart", "roce_slow_restart_cnps", "roce_slow_restart_trans",
    "packet_seq_err", "out_of_sequence", "out_of_buffer",
    "duplicate_request", "implied_nak_seq_err", "local_ack_timeout_err",
    "req_transport_retries_exceeded", "rnr_nak_retry_err",
))


def load(path):
    """Parse one snapshot into {(host, hca, kind, counter): int value}."""
    data = {}
    for line in Path(path).read_text().splitlines():
        fields = line.split()
        if len(fields) != 5:
            continue  # not a snapshot record
        *key, raw = fields
        try:
            data[tuple(key)] = int(raw)
        except ValueError:
            continue  # non-numeric value, ignore the record
    return data


before = {**load(sys.argv[1]), **load(sys.argv[2])}
after = {**load(sys.argv[3]), **load(sys.argv[4])}

# A key missing on one side counts as 0 there; only non-zero deltas are kept.
deltas = []
for key in sorted(before.keys() | after.keys()):
    delta = after.get(key, 0) - before.get(key, 0)
    if delta:
        deltas.append((key, delta))

print("NONZERO_DELTAS")
for (host, hca, kind, counter), delta in deltas:
    if counter in DATA_COUNTERS:
        # The report labels these counters "words4B", hence the factor of 4.
        gib = delta * 4 / (1024 ** 3)
        print(f"{host} {hca} {kind} {counter} {delta} words4B {gib:.2f} GiB")
    else:
        print(f"{host} {hca} {kind} {counter} {delta}")

print("ERROR_OR_CONGESTION_DELTAS")
flagged = [(key, delta) for key, delta in deltas if key[3] in interesting]
for key, delta in flagged:
    print(*key, delta)
if not flagged:
    print("none")
PY
}
|
||||
|
||||
# Run one NCCL test bracketed by local and remote counter snapshots, then
# write the counter deltas next to the run log.
#   $1 op     case label (also names the output directory and log)
#   $2 bin    nccl-tests executable
#   $3 extra  optional space-separated VAR=value assignments exported on top
#             of the common environment
# Prints the case output directory.
run_counter_case() {
  local op="$1" bin="$2" extra="${3:-}"

  set_common_env
  [[ -z "$extra" ]] || eval "export $extra"

  local dir="$OUT_DIR/${op}_counter"
  local phase
  mkdir -p "$dir"

  for phase in before after; do
    # The workload runs between the two snapshot rounds.
    if [[ "$phase" == after ]]; then
      run_nccl "$op" "$bin" "$dir/${op}.log" "$WARMUP_ITERS" "$ITERS"
    fi
    read_one_snapshot "$(hostname)" "$dir/${phase}.local"
    read_remote_snapshot "$dir/${phase}.remote"
  done

  summarize_counter_delta \
    "$dir/before.local" "$dir/before.remote" \
    "$dir/after.local" "$dir/after.remote" \
    "$dir/counter_delta.txt"
  echo "$dir"
}
|
||||
|
||||
# Condense an NCCL_DEBUG=INFO log into a short text summary: last reported
# average bus bandwidth, NCCL versions, HCA selection, graph patterns, channel
# counts and transport usage counts.
#   $1 log  NCCL test log to scan
#   $2 out  summary file to write
summarize_graph_log() {
  local log="$1" out="$2"
  python3 - "$log" >"$out" <<'PY'
import re
import sys
from collections import Counter
from pathlib import Path

text = Path(sys.argv[1]).read_text(errors="ignore")


def tally(pattern):
    """Count each distinct match of pattern in the log."""
    return Counter(re.findall(pattern, text))


def show_ranked(title, pattern, limit=None):
    """Print a heading followed by "<count> <match>" lines, most common first."""
    print(title)
    for item, hits in tally(pattern).most_common(limit):
        print(f" {hits} {item}")


busbw_hits = re.findall(r"Avg bus bandwidth\s*:\s*([0-9.]+)", text)
print("avg_busbw", busbw_hits[-1] if busbw_hits else "NA")
print("nccl_version", sorted(tally(r"NCCL version ([^\s]+)")))
print("plugin_missing", len(re.findall(r"Could not find: none libnccl-net-none\.so", text)))
print("gdr_enabled_lines", len(re.findall(r"GPU Direct RDMA Enabled", text)))

show_ranked("using_hca", r"NET/IB : Using \[(.*?)\]; OOB", limit=4)
show_ranked(
    "pattern_counts",
    r"Pattern (\d+), crossNic (\d+), nChannels (\d+), bw ([0-9.]+)/([0-9.]+), type ([^,]+), sameChannels (\d+)",
)
show_ranked(
    "channel_summary",
    r"(\d+ coll channels, \d+ collnet channels, \d+ nvls channels, \d+ p2p channels, \d+ p2p channels per peer)",
)

print("p2p_chunks", tally(r"P2P Chunksize set to (\d+)"))
print("check_p2p", tally(r"Check P2P Type ([^\n]+)"))

transport_tokens = [f"NET/IB/{idx}/GDRDMA" for idx in range(4)] + ["P2P/CUMEM", "P2P/IPC", "SHM"]
for token in transport_tokens:
    print(token, text.count(token))

edge_lines = sum(
    1
    for line in text.splitlines()
    if "Channel " in line and ("via NET/IB" in line or "via P2P" in line)
)
print("channel_edge_lines", edge_lines)
PY
}
|
||||
|
||||
# Run one NCCL test with INFO-level debug output and summarize the resulting
# log with summarize_graph_log.
#   $1 op     case label (names the log and the summary file)
#   $2 bin    nccl-tests executable
#   $3 extra  optional space-separated VAR=value assignments exported last
# Prints the path of the summary file.
run_graph_case() {
  local op="$1" bin="$2" extra="${3:-}"

  set_common_env
  # Override the debug level chosen by set_common_env for this case.
  export NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=INIT,NET,GRAPH,TUNING,COLL
  [[ -z "$extra" ]] || eval "export $extra"

  local dir="$OUT_DIR/graph"
  mkdir -p "$dir"
  local log="$dir/${op}.log"
  local summary="$dir/${op}_summary.txt"

  run_nccl "$op" "$bin" "$log" "$GRAPH_WARMUP_ITERS" "$GRAPH_ITERS"
  summarize_graph_log "$log" "$summary"
  echo "$summary"
}
|
||||
|
||||
# Sweep alltoall_perf with NCCL_PXN_DISABLE=1 over a list of NCCL tuning
# variants and append each case's result to pxn_sweep/summary.txt.
# Each entry of `cases` is "<name>|<space-separated VAR=value assignments>".
# Prints the path of the summary file.
run_pxn_sweep() {
  local dir="$OUT_DIR/pxn_sweep"
  mkdir -p "$dir"
  local cases=(
    "baseline|"
    "nvls_off|NCCL_NVLS_ENABLE=0"
    "qps4_split1|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1"
    "qps8_split1|NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1"
    "qps4_split0|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0"
    "channels16|NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16"
    "buff8m|NCCL_BUFFSIZE=8388608"
    "p2pchunk4m|NCCL_P2P_NET_CHUNKSIZE=4194304"
    "netpeer8|NCCL_NCHANNELS_PER_NET_PEER=8"
    "ar0|NCCL_IB_AR_THRESHOLD=0"
  )
  : >"$dir/summary.txt"

  local item name extra log
  for item in "${cases[@]}"; do
    name="${item%%|*}"
    extra="${item#*|}"
    log="$dir/${name}.log"
    # Apply the per-case exports inside a subshell. set_common_env keeps
    # already-set baseline variables such as NCCL_NVLS_ENABLE, so exporting a
    # case override in the main shell (as "nvls_off" does) would silently carry
    # over into every later case and mislabel their results.
    (
      set_common_env
      export NCCL_PXN_DISABLE=1
      if [[ -n "$extra" ]]; then
        eval "export $extra"
      fi
      echo "===== CASE $name ====="
      echo "extra: ${extra:-none}"
      run_nccl "alltoall" "$NCCL_TESTS_DIR/alltoall_perf" "$log" "$SWEEP_WARMUP_ITERS" "$SWEEP_ITERS"
      awk '/Avg bus bandwidth/ {print}' "$log" | tail -1
    ) | tee -a "$dir/summary.txt"
  done
  echo "$dir/summary.txt"
}
|
||||
|
||||
# Check that mpirun, the two nccl-tests binaries and the port-1 state of every
# HCA in $HCAS are present, first on this node and then on $PEER_HOST via ssh.
# The report goes to stdout and to $OUT_DIR/preflight.txt; the report path is
# printed last.
run_preflight() {
  set_common_env
  local out="$OUT_DIR/preflight.txt"
  local ssh_opts=(
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o BatchMode=yes
    -o ConnectTimeout=5
  )
  # Values the remote script needs are passed in the remote command line.
  local remote_cmd="MPI_BIN='$MPI_BIN' NCCL_TESTS_DIR='$NCCL_TESTS_DIR' HCAS='$HCAS' bash -s"

  {
    printf '===== LOCAL =====\n'
    printf 'hostname: %s\n' "$(hostname)"
    printf 'mpirun: %s\n' "$MPI_BIN"
    if [[ ! -x "$MPI_BIN" ]]; then
      printf 'MISSING executable: %s\n' "$MPI_BIN"
    else
      # Only the first line of the version banner is kept.
      "$MPI_BIN" --version 2>&1 | sed -n '1p'
    fi

    local test_name bin
    for test_name in all_reduce_perf alltoall_perf; do
      bin="$NCCL_TESTS_DIR/$test_name"
      if [[ ! -x "$bin" ]]; then
        printf 'MISSING executable: %s\n' "$bin"
      else
        printf 'OK executable: %s\n' "$bin"
      fi
    done

    local port_dir state rate
    for hca in $HCAS; do
      port_dir="/sys/class/infiniband/$hca/ports/1"
      state="$port_dir/state"
      rate="$port_dir/rate"
      if [[ ! -r "$state" ]]; then
        printf 'MISSING HCA path: %s\n' "$hca"
      else
        printf 'OK HCA: %s state=%s rate=%s\n' \
          "$hca" "$(cat "$state")" "$(cat "$rate" 2>/dev/null || echo unknown)"
      fi
    done

    printf '===== REMOTE =====\n'
    ssh "${ssh_opts[@]}" "${SSH_USER}@${PEER_HOST}" "$remote_cmd" <<'EOS'
printf 'hostname: %s\n' "$(hostname)"
printf 'mpirun: %s\n' "$MPI_BIN"
if [ ! -x "$MPI_BIN" ]; then
  printf 'MISSING executable: %s\n' "$MPI_BIN"
else
  "$MPI_BIN" --version 2>&1 | sed -n '1p'
fi
for test_name in all_reduce_perf alltoall_perf; do
  bin="$NCCL_TESTS_DIR/$test_name"
  if [ ! -x "$bin" ]; then
    printf 'MISSING executable: %s\n' "$bin"
  else
    printf 'OK executable: %s\n' "$bin"
  fi
done
for hca in $HCAS; do
  state="/sys/class/infiniband/$hca/ports/1/state"
  rate="/sys/class/infiniband/$hca/ports/1/rate"
  if [ ! -r "$state" ]; then
    printf 'MISSING HCA path: %s\n' "$hca"
  else
    printf 'OK HCA: %s state=%s rate=%s\n' "$hca" "$(cat "$state")" "$(cat "$rate" 2>/dev/null || echo unknown)"
  fi
done
EOS
  } | tee "$out"
  echo "$out"
}
|
||||
|
||||
# Print the command-line synopsis, the current output directory and the names
# of the most commonly overridden environment variables.
usage() {
  printf '%s\n' \
    "Usage: $0 [preflight|all|allreduce-counter|alltoall-counter|graph|pxn-sweep]" \
    "" \
    "Outputs are written to: $OUT_DIR" \
    "" \
    "Common overrides:" \
    "HOSTS, PEER_HOST, HCAS, HCA_CSV, MPI_BIN, NCCL_TESTS_DIR," \
    "NCCL_LD_LIBRARY_PATH, BEGIN_SIZE, END_SIZE, WARMUP_ITERS, ITERS"
}
|
||||
|
||||
# Dispatch on the requested mode. "all" runs every diagnostic in sequence;
# an unknown mode prints the usage text and exits with status 2.
allreduce_bin="$NCCL_TESTS_DIR/all_reduce_perf"
alltoall_bin="$NCCL_TESTS_DIR/alltoall_perf"
pxn_off="NCCL_PXN_DISABLE=1"

case "$MODE" in
  preflight)
    run_preflight
    ;;
  all)
    run_preflight
    run_counter_case allreduce "$allreduce_bin" ""
    run_counter_case alltoall_pxn "$alltoall_bin" "$pxn_off"
    run_graph_case allreduce "$allreduce_bin" ""
    run_graph_case alltoall_pxn "$alltoall_bin" "$pxn_off"
    run_pxn_sweep
    ;;
  allreduce-counter)
    run_counter_case allreduce "$allreduce_bin" ""
    ;;
  alltoall-counter)
    run_counter_case alltoall_pxn "$alltoall_bin" "$pxn_off"
    ;;
  graph)
    run_graph_case allreduce "$allreduce_bin" ""
    run_graph_case alltoall_pxn "$alltoall_bin" "$pxn_off"
    ;;
  pxn-sweep)
    run_pxn_sweep
    ;;
  -h | --help | help)
    usage
    ;;
  *)
    usage
    exit 2
    ;;
esac

printf 'OUT_DIR=%s\n' "$OUT_DIR"
|
||||
169
scripts/nccl_environment_snapshot.sh
Normal file
169
scripts/nccl_environment_snapshot.sh
Normal file
@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Collect a lightweight NCCL/RDMA environment snapshot on one node.
# This script does not run NCCL workloads and is safe to use before deeper tests.

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"

# Report path precedence: first argument, then $OUT_FILE, then a /tmp default.
OUT_FILE="${1:-${OUT_FILE:-}}"
: "${OUT_FILE:=/tmp/nccl_environment_snapshot_${HOST}_${TS}.md}"

# Reference bus-bandwidth targets (GB/s) quoted in the report.
: "${PDF_ALLREDUCE_BUSBW:=491.84}"
: "${PDF_ALLTOALL_BUSBW:=76.54}"
# Space-separated directory roots searched for NCCL net plugin / SHARP libraries.
: "${PLUGIN_SEARCH_ROOTS:=/usr /opt /tmp /root}"

mkdir -p "$(dirname "$OUT_FILE")"
# Unmatched globs expand to nothing, so the sysfs scans below see empty lists.
shopt -s nullglob
|
||||
|
||||
# Succeed when $1 resolves to something the shell can run; prints nothing.
have_cmd() {
  command -v "$1" &>/dev/null
}
|
||||
|
||||
# Append a "### <title>" section to $OUT_FILE containing the combined
# stdout/stderr of a command inside a fenced text block.
#   $1    section title
#   $2..  command and arguments to run; its failure is ignored
emit_cmd() {
  local title="$1"
  shift
  {
    printf '\n### %s\n\n' "$title"
    printf '%s\n' '```text'
    "$@" 2>&1 || true
    printf '%s\n' '```'
  } >>"$OUT_FILE"
}
|
||||
|
||||
# Start the report and emit one table row per mlx5 HCA port. Ports that are
# ACTIVE, report a "400 Gb/sec" rate and use the InfiniBand link layer are
# collected in active_400g_hcas; every other port is described in
# non_400g_rows.
active_400g_hcas=()
non_400g_rows=()

# Print the content of a sysfs attribute file, or "NA" when it cannot be read.
read_port_attr() {
  cat "$1" 2>/dev/null || echo NA
}

{
  printf '%s\n' "# NCCL/RDMA 环境快照" ""
  printf -- '- Host: `%s`\n' "$HOST"
  printf -- '- Time: `%s`\n' "$(date -Is 2>/dev/null || date)"
  printf -- '- Kernel: `%s`\n' "$(uname -r 2>/dev/null || echo unknown)"
  printf '%s\n' "" "## HCA / Port 状态" ""
  printf '%s\n' "| HCA | Port | State | Phys State | Rate | Link Layer | 400G IB Rail |"
  printf '%s\n' "|---|---:|---|---|---:|---|---|"
} >"$OUT_FILE"

hca_paths=(/sys/class/infiniband/mlx5_*)
if ((${#hca_paths[@]} == 0)); then
  printf '| N/A | N/A | `%s` | N/A | N/A | N/A | NO |\n' "/sys/class/infiniband/mlx5_* not found" >>"$OUT_FILE"
else
  for hca_path in "${hca_paths[@]}"; do
    hca="$(basename "$hca_path")"
    for port_path in "$hca_path"/ports/*; do
      [[ -d "$port_path" ]] || continue
      port="$(basename "$port_path")"
      state="$(read_port_attr "$port_path/state")"
      phys_state="$(read_port_attr "$port_path/phys_state")"
      rate="$(read_port_attr "$port_path/rate")"
      layer="$(read_port_attr "$port_path/link_layer")"

      if [[ "$state" == *ACTIVE* && "$rate" == "400 Gb/sec"* && "$layer" == InfiniBand ]]; then
        is_400g="YES"
        active_400g_hcas+=("$hca")
      else
        is_400g="NO"
        non_400g_rows+=("$hca port=$port state=$state rate=$rate layer=$layer")
      fi

      printf '| `%s` | `%s` | `%s` | `%s` | `%s` | `%s` | `%s` |\n' \
        "$hca" "$port" "$state" "$phys_state" "$rate" "$layer" "$is_400g" >>"$OUT_FILE"
    done
  done
fi
|
||||
|
||||
# Append the rail summary: number of active 400G IB rails, a candidate
# NCCL_IB_HCA value, the ports that did not qualify, and the PDF bus-bandwidth
# targets.
rail_count=${#active_400g_hcas[@]}
candidate_hca="N/A"
if ((rail_count)); then
  hca_csv="$(IFS=,; echo "${active_400g_hcas[*]}")"
  candidate_hca="$hca_csv"
fi

{
  printf '%s\n' "" "## Rail 摘要" ""
  printf -- '- Active 400G IB rail count: `%s`\n' "$rail_count"
  printf -- '- Candidate `NCCL_IB_HCA`: `%s`\n' "$candidate_hca"
  if ((rail_count)); then
    # 400 Gb/s per rail divided by 8 bits per byte is 50 GB/s per rail.
    printf -- '- Theoretical one-way raw bandwidth: `%s * 400Gb/s / 8 = %s GB/s`\n' \
      "$rail_count" "$((rail_count * 50))"
  fi

  printf '%s\n' "" "Non-400G / non-IB / down ports:" ""
  if ((${#non_400g_rows[@]} == 0)); then
    printf '%s\n' "- none"
  else
    for row in "${non_400g_rows[@]}"; do
      printf -- '- `%s`\n' "$row"
    done
  fi

  printf '%s\n' "" "## PDF 目标换算" ""
  printf -- '- PDF allreduce busbw target: `%s GB/s`\n' "$PDF_ALLREDUCE_BUSBW"
  printf -- '- PDF alltoall busbw target: `%s GB/s`\n' "$PDF_ALLTOALL_BUSBW"
} >>"$OUT_FILE"
|
||||
|
||||
# Convert the PDF allreduce bus-bandwidth target into the implied algorithm
# bandwidth and compare it with the raw bandwidth of the detected rails.
# Failures of the Python step are ignored so the snapshot still completes.
python3 - "$PDF_ALLREDUCE_BUSBW" "${#active_400g_hcas[@]}" >>"$OUT_FILE" <<'PY' || true
import sys

busbw, rail_count = float(sys.argv[1]), int(sys.argv[2])

# busbw is divided by 1.875 for the 16-rank case reported below.
algbw = busbw / 1.875
# 50 GB/s of raw one-way bandwidth per 400G rail.
raw = rail_count * 50.0

print(f"- 16-rank allreduce implied algbw: `{algbw:.2f} GB/s`")
if not rail_count:
    print("- Interpretation: no active 400G IB rail was detected.")
else:
    pct = algbw / raw * 100
    print(f"- Implied algbw / current raw 400G rail bandwidth: `{pct:.1f}%`")
    verdict = "above" if algbw > raw else "within"
    print(f"- Interpretation: PDF allreduce target is {verdict} current 400G rail one-way raw bandwidth.")
PY
|
||||
|
||||
# Append a section listing NCCL net plugin / SHARP shared libraries found
# under the configured search roots, or a "none found" line.
{
  echo
  echo "## NCCL Net Plugin / SHARP 文件"
  echo
  echo '```text'
} >>"$OUT_FILE"

read -r -a plugin_roots <<<"$PLUGIN_SEARCH_ROOTS"

# Capture the search result and decide on it directly. Grepping the whole
# report for the library names would also match any unrelated earlier text.
# With no roots configured, skip find entirely so it does not fall back to
# scanning the current directory.
plugin_hits=""
if ((${#plugin_roots[@]})); then
  # find exits non-zero for missing or unreadable roots; that is expected.
  plugin_hits="$(find "${plugin_roots[@]}" \( -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' \) \
    2>/dev/null | sort || true)"
fi

if [[ -n "$plugin_hits" ]]; then
  printf '%s\n' "$plugin_hits" >>"$OUT_FILE"
else
  echo "none found under $PLUGIN_SEARCH_ROOTS" >>"$OUT_FILE"
fi

echo '```' >>"$OUT_FILE"
|
||||
|
||||
# Optional sections: relevant Debian packages, GPU topology and ibstat output.
# A missing tool yields a placeholder line, except ibstat, which is skipped.
if have_cmd dpkg; then
  # grep -E replaces the obsolescent egrep, whose deprecation warning on newer
  # GNU grep would be captured into the report by emit_cmd. The listing is
  # capped at 160 lines.
  emit_cmd "Relevant Debian packages" bash -lc "dpkg -l | grep -E -i 'nccl|sharp|hcoll|ucx|ofed|mlnx' | sed -n '1,160p'"
else
  emit_cmd "Relevant packages" bash -lc "echo 'dpkg not found'"
fi

if have_cmd nvidia-smi; then
  emit_cmd "nvidia-smi topo -m" nvidia-smi topo -m
else
  emit_cmd "nvidia-smi topo -m" bash -lc "echo 'nvidia-smi not found'"
fi

if have_cmd ibstat; then
  emit_cmd "ibstat" ibstat
fi
|
||||
|
||||
# Append the closing guidance section and the report location to the report,
# then print the report path on stdout.
cat >>"$OUT_FILE" <<EOF

## 建议判断

1. 如果 Active 400G IB rail 少于 PDF 参考环境,不能直接按 PDF 阈值判断等价。
2. 如果没有 \`libnccl-net*.so*\` / \`libsharp*.so*\`,NCCL 可能只能走 internal IB plugin。
3. 若要追 PDF 2x8 目标,请先确认 rail 数量、SHARP/NCCL net plugin、跨 Leaf 交换策略是否与 PDF 环境一致。

Snapshot written to: \`$OUT_FILE\`
EOF

printf '%s\n' "$OUT_FILE"
|
||||
277
scripts/pytorch_fp8_path_bench.py
Executable file
277
scripts/pytorch_fp8_path_bench.py
Executable file
@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare FP8 GEMM paths used for H100/H200 acceptance debugging.
|
||||
|
||||
Paths:
|
||||
A. torch._scaled_mm eager, default accumulation
|
||||
B. torch._scaled_mm eager, use_fast_accum=True
|
||||
C. CUDA Graph replay of torch._scaled_mm(out=..., use_fast_accum=True)
|
||||
D. Transformer Engine Linear under fp8_autocast, when installed
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from typing import Any, Callable
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
def tflops_from_ms(matrix_size: int, iterations: int, elapsed_ms: float) -> float:
    """Convert a timed run of square GEMMs into TFLOPS.

    Each GEMM is counted as ``2 * matrix_size**3`` floating-point operations.

    Args:
        matrix_size: Edge length of the square matrices.
        iterations: Number of GEMMs covered by ``elapsed_ms``.
        elapsed_ms: Total elapsed time in milliseconds.

    Returns:
        Throughput in units of 1e12 FLOP/s.

    Raises:
        ZeroDivisionError: If ``elapsed_ms`` is zero.
    """
    total_flops = 2.0 * matrix_size * matrix_size * matrix_size * iterations
    elapsed_s = elapsed_ms / 1000.0
    return total_flops / elapsed_s / 1e12
|
||||
|
||||
|
||||
def cuda_event_bench(
    name: str,
    matrix_size: int,
    iterations: int,
    warmup: int,
    func: Callable[[int], Any],
) -> dict[str, Any]:
    """Time repeated calls of ``func`` with CUDA events and report throughput.

    ``func`` is first called ``warmup`` times untimed, then ``iterations``
    times between two CUDA events. A host wall-clock measurement covering the
    same loop plus the final synchronize is recorded as well.

    Args:
        name: Label stored in the result.
        matrix_size: Square GEMM edge length used for the TFLOPS conversion.
        iterations: Number of timed calls.
        warmup: Number of untimed calls made first.
        func: Callable receiving the zero-based iteration index.

    Returns:
        A dict with the run parameters, the event and wall-clock timings and
        the TFLOPS derived from the event time.
    """
    for step in range(warmup):
        func(step)
    torch.cuda.synchronize()

    start_evt = torch.cuda.Event(enable_timing=True)
    stop_evt = torch.cuda.Event(enable_timing=True)

    wall_t0 = time.perf_counter()
    start_evt.record()
    for step in range(iterations):
        func(step)
    stop_evt.record()
    # Wait for the queued work so both timings cover the whole loop.
    torch.cuda.synchronize()
    wall_elapsed = time.perf_counter() - wall_t0
    gpu_ms = start_evt.elapsed_time(stop_evt)

    report: dict[str, Any] = {
        "name": name,
        "status": "ok",
        "matrix_size": matrix_size,
        "iterations": iterations,
        "warmup": warmup,
    }
    report["event_ms_total"] = round(gpu_ms, 3)
    report["event_us_per_iter"] = round(gpu_ms * 1000.0 / iterations, 3)
    report["wall_ms_total"] = round(wall_elapsed * 1000.0, 3)
    report["tflops"] = round(tflops_from_ms(matrix_size, iterations, gpu_ms), 1)
    return report
|
||||
|
||||
|
||||
def make_fp8_inputs(matrix_size: int, pools: int, device: str) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
|
||||
a = [
|
||||
torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
|
||||
for _ in range(pools)
|
||||
]
|
||||
b = [
|
||||
torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
|
||||
for _ in range(pools)
|
||||
]
|
||||
torch.cuda.synchronize()
|
||||
return a, b
|
||||
|
||||
|
||||
def bench_scaled_mm(args: argparse.Namespace) -> list[dict[str, Any]]:
    """Benchmark the three ``torch._scaled_mm`` paths (A, B and C).

    * A: eager call with default accumulation.
    * B: eager call with ``use_fast_accum=True``.
    * C: CUDA Graph replay of a single fast-accum call that writes into a
      preallocated output, using the first matrix of each input pool.

    Args:
        args: Parsed CLI options; reads ``gpu_index``, ``matrix_size``,
            ``pools``, ``iterations`` and ``warmup``.

    Returns:
        One result dict per path, in A, B, C order. If anything in the graph
        path raises, its entry has ``status`` ``"unavailable"`` and a
        ``reason`` instead of timings.
    """
    device = f"cuda:{args.gpu_index}"
    torch.cuda.set_device(args.gpu_index)
    scale_a = torch.tensor(1.0, device=device)
    scale_b = torch.tensor(1.0, device=device)
    pools_a, pools_b = make_fp8_inputs(args.matrix_size, args.pools, device)
    results: list[dict[str, Any]] = []

    def eager_default(i: int) -> torch.Tensor:
        # Rotate through the pools so consecutive calls use different operands.
        idx = i % args.pools
        return torch._scaled_mm(
            pools_a[idx],
            pools_b[idx].T,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
        )

    def eager_fast(i: int) -> torch.Tensor:
        idx = i % args.pools
        return torch._scaled_mm(
            pools_a[idx],
            pools_b[idx].T,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
            use_fast_accum=True,
        )

    results.append(
        cuda_event_bench(
            "A_eager_scaled_mm_default",
            args.matrix_size,
            args.iterations,
            args.warmup,
            eager_default,
        )
    )
    results.append(
        cuda_event_bench(
            "B_eager_scaled_mm_fast_accum",
            args.matrix_size,
            args.iterations,
            args.warmup,
            eager_fast,
        )
    )

    graph_name = "C_cuda_graph_scaled_mm_fast_accum"
    graph_out = torch.empty(
        (args.matrix_size, args.matrix_size),
        device=device,
        dtype=torch.bfloat16,
    )
    # The graph is captured on fixed tensors: the same input and output
    # objects are used for side-stream warmup, capture and every replay.
    static_a = pools_a[0]
    static_b_t = pools_b[0].T

    def graph_step() -> None:
        # Single definition of the captured op so warmup and capture cannot drift.
        torch._scaled_mm(
            static_a,
            static_b_t,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
            use_fast_accum=True,
            out=graph_out,
        )

    try:
        # Warm the op up on a side stream before capturing it.
        side_stream = torch.cuda.Stream()
        side_stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(side_stream):
            for _ in range(max(3, args.warmup // 2)):
                graph_step()
        torch.cuda.current_stream().wait_stream(side_stream)
        torch.cuda.synchronize()

        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph):
            graph_step()

        def graph_replay(_: int) -> None:
            graph.replay()

        results.append(
            cuda_event_bench(
                graph_name,
                args.matrix_size,
                args.iterations,
                3,
                graph_replay,
            )
        )
    except Exception as exc:  # noqa: BLE001
        # Best effort: a graph-path failure must not discard results A and B.
        results.append(
            {
                "name": graph_name,
                "status": "unavailable",
                "reason": f"{type(exc).__name__}: {exc}",
            }
        )

    return results
|
||||
|
||||
|
||||
def bench_transformer_engine(args: argparse.Namespace) -> dict[str, Any]:
    """Benchmark path D: a Transformer Engine ``Linear`` under ``fp8_autocast``.

    Args:
        args: Parsed CLI options; reads ``gpu_index``, ``matrix_size``,
            ``iterations`` and ``warmup``.

    Returns:
        A single result dict. ``status`` is ``"unavailable"`` when Transformer
        Engine cannot be imported, ``"error"`` when building the layer or
        running the benchmark raises, and ``"ok"`` (with timings and a
        ``note``) otherwise.
    """
    name = "D_transformer_engine_fp8_linear"
    try:
        import transformer_engine.pytorch as te  # type: ignore[import-not-found]
        from transformer_engine.common.recipe import DelayedScaling, Format  # type: ignore[import-not-found]
    except Exception as exc:  # noqa: BLE001
        return {
            "name": name,
            "status": "unavailable",
            "reason": f"{type(exc).__name__}: {exc}",
        }

    device = f"cuda:{args.gpu_index}"
    try:
        # Setup lives inside the try so that a failure while building the
        # input, layer or recipe is reported as an "error" result rather than
        # aborting the script and losing the results already collected.
        x = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=torch.bfloat16)
        layer = te.Linear(
            args.matrix_size,
            args.matrix_size,
            bias=False,
            params_dtype=torch.bfloat16,
            device=device,
        )
        recipe = DelayedScaling(fp8_format=Format.HYBRID)

        def run(_: int) -> torch.Tensor:
            with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
                return layer(x)

        result = cuda_event_bench(
            name,
            args.matrix_size,
            args.iterations,
            args.warmup,
            run,
        )
    except Exception as exc:  # noqa: BLE001
        return {
            "name": name,
            "status": "error",
            "reason": f"{type(exc).__name__}: {exc}",
        }
    result["note"] = "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
    return result
|
||||
|
||||
|
||||
def main() -> int:
    """Parse CLI options, run every FP8 path and print one JSON report.

    Returns:
        Process exit code: ``0`` when the benchmarks ran; ``1`` when CUDA or
        FP8 ``_scaled_mm`` support is missing, the GPU index is out of range,
        or a CUDA out-of-memory error aborted the run. Invalid numeric options
        exit through ``argparse`` with status 2.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--matrix-size", type=int, default=8192)
    parser.add_argument("--warmup", type=int, default=20)
    parser.add_argument("--iterations", type=int, default=100)
    parser.add_argument("--gpu-index", type=int, default=0)
    parser.add_argument("--pools", type=int, default=4)
    args = parser.parse_args()

    # Zero iterations or pools would otherwise surface later as a
    # ZeroDivisionError (per-iteration average, ``i % pools``); fail early.
    for flag, value in (
        ("--matrix-size", args.matrix_size),
        ("--iterations", args.iterations),
        ("--pools", args.pools),
    ):
        if value < 1:
            parser.error(f"{flag} must be >= 1 (got {value})")

    if not torch.cuda.is_available():
        print(json.dumps({"error": "cuda unavailable"}, indent=2))
        return 1
    if not hasattr(torch, "_scaled_mm") or not hasattr(torch, "float8_e4m3fn"):
        print(json.dumps({"error": "torch FP8 _scaled_mm unavailable"}, indent=2))
        return 1

    device_count = torch.cuda.device_count()
    if not 0 <= args.gpu_index < device_count:
        message = f"gpu index {args.gpu_index} out of range ({device_count} device(s) visible)"
        print(json.dumps({"error": message}, indent=2))
        return 1

    torch.cuda.set_device(args.gpu_index)
    props = torch.cuda.get_device_properties(args.gpu_index)
    payload = {
        "source": "pytorch_fp8_path_bench",
        "torch": torch.__version__,
        "cuda": torch.version.cuda,
        "gpu_index": args.gpu_index,
        "gpu_name": props.name,
        "matrix_size": args.matrix_size,
        "warmup": args.warmup,
        "iterations": args.iterations,
        "pools": args.pools,
        "results": [],
    }
    try:
        payload["results"].extend(bench_scaled_mm(args))
        payload["results"].append(bench_transformer_engine(args))
    except torch.cuda.OutOfMemoryError as exc:
        # Still emit whatever was collected so the report remains valid JSON.
        payload["error"] = f"CUDA OOM: {exc}"
        print(json.dumps(payload, indent=2))
        return 1

    ok_values = [r["tflops"] for r in payload["results"] if r.get("status") == "ok"]
    if ok_values:
        payload["summary"] = {
            "max_tflops": round(max(ok_values), 1),
            "min_tflops": round(min(ok_values), 1),
            "mean_tflops": round(statistics.mean(ok_values), 1),
        }
    print(json.dumps(payload, indent=2))
    return 0
|
||||
|
||||
|
||||
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
45
scripts/run_cublaslt_fp8_gemm.sh
Executable file
45
scripts/run_cublaslt_fp8_gemm.sh
Executable file
@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env bash
# Build scripts/cublaslt_fp8_gemm_bench.cu with nvcc, run it, and save its
# stdout as a timestamped JSON report under OUT_DIR.
#
# Environment overrides (defaults in parentheses):
#   CUDA_HOME (/usr/local/cuda)   NVCC ($CUDA_HOME/bin/nvcc)   CUDA_ARCH (sm_90)
#   OUT_DIR (<project>/reports)   MATRIX_SIZE (8192)           WARMUP (20)
#   ITERATIONS (200)              GPU_COUNT (8)                FIRST_GPU (0)
#   WORKSPACE_MB (256)
#
# Exit status: 1 if nvcc is missing or the build fails, otherwise the exit
# status of the benchmark binary.
set -uo pipefail

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
CUDA_ARCH="${CUDA_ARCH:-sm_90}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
WARMUP="${WARMUP:-20}"
ITERATIONS="${ITERATIONS:-200}"
GPU_COUNT="${GPU_COUNT:-8}"
FIRST_GPU="${FIRST_GPU:-0}"
WORKSPACE_MB="${WORKSPACE_MB:-256}"

if [[ ! -x "$NVCC" ]]; then
  echo "nvcc not found: $NVCC" >&2
  exit 1
fi

mkdir -p "$OUT_DIR" "$PROJECT_DIR/build"
HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
BIN="$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
REPORT="$OUT_DIR/cublaslt_fp8_gemm_${HOST}_${TS}.json"

# errexit is not enabled, so the build result must be checked explicitly;
# otherwise a failed compile would go on to run a stale binary left in build/.
if ! "$NVCC" -O3 -std=c++17 "-arch=$CUDA_ARCH" \
  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
  -lcublasLt -lcublas -o "$BIN"; then
  echo "Failed to build $BIN" >&2
  exit 1
fi

"$BIN" \
  --matrix-size "$MATRIX_SIZE" \
  --warmup "$WARMUP" \
  --iterations "$ITERATIONS" \
  --first-gpu "$FIRST_GPU" \
  --gpu-count "$GPU_COUNT" \
  --workspace-mb "$WORKSPACE_MB" \
  | tee "$REPORT"
# Exit status of the benchmark itself, not of tee.
status=${PIPESTATUS[0]}

echo "Report written to: $REPORT"
exit "$status"
|
||||
93
scripts/run_fp8_path_comparison.sh
Executable file
93
scripts/run_fp8_path_comparison.sh
Executable file
@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env bash
# Run the PyTorch FP8 path benchmark and the direct cuBLASLt FP8 benchmark on
# one GPU, then merge both JSON reports into a combined report under OUT_DIR.
#
# Environment overrides (defaults in parentheses):
#   PYTHON (/root/gpu-test-venv/bin/python, falling back to python3 on PATH)
#   CUDA_HOME (/usr/local/cuda-12.4)   NVCC ($CUDA_HOME/bin/nvcc)
#   CUDA_ARCH (sm_90)                  OUT_DIR (<project>/reports)
#   MATRIX_SIZE (8192)  WARMUP (20)  ITERATIONS (100)  GPU_INDEX (0)
#   WORKSPACE_MB (256)
set -euo pipefail

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON="${PYTHON:-/root/gpu-test-venv/bin/python}"
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}"
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
CUDA_ARCH="${CUDA_ARCH:-sm_90}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
WARMUP="${WARMUP:-20}"
ITERATIONS="${ITERATIONS:-100}"
GPU_INDEX="${GPU_INDEX:-0}"
WORKSPACE_MB="${WORKSPACE_MB:-256}"

# Preflight the tools up front so a missing interpreter or compiler gives a
# clear message instead of an errexit abort halfway through the run.
if ! command -v "$PYTHON" >/dev/null 2>&1; then
  PYTHON="$(command -v python3 || true)"
fi
if [[ -z "$PYTHON" ]]; then
  echo "Python executable not found. Set PYTHON." >&2
  exit 1
fi
if ! command -v "$NVCC" >/dev/null 2>&1; then
  echo "nvcc not found: $NVCC" >&2
  exit 1
fi

# Put the cuDNN and NCCL libraries shipped inside the Python environment's
# site-packages on the loader path.
VENV_SITE_PACKAGES="$("$PYTHON" -c 'import site; print(site.getsitepackages()[0])')"
export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib:${LD_LIBRARY_PATH:-}"

mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json"
CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json"
COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json"

# With errexit + pipefail a failing benchmark aborts the script even though
# tee itself succeeds.
"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \
  --matrix-size "$MATRIX_SIZE" \
  --warmup "$WARMUP" \
  --iterations "$ITERATIONS" \
  --gpu-index "$GPU_INDEX" | tee "$PY_REPORT"

"$NVCC" -O3 -std=c++17 "-arch=$CUDA_ARCH" \
  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
  -lcublasLt -lcublas -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"

"$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" \
  --matrix-size "$MATRIX_SIZE" \
  --warmup "$WARMUP" \
  --iterations "$ITERATIONS" \
  --first-gpu "$GPU_INDEX" \
  --gpu-count 1 \
  --workspace-mb "$WORKSPACE_MB" \
  --fast-accum 1 | tee "$CUBLAS_REPORT"

# Merge both reports: PyTorch results first, then the first cuBLASLt per-GPU
# row appended as path E.
"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY'
import json
import pathlib
import sys

py_report = pathlib.Path(sys.argv[1])
cublas_report = pathlib.Path(sys.argv[2])
combined_report = pathlib.Path(sys.argv[3])

with py_report.open() as f:
    py_payload = json.load(f)
with cublas_report.open() as f:
    cublas_payload = json.load(f)

combined = {
    "source": "fp8_path_comparison",
    "host": cublas_payload.get("host"),
    "matrix_size": py_payload.get("matrix_size"),
    "gpu_index": py_payload.get("gpu_index"),
    "pytorch": py_payload,
    "cublaslt": cublas_payload,
    "results": [],
}
combined["results"].extend(py_payload.get("results", []))
per_gpu = cublas_payload.get("per_gpu", [])
if per_gpu:
    row = dict(per_gpu[0])
    # A row without a throughput figure is recorded as an error instead of
    # raising KeyError and losing the combined report altogether.
    tflops = row.pop("fp8_tflops", None)
    row.update({
        "name": "E_direct_cublaslt_fast_accum",
        "status": "ok" if tflops is not None else "error",
        "tflops": tflops,
        "matrix_size": cublas_payload.get("matrix_size"),
        "iterations": cublas_payload.get("iterations"),
        "warmup": cublas_payload.get("warmup"),
        "fast_accum": cublas_payload.get("fast_accum"),
        "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
    })
    if tflops is None:
        row["reason"] = "cuBLASLt per_gpu[0] has no fp8_tflops value"
    combined["results"].append(row)

combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
print(f"Combined report written to: {combined_report}")
PY

echo "$COMBINED_REPORT"
|
||||
134
scripts/run_h100_single_node_all.sh
Executable file
134
scripts/run_h100_single_node_all.sh
Executable file
@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env bash
set -uo pipefail

# Run the single-node H100 acceptance suite and keep the raw report paths stable.
# The suite itself still lives in gpu_tester.py; this wrapper only standardizes
# snapshot/report naming for repeated machine-level runs.
#
# Exit status: 2 for usage errors, 1 when no Python interpreter is found,
# otherwise the exit status of gpu_tester.py.

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/default.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"
DRY_RUN=0
SNAPSHOT=1

usage() {
  cat <<'EOF'
Usage: run_h100_single_node_all.sh [options]

Options:
  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
  --config PATH       gpu_tester config file (default: configs/default.yaml)
  --out-dir PATH      Report output directory (default: reports)
  --format FORMAT     Report format: md, json, or html (default: md)
  --no-snapshot       Do not run nccl_environment_snapshot.sh first
  --dry-run           Print commands without running them
  -h, --help          Show this help
EOF
}

# Called with the remaining arguments; exits with a usage error when a
# value-taking option is the last argument. Without this, "$2" is unbound
# under `set -u` and the shell dies with an unhelpful message.
require_value() {
  if (($# < 2)); then
    echo "Missing value for $1" >&2
    usage >&2
    exit 2
  fi
}

while (($#)); do
  case "$1" in
    --python)
      require_value "$@"
      PYTHON_BIN="$2"
      shift 2
      ;;
    --config)
      require_value "$@"
      CONFIG_FILE="$2"
      shift 2
      ;;
    --out-dir)
      require_value "$@"
      OUT_DIR="$2"
      shift 2
      ;;
    --format)
      require_value "$@"
      FORMAT="$2"
      shift 2
      ;;
    --no-snapshot)
      SNAPSHOT=0
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
  echo "Unsupported format: $FORMAT" >&2
  exit 2
fi

# Fall back to python3 on PATH when the configured interpreter is unusable.
if [[ ! -x "$PYTHON_BIN" ]]; then
  PYTHON_BIN="$(command -v python3 || true)"
fi

if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
mkdir -p "$OUT_DIR"

SNAPSHOT_FILE="$OUT_DIR/nccl_environment_snapshot_${HOST}_${TS}.md"
REPORT_FILE="$OUT_DIR/h100_single_node_all_${HOST}_${TS}.${FORMAT}"

snapshot_cmd=(bash "$PROJECT_DIR/scripts/nccl_environment_snapshot.sh" "$SNAPSHOT_FILE")
test_cmd=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test all
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)

echo "Project: $PROJECT_DIR"
echo "Host: $HOST"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
if ((SNAPSHOT)); then
  echo "Snapshot: $SNAPSHOT_FILE"
fi

if ((DRY_RUN)); then
  if ((SNAPSHOT)); then
    printf 'DRY RUN snapshot:'
    printf ' %q' "${snapshot_cmd[@]}"
    printf '\n'
  fi
  printf 'DRY RUN test:'
  printf ' %q' "${test_cmd[@]}"
  printf '\n'
  exit 0
fi

# The snapshot is best effort: a failure is reported but does not stop the suite.
snapshot_status=0
if ((SNAPSHOT)); then
  "${snapshot_cmd[@]}"
  snapshot_status=$?
  if ((snapshot_status != 0)); then
    echo "Warning: environment snapshot failed with exit code $snapshot_status; continuing" >&2
  fi
fi

"${test_cmd[@]}"
status=$?

echo "Report written to: $REPORT_FILE"
if ((SNAPSHOT)) && ((snapshot_status == 0)); then
  echo "Snapshot written to: $SNAPSHOT_FILE"
fi

exit "$status"
|
||||
147
scripts/run_multinode_nccl_all_collectives.sh
Executable file
147
scripts/run_multinode_nccl_all_collectives.sh
Executable file
@ -0,0 +1,147 @@
|
||||
#!/usr/bin/env bash
set -uo pipefail

# Run a two-node, eight-GPU-per-node NCCL evidence pass across the six
# collectives used by the single-node H100 acceptance flow.
#
# Exit status: 2 for usage errors, 1 when no Python interpreter is found, the
# preflight's exit status when it fails, otherwise the exit status of
# gpu_tester.py.

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"
DRY_RUN=0
RUN_PREFLIGHT=1
PREFLIGHT_ONLY=0

usage() {
  cat <<'EOF'
Usage: run_multinode_nccl_all_collectives.sh [options]

Options:
  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
  --config PATH       Config file (default: configs/multinode_nccl_nccl227_all_collectives_2x8.yaml)
  --out-dir PATH      Report output directory (default: reports)
  --format FORMAT     Report format: md, json, or html (default: md)
  --no-preflight      Skip scripts/multinode_nccl_deep_diagnose.sh preflight
  --preflight-only    Run only the preflight check, not the workload
  --dry-run           Print commands without running them
  -h, --help          Show this help
EOF
}

# Called with the remaining arguments; exits with a usage error when a
# value-taking option is the last argument. Without this, "$2" is unbound
# under `set -u` and the shell dies with an unhelpful message.
require_value() {
  if (($# < 2)); then
    echo "Missing value for $1" >&2
    usage >&2
    exit 2
  fi
}

while (($#)); do
  case "$1" in
    --python)
      require_value "$@"
      PYTHON_BIN="$2"
      shift 2
      ;;
    --config)
      require_value "$@"
      CONFIG_FILE="$2"
      shift 2
      ;;
    --out-dir)
      require_value "$@"
      OUT_DIR="$2"
      shift 2
      ;;
    --format)
      require_value "$@"
      FORMAT="$2"
      shift 2
      ;;
    --no-preflight)
      RUN_PREFLIGHT=0
      shift
      ;;
    --preflight-only)
      PREFLIGHT_ONLY=1
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
  echo "Unsupported format: $FORMAT" >&2
  exit 2
fi

# Fall back to python3 on PATH when the configured interpreter is unusable.
if [[ ! -x "$PYTHON_BIN" ]]; then
  PYTHON_BIN="$(command -v python3 || true)"
fi

if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi

TS="$(date +%Y%m%d_%H%M%S)"
mkdir -p "$OUT_DIR"

REPORT_FILE="$OUT_DIR/multinode_nccl_all_collectives_${TS}.${FORMAT}"
ARTIFACT_DIR="$OUT_DIR/multinode_nccl_all_collectives_${TS}_artifacts"
PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
RUN_CMD=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test multinode-nccl
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)

echo "Project: $PROJECT_DIR"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
echo "Artifacts: $ARTIFACT_DIR"
echo "Collectives: allreduce, alltoall, broadcast, reducescatter, allgather, sendrecv"
echo "Topology: 2 nodes x 8 GPUs per node; 16G"

if ((DRY_RUN)); then
  if ((RUN_PREFLIGHT)); then
    printf 'DRY RUN preflight:'
    printf ' %q' "${PREFLIGHT_CMD[@]}"
    printf '\n'
  fi
  if ((PREFLIGHT_ONLY)); then
    exit 0
  fi
  printf 'DRY RUN workload:'
  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
  printf ' %q' "${RUN_CMD[@]}"
  printf '\n'
  exit 0
fi

# A failed preflight stops the run before the workload starts.
if ((RUN_PREFLIGHT)); then
  "${PREFLIGHT_CMD[@]}"
  preflight_status=$?
  if ((preflight_status != 0)); then
    echo "Preflight failed with exit code $preflight_status" >&2
    exit "$preflight_status"
  fi
fi

if ((PREFLIGHT_ONLY)); then
  exit 0
fi

mkdir -p "$ARTIFACT_DIR"
# The artifact directory is passed to gpu_tester.py through the environment.
MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${RUN_CMD[@]}"
status=$?

echo "Report written to: $REPORT_FILE"
echo "Artifacts written to: $ARTIFACT_DIR"
exit "$status"
|
||||
147
scripts/run_multinode_nccl_pdf_matrix.sh
Executable file
147
scripts/run_multinode_nccl_pdf_matrix.sh
Executable file
@ -0,0 +1,147 @@
|
||||
#!/usr/bin/env bash
set -uo pipefail

# Run the formal cross-node NCCL PDF matrix for the current two-node H100 pair.
# This wrapper standardizes the command, output naming, and preflight hook; the
# actual benchmark implementation remains in gpu_tester.py / MultiNodeNCCLTest.
#
# Exit status: 2 for usage errors, 1 when no Python interpreter is found, the
# preflight's exit status when it fails, otherwise the exit status of
# gpu_tester.py.

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_pdf_matrix.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"
DRY_RUN=0
RUN_PREFLIGHT=1
PREFLIGHT_ONLY=0

usage() {
  cat <<'EOF'
Usage: run_multinode_nccl_pdf_matrix.sh [options]

Options:
  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
  --config PATH       Matrix config file (default: configs/multinode_nccl_nccl227_pdf_matrix.yaml)
  --out-dir PATH      Report output directory (default: reports)
  --format FORMAT     Report format: md, json, or html (default: md)
  --no-preflight      Skip scripts/multinode_nccl_deep_diagnose.sh preflight
  --preflight-only    Run only the preflight check, not the matrix workload
  --dry-run           Print commands without running them
  -h, --help          Show this help
EOF
}

# Called with the remaining arguments; exits with a usage error when a
# value-taking option is the last argument. Without this, "$2" is unbound
# under `set -u` and the shell dies with an unhelpful message.
require_value() {
  if (($# < 2)); then
    echo "Missing value for $1" >&2
    usage >&2
    exit 2
  fi
}

while (($#)); do
  case "$1" in
    --python)
      require_value "$@"
      PYTHON_BIN="$2"
      shift 2
      ;;
    --config)
      require_value "$@"
      CONFIG_FILE="$2"
      shift 2
      ;;
    --out-dir)
      require_value "$@"
      OUT_DIR="$2"
      shift 2
      ;;
    --format)
      require_value "$@"
      FORMAT="$2"
      shift 2
      ;;
    --no-preflight)
      RUN_PREFLIGHT=0
      shift
      ;;
    --preflight-only)
      PREFLIGHT_ONLY=1
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
  echo "Unsupported format: $FORMAT" >&2
  exit 2
fi

# Fall back to python3 on PATH when the configured interpreter is unusable.
if [[ ! -x "$PYTHON_BIN" ]]; then
  PYTHON_BIN="$(command -v python3 || true)"
fi

if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi

TS="$(date +%Y%m%d_%H%M%S)"
mkdir -p "$OUT_DIR"

REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}"
ARTIFACT_DIR="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}_artifacts"
PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
MATRIX_CMD=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test multinode-nccl
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)

echo "Project: $PROJECT_DIR"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
echo "Artifacts: $ARTIFACT_DIR"
echo "Matrix: 2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G"

if ((DRY_RUN)); then
  if ((RUN_PREFLIGHT)); then
    printf 'DRY RUN preflight:'
    printf ' %q' "${PREFLIGHT_CMD[@]}"
    printf '\n'
  fi
  if ((PREFLIGHT_ONLY)); then
    exit 0
  fi
  printf 'DRY RUN matrix:'
  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
  printf ' %q' "${MATRIX_CMD[@]}"
  printf '\n'
  exit 0
fi

# A failed preflight stops the run before the matrix workload starts.
if ((RUN_PREFLIGHT)); then
  "${PREFLIGHT_CMD[@]}"
  preflight_status=$?
  if ((preflight_status != 0)); then
    echo "Preflight failed with exit code $preflight_status" >&2
    exit "$preflight_status"
  fi
fi

if ((PREFLIGHT_ONLY)); then
  exit 0
fi

mkdir -p "$ARTIFACT_DIR"
# The artifact directory is passed to gpu_tester.py through the environment.
MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${MATRIX_CMD[@]}"
status=$?

echo "Report written to: $REPORT_FILE"
echo "Artifacts written to: $ARTIFACT_DIR"
exit "$status"
|
||||
Loading…
x
Reference in New Issue
Block a user