h100-acceptance-current #3

Open
cs wants to merge 41 commits from h100-acceptance-current into main
26 changed files with 3988 additions and 161 deletions

8
.gitignore vendored
View File

@ -6,6 +6,12 @@ __pycache__/
dist/
build/
reports/
reports_*
H100*.md
test_all*.md
docs/h100_test_all_metrics_guide_cn.md
docs/multinode_nccl_concepts.md
docs/multinode_nccl_deep_diagnose_runbook.md
*.egg
.eggs/
*.log
@ -14,4 +20,6 @@ reports/
.venv/
venv/
.qoder/*
.playwright-mcp/
.claude/settings.local.json
.omx/

View File

@ -48,6 +48,55 @@ nccl:
test_allgather: false
test_sendrecv: false
# Cross-node NCCL settings (disabled by default; "sweep" mode).
# NOTE(review): host addresses, ssh_user and library paths are site-specific.
multinode_nccl:
  enabled: false
  mode: sweep
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null  # null = tools.install_dir/nccl-tests/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
  begin_size: 1k
  end_size: 16g
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  net_plugin: none
  nvls_enable: 1
  split_data_on_qps: null
  extra_env: {}
  # Minimum acceptable peak bus bandwidth per collective.
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75
stress:
duration_sec: 600 # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup
use_doubles: false

View File

@ -0,0 +1,62 @@
---
# Multi-node NCCL "diagnostic" profile: one 2-node x 8-GPU topology and a
# single 256M message size (begin_size == end_size).
# NOTE(review): host addresses, ssh_user and library paths are site-specific.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: diagnostic
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: null
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs diagnostic
  begin_size: 256M
  end_size: 256M
  step_factor: 2
  warmup_iters: 1
  iters: 3
  gpus_per_rank: 1
  timeout_sec: 600
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  net_plugin: none
  nvls_enable: 1
  split_data_on_qps: null
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: 5
    NCCL_NET_GDR_READ: 1
    NCCL_DMABUF_ENABLE: 0
  # Minimum acceptable peak bus bandwidth per collective.
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75

View File

@ -0,0 +1,62 @@
---
# Multi-node NCCL "large-message" profile for NCCL 2.27.7: one 2-node x 8-GPU
# topology and a single 16G message size (begin_size == end_size).
# NOTE(review): host addresses, ssh_user and library paths are site-specific.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: large-message-nccl-2.27.7
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp while jobs run as
    # root; confirm it is populated and trusted on every host.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs NCCL 2.27.7 16G
  begin_size: 16G
  end_size: 16G
  step_factor: 2
  warmup_iters: 1
  iters: 3
  gpus_per_rank: 1
  timeout_sec: 1200
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  net_plugin: none
  nvls_enable: 1
  split_data_on_qps: null
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: 5
    NCCL_NET_GDR_READ: 1
    NCCL_DMABUF_ENABLE: 0
  # Minimum acceptable peak bus bandwidth per collective.
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75

View File

@ -0,0 +1,72 @@
---
# Multi-node NCCL "all collectives" profile for NCCL 2.27.7: six nccl-tests
# binaries on one 2-node x 8-GPU topology at a single 16G message size.
# NOTE(review): host addresses, ssh_user and library paths are site-specific.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: cross-leaf-all-collectives-nccl-2.27.7
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp while jobs run as
    # root; confirm it is populated and trusted on every host.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
    - broadcast_perf
    - reduce_scatter_perf
    - all_gather_perf
    - sendrecv_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs (all collectives evidence run)
      # NOTE(review): op_env is assumed to belong to this topology entry
      # (per-operation environment overrides) - confirm the nesting.
      op_env:
        alltoall:
          NCCL_PXN_DISABLE: 1
  begin_size: 16G
  end_size: 16G
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  net_plugin: none
  nvls_enable: 1
  split_data_on_qps: null
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: 5
    NCCL_NET_GDR_READ: 1
    NCCL_DMABUF_ENABLE: 0
  # Minimum acceptable peak bus bandwidth per collective; the four collectives
  # set to 0 carry no effective lower bound.
  min_peak_busbw_gbps:
    allreduce: 491.84
    alltoall: 76.54
    broadcast: 0
    reducescatter: 0
    allgather: 0
    sendrecv: 0

View File

@ -0,0 +1,62 @@
---
# Multi-node NCCL "large-message auto" profile for NCCL 2.27.7: one 2-node x
# 8-GPU topology and a single 16G message size (begin_size == end_size).
# NOTE(review): host addresses, ssh_user and library paths are site-specific.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: large-message-nccl-2.27.7-auto
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp while jobs run as
    # root; confirm it is populated and trusted on every host.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs NCCL 2.27.7 auto 16G
  begin_size: 16G
  end_size: 16G
  step_factor: 2
  warmup_iters: 1
  iters: 3
  gpus_per_rank: 1
  timeout_sec: 1200
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  net_plugin: none
  nvls_enable: 1
  split_data_on_qps: null
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: 5
    NCCL_NET_GDR_READ: 1
    NCCL_DMABUF_ENABLE: 0
  # Minimum acceptable peak bus bandwidth per collective.
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75

View File

@ -0,0 +1,62 @@
---
# Multi-node NCCL "diagnostic" profile for NCCL 2.27.7: one 2-node x 8-GPU
# topology and a single 256M message size (begin_size == end_size).
# NOTE(review): host addresses, ssh_user and library paths are site-specific.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: diagnostic-nccl-2.27.7
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp while jobs run as
    # root; confirm it is populated and trusted on every host.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs NCCL 2.27.7
  begin_size: 256M
  end_size: 256M
  step_factor: 2
  warmup_iters: 1
  iters: 3
  gpus_per_rank: 1
  timeout_sec: 600
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  net_plugin: none
  nvls_enable: 1
  split_data_on_qps: null
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: 5
    NCCL_NET_GDR_READ: 1
    NCCL_DMABUF_ENABLE: 0
  # Minimum acceptable peak bus bandwidth per collective.
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75

View File

@ -0,0 +1,91 @@
---
# Multi-node NCCL "PDF matrix" profile for NCCL 2.27.7: four topologies
# (2 nodes x 1/2/4/8 GPUs), each with its own bandwidth thresholds, at a
# single 16G message size.
# NOTE(review): host addresses, ssh_user and library paths are site-specific.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: cross-leaf-pdf-matrix-nccl-2.27.7
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp while jobs run as
    # root; confirm it is populated and trusted on every host.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 1
      label: 2 nodes x 1 GPU (PDF 2 machines 2 GPUs)
      min_peak_busbw_gbps:
        allreduce: 48.90
        alltoall: 27.25
    - nodes: 2
      gpus_per_node: 2
      label: 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs)
      min_peak_busbw_gbps:
        allreduce: 136.93
        alltoall: 54.41
    - nodes: 2
      gpus_per_node: 4
      label: 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs)
      cuda_visible_devices: 0,1,4,5
      op_env:
        alltoall:
          NCCL_IB_QPS_PER_CONNECTION: 4
          NCCL_MIN_NCHANNELS: 4
          NCCL_IB_SPLIT_DATA_ON_QPS: 1
      min_peak_busbw_gbps:
        allreduce: 335.48
        alltoall: 73.73
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs)
      op_env:
        alltoall:
          NCCL_PXN_DISABLE: 1
      min_peak_busbw_gbps:
        allreduce: 491.84
        alltoall: 76.54
  begin_size: 16G
  end_size: 16G
  step_factor: 2
  warmup_iters: 10
  gpus_per_rank: 1
  timeout_sec: 1800
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  net_plugin: none
  nvls_enable: 1
  split_data_on_qps: null
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: 5
    NCCL_NET_GDR_READ: 1
    NCCL_DMABUF_ENABLE: 0
  # Section-level thresholds are 0 here; every topology above carries its own.
  # NOTE(review): presumably the per-topology values take precedence - confirm.
  min_peak_busbw_gbps:
    allreduce: 0
    alltoall: 0

View File

@ -0,0 +1,62 @@
---
# Multi-node NCCL "sweep" profile for NCCL 2.27.7: one 2-node x 8-GPU
# topology, message sizes from 1M to 4G with step_factor 4.
# NOTE(review): host addresses, ssh_user and library paths are site-specific.
tools:
  install_dir: /opt/gpu-test-tools

report:
  output_dir: ./reports
  format: md

multinode_nccl:
  enabled: true
  mode: sweep-nccl-2.27.7
  hosts:
    - name: nccl-gpu-1
      addr: 172.72.8.12
      slots: 8
    - name: nccl-gpu-2
      addr: 172.72.8.16
      slots: 8
  ssh_user: root
  ssh_preflight: true
  mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun
  mpi_ld_preload: null
  extra_ld_library_path:
    - /usr/mpi/gcc/openmpi-4.1.9a1/lib
    # NOTE(review): this library directory lives under /tmp while jobs run as
    # root; confirm it is populated and trusted on every host.
    - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu
    - /usr/local/cuda-12.4/targets/x86_64-linux/lib
  nccl_tests_dir: /data/nccl-tests-latest/build
  tests:
    - all_reduce_perf
    - alltoall_perf
  topologies:
    - nodes: 2
      gpus_per_node: 8
      label: 2 nodes x 8 GPUs NCCL 2.27.7 sweep
  begin_size: 1M
  end_size: 4G
  step_factor: 4
  warmup_iters: 2
  iters: 5
  gpus_per_rank: 1
  timeout_sec: 1200
  debug: INFO
  socket_ifname: bond0
  oob_tcp_ifname: bond0
  plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30"
  ib_gid_index: 3
  ib_sl: 5
  ib_tc: 136
  ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7
  ib_timeout: 22
  qps_per_connection: null
  min_nchannels: null
  net_plugin: none
  nvls_enable: 1
  split_data_on_qps: null
  extra_env:
    NCCL_DEBUG_SUBSYS: INIT,NET
    NCCL_NET_GDR_LEVEL: 5
    NCCL_NET_GDR_READ: 1
    NCCL_DMABUF_ENABLE: 0
  # Minimum acceptable peak bus bandwidth per collective.
  min_peak_busbw_gbps:
    allreduce: 480
    alltoall: 75

View File

@ -5,6 +5,7 @@ import argparse
import json
import os
import signal
import socket
import sys
import time
from datetime import datetime
@ -25,6 +26,9 @@ from modules.nccl_test import NCCLTest
from modules.training_sim import TrainingSim
from modules.stress_test import StressTest
from modules.rdma_test import RDMATest
from modules.nvlink_test import NVLinkTest
from modules.dcgm_test import DCGMTest
from modules.multinode_nccl_test import MultiNodeNCCLTest
from modules.report import ReportGenerator
from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility
@ -32,43 +36,125 @@ DEFAULT_CONFIG = {
"benchmark": {
"memory": {"size_mb": 4096, "iterations": 10, "nvbandwidth_buffer_mb": 512, "nvbandwidth_samples": 3},
"compute": {
"dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8"],
"matrix_size": 4096,
"warmup": 10,
"iterations": 100,
"dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"],
"matrix_size": 8192,
"warmup": 50,
"iterations": 500,
"use_compile": True,
},
},
"health": {"temp_warning": 80, "temp_critical": 90, "power_limit": None},
"health": {"temp_warning": 75, "temp_critical": 85, "power_limit": None},
"nccl": {
"min_bandwidth_gbps": None,
"test_allreduce": True,
"test_alltoall": True,
"test_broadcast": True,
"test_reduce_scatter": False,
"test_allgather": False,
"test_sendrecv": False,
"test_reduce_scatter": True,
"test_allgather": True,
"test_sendrecv": True,
"message_sizes": ["1M", "256M", "2G"],
"repeats": 3,
"max_stddev_pct": 3,
},
"multinode_nccl": {
"enabled": False,
"mode": "sweep",
"hosts": [
{"name": "nccl-gpu-1", "addr": "172.72.8.12", "slots": 8},
{"name": "nccl-gpu-2", "addr": "172.72.8.16", "slots": 8},
],
"ssh_user": "root",
"ssh_preflight": True,
"mpirun_path": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun",
"mpi_ld_preload": None,
"extra_ld_library_path": [
"/usr/mpi/gcc/openmpi-4.1.9a1/lib",
"/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib",
"/usr/local/cuda-12.4/targets/x86_64-linux/lib",
],
"nccl_tests_dir": None,
"tests": ["all_reduce_perf", "alltoall_perf"],
"topologies": [{"nodes": 2, "gpus_per_node": 8}],
"begin_size": "1k",
"end_size": "16g",
"step_factor": 2,
"warmup_iters": 10,
"gpus_per_rank": 1,
"timeout_sec": 1800,
"socket_ifname": "bond0",
"ib_gid_index": 3,
"ib_sl": 5,
"ib_tc": 136,
"ib_hca": "mlx5_0,mlx5_1,mlx5_6,mlx5_7",
"ib_timeout": 22,
"qps_per_connection": 4,
"min_nchannels": 4,
"net_plugin": "none",
"nvls_enable": 1,
"split_data_on_qps": 1,
"min_peak_busbw_gbps": {"allreduce": 480, "alltoall": 75},
},
"stress": {
"duration_sec": 60,
"duration_sec": 1800,
"production_duration_sec": 1800,
"use_gpu_burn": False,
"use_doubles": False,
"use_tensor_cores": True,
"memory_pct": 90,
"gpus": "all",
"dtype": "bf16",
"matrix_size": 24576,
"telemetry_interval_sec": 1,
"warmup_sec": 60,
"min_steady_samples": 10,
"max_temp_c": 80,
"max_temp_delta_c": 5,
"min_power_watts": 630,
"max_tflops_jitter_pct": 5,
"require_tflops_jitter": True,
},
"rdma": {
"min_bandwidth_gbps": 50,
"max_latency_us": 10,
"min_bandwidth_gbps": 47,
"min_port_rate_gbps": 400,
"max_latency_us": 3.5,
"max_write_latency_us": 2.0,
"max_read_latency_us": 3.5,
"ib_iterations": 1000,
"msg_size": 65536,
"msg_size": 4194304,
"latency_msg_size": 8,
"ib_device": None,
"ib_port": 1,
"server_addr": None,
"ibping_target": None,
"ibping_count": 5,
"role": "auto",
"pfc_ecn_counters": True,
},
"nvlink": {
"expected_links_per_gpu": 18,
"expected_link_speed_gbps": 25,
"require_zero_errors": True,
},
"dcgm": {
"diag_level": 3,
"timeout_sec": 1200,
"expected_num_gpus": 8,
"json_output": True,
"require_subtests": True,
},
"training": {
"model": "gpt2",
"model": "synthetic_1.5b",
"batch_size": 8,
"seq_length": 2048,
"num_steps": 50,
"warmup_steps": 5,
"dtype": "bf16",
"mode": "ddp",
"synthetic_params_b": 1.5,
"min_tokens_per_sec": 45000,
"max_step_jitter_pct": 3,
"max_peak_memory_gb": 70,
"require_distributed": True,
},
"report": {"output_dir": "./reports", "format": "json"},
"tools": {"install_dir": "/opt/gpu-test-tools"},
@ -131,7 +217,7 @@ def interactive_menu(config: dict):
if not check_prerequisites(console):
return
results_store: dict = {"timestamp": datetime.now().isoformat(), "tests": {}}
results_store: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname(), "tests": {}}
menu_items = [
("1", "GPU Information", "gpu_info"),
@ -139,10 +225,13 @@ def interactive_menu(config: dict):
("3", "Memory Benchmark (nvbandwidth)", "memory_bench"),
("4", "Compute Benchmark", "compute_bench"),
("5", "NCCL Multi-GPU Test", "nccl"),
("6", "GPU Stress Test (gpu-burn)", "stress"),
("6", "GPU Stress Test (PyTorch/gpu-burn)", "stress"),
("7", "RDMA/IB Test", "rdma"),
("8", "Training Simulation", "training"),
("9", "Full Test Suite (All Tests)", "all"),
("8", "NVLink/NVSwitch Test", "nvlink"),
("9", "DCGM Diagnostic", "dcgm"),
("10", "Training Simulation", "training"),
("11", "Multi-node NCCL Test", "multinode_nccl"),
("12", "Full Test Suite (All Tests)", "all"),
("0", "Generate Report", "report"),
]
@ -164,9 +253,12 @@ def interactive_menu(config: dict):
"memory_bench": "HBM bandwidth via nvbandwidth",
"compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8",
"nccl": "AllReduce, AllToAll, Broadcast via nccl-tests",
"stress": "Long-running GPU stress via gpu-burn",
"stress": "Long-running high-power GEMM stress with telemetry",
"rdma": "InfiniBand bandwidth & latency (ib_write_bw)",
"nvlink": "NVLink links, speed, and error counters",
"dcgm": "DCGM diag -r 3 production diagnostic",
"training": "Simulate LLM training with PyTorch",
"multinode_nccl": "Cross-node NCCL via mpirun/nccl-tests",
"all": "Run all tests sequentially",
"report": "Export results to JSON/HTML",
}
@ -257,12 +349,30 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
m.print_results(result)
return result
elif test_name == "nvlink":
m = NVLinkTest(config)
result = m.run()
m.print_results(result)
return result
elif test_name == "dcgm":
m = DCGMTest(config)
result = m.run()
m.print_results(result)
return result
elif test_name == "training":
m = TrainingSim(config)
result = m.run()
m.print_results(result)
return result
elif test_name == "multinode_nccl":
m = MultiNodeNCCLTest(config)
result = m.run()
m.print_results(result)
return result
elif test_name == "all":
return _run_full_suite(config, console)
@ -280,17 +390,21 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict:
def _run_full_suite(config: dict, console: Console) -> dict:
"""Run all tests sequentially."""
console.print(Panel("[bold cyan]Running Full Test Suite[/bold cyan]", box=box.DOUBLE))
all_results: dict = {"timestamp": datetime.now().isoformat()}
all_results: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname()}
tests = [
("gpu_info", "GPU Information", GPUInfo),
("health", "Health Check", HealthCheck),
("memory_bench", "Memory Benchmark", lambda c: Benchmark(c)),
("compute_bench", "Compute Benchmark", lambda c: Benchmark(c)),
("nvlink", "NVLink/NVSwitch Test", NVLinkTest),
("nccl", "NCCL Test", NCCLTest),
("stress", "GPU Stress Test", StressTest),
("rdma", "RDMA/IB Test", RDMATest),
("dcgm", "DCGM Diagnostic", DCGMTest),
("training", "Training Simulation", TrainingSim),
]
if (config.get("multinode_nccl", {}) or {}).get("enabled"):
tests.append(("multinode_nccl", "Multi-node NCCL Test", MultiNodeNCCLTest))
for i, (key, name, mod_cls) in enumerate(tests, 1):
console.print(f"\n[bold cyan][{i}/{len(tests)}] {name}[/bold cyan]")
@ -313,14 +427,49 @@ def _run_full_suite(config: dict, console: Console) -> dict:
# Summary
console.print("\n" + "=" * 60)
# Only count test results, exclude metadata like timestamp
test_results = {k: v for k, v in all_results.items() if k != "timestamp"}
passed = sum(1 for v in test_results.values() if not isinstance(v, dict) or "error" not in v)
test_results = {k: v for k, v in all_results.items() if k not in ("timestamp", "hostname")}
passed = sum(1 for v in test_results.values() if _test_result_passed(v))
total = len(test_results)
color = "green" if passed == total else ("yellow" if passed > 0 else "red")
console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]")
return all_results
def _test_result_passed(result) -> bool:
"""Strict production verdict helper for full-suite exit status."""
if not isinstance(result, dict):
return True
if result.get("error"):
return False
if result.get("skipped") or result.get("status") == "SKIP":
return False
if result.get("source") == "torchrun_fallback":
return False
if "passed" in result:
return bool(result.get("passed"))
if "memory" in result:
mem = result["memory"]
if isinstance(mem, dict) and "passed" in mem:
return bool(mem.get("passed"))
if mem.get("error") or mem.get("source") == "pytorch":
return False
eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0
return eff >= 80
if "compute" in result:
comp = result["compute"]
if isinstance(comp, dict) and "passed" in comp:
return bool(comp.get("passed"))
thresholds = comp.get("pass_thresholds_tflops", {}) or {}
per_dtype = comp.get("per_dtype_tflops", {})
for dt, threshold in thresholds.items():
val = per_dtype.get(dt)
if not isinstance(val, (int, float)) or val < threshold:
return False
consistency = comp.get("consistency", {})
return not any(not c.get("passed", False) for c in consistency.values())
return True
def main():
gpu_list_str = " / ".join(g.upper() for g in get_supported_gpus())
parser = argparse.ArgumentParser(
@ -335,15 +484,18 @@ Examples:
python gpu_tester.py --test benchmark --type memory
python gpu_tester.py --test benchmark --type compute --dtype fp16
python gpu_tester.py --test nccl # NCCL test
python gpu_tester.py --test multinode-nccl # Cross-node NCCL test
python gpu_tester.py --test nvlink # NVLink/NVSwitch test
python gpu_tester.py --test dcgm # DCGM diagnostic
python gpu_tester.py --test training # Training sim
python gpu_tester.py --test all # Full suite
python gpu_tester.py --report --format json --output report.json
""",
)
parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"],
parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "multinode-nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"],
help="Run a specific test")
parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)")
parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8"],
parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"],
help="Compute benchmark dtype (with --test benchmark --type compute)")
parser.add_argument("--interactive", action="store_true", help="Force interactive mode")
parser.add_argument("--report", action="store_true", help="Generate report from last results")
@ -397,8 +549,11 @@ Examples:
"health": "health",
"benchmark": None,
"nccl": "nccl",
"multinode-nccl": "multinode_nccl",
"stress": "stress",
"rdma": "rdma",
"nvlink": "nvlink",
"dcgm": "dcgm",
"training": "training",
"all": "all",
}
@ -415,19 +570,30 @@ Examples:
result = bench.run()
Benchmark.print_results(result)
if args.report:
ReportGenerator(config).generate({"benchmark": result, "timestamp": datetime.now().isoformat()},
ReportGenerator(config).generate({
"benchmark": result,
"timestamp": datetime.now().isoformat(),
"hostname": socket.gethostname(),
},
fmt=args.format, output=args.output)
sys.exit(0 if _test_result_passed(result) else 1)
elif args.test == "all":
results = _run_full_suite(config, console)
if args.report:
ReportGenerator(config).generate(results, fmt=args.format, output=args.output)
has_errors = any("error" in v for v in results.values() if isinstance(v, dict))
sys.exit(1 if has_errors else 0)
failed = any(not _test_result_passed(v) for k, v in results.items() if k not in ("timestamp", "hostname"))
sys.exit(1 if failed else 0)
else:
result = _run_test(test_map[args.test], config, console)
if args.report and result:
ReportGenerator(config).generate({args.test: result, "timestamp": datetime.now().isoformat()},
report_key = test_map[args.test] or args.test
ReportGenerator(config).generate({
report_key: result,
"timestamp": datetime.now().isoformat(),
"hostname": socket.gethostname(),
},
fmt=args.format, output=args.output)
sys.exit(0 if _test_result_passed(result) else 1)
if __name__ == "__main__":

231
modules/dcgm_test.py Normal file
View File

@ -0,0 +1,231 @@
"""DCGM diagnostic acceptance wrapper."""
import json
import os
import re
import shutil
import signal
import subprocess
from datetime import datetime
from typing import Optional
from rich.console import Console
from rich.table import Table
class DCGMTest:
    """Run ``dcgmi diag`` and reduce its output to a strict pass/fail verdict.

    Settings come from the ``dcgm`` section of the config dict:
    ``diag_level`` (default 3), ``timeout_sec`` (default 1200),
    ``expected_num_gpus``, ``json_output`` (default True) and
    ``require_subtests`` (default True).
    """

    def __init__(self, config: dict):
        self.config = config
        self.console = Console()
        # ``or {}`` keeps a ``dcgm: null`` section from crashing later lookups.
        self.cfg = config.get("dcgm") or {}

    def run(self) -> dict:
        """Execute the diagnostic and return a result dict.

        Returns:
            A dict that always carries ``passed`` and ``timestamp``. On failure
            to find ``dcgmi`` or on timeout it also carries ``error``; on a
            completed run it carries the return code, the command, the parsed
            ``subtests`` and the tail of the raw output.
        """
        dcgmi = shutil.which("dcgmi")
        if not dcgmi:
            return {
                "passed": False,
                "error": "dcgmi not found",
                "timestamp": datetime.now().isoformat(),
            }

        level = str(self.cfg.get("diag_level", 3))
        timeout = int(self.cfg.get("timeout_sec", 1200))
        cmd = [dcgmi, "diag", "-r", level]
        expected_gpus = self.cfg.get("expected_num_gpus")
        if expected_gpus:
            cmd.extend(["-n", f"gpu:{int(expected_gpus)}"])
        if self.cfg.get("json_output", True):
            cmd.append("-j")

        try:
            r = self._run_with_process_group_timeout(cmd, timeout)
        except subprocess.TimeoutExpired as e:
            output = ((e.output or "") + "\n" + (e.stderr or "")).strip()
            return {
                "passed": False,
                "error": f"dcgmi diag -r {level} timeout after {timeout}s",
                "command": cmd,
                "raw_output_tail": output[-8000:],
                "timestamp": datetime.now().isoformat(),
            }

        output = r.stdout + "\n" + r.stderr
        # Prefer structured JSON; fall back to scraping the text table.
        subtests = self._parse_json_output(output) or self._parse_output(output)
        # Strict verdict: anything other than PASS (WARN, SKIP, ...) fails.
        strict_statuses = {"PASS"}
        failed = [s for s in subtests if s["status"] not in strict_statuses]
        require_subtests = bool(self.cfg.get("require_subtests", True))
        passed = r.returncode == 0 and not failed and (bool(subtests) or not require_subtests)
        return {
            "passed": passed,
            "returncode": r.returncode,
            # A non-numeric diag_level is reported verbatim instead of raising
            # after the (long) diagnostic has already completed.
            "level": int(level) if level.isdigit() else level,
            "command": cmd,
            "expected_num_gpus": int(expected_gpus) if expected_gpus else None,
            "subtests": subtests,
            "raw_output_tail": output[-8000:],
            "timestamp": datetime.now().isoformat(),
        }

    @staticmethod
    def _run_with_process_group_timeout(cmd: list[str], timeout: int) -> subprocess.CompletedProcess:
        """Run ``cmd`` in its own session and kill the whole group on timeout.

        Args:
            cmd: Argument vector to execute.
            timeout: Seconds to wait before terminating the process group.

        Returns:
            A ``CompletedProcess`` with text ``stdout``/``stderr``.

        Raises:
            subprocess.TimeoutExpired: When ``timeout`` elapses. ``output`` and
                ``stderr`` hold whatever text could be drained after the group
                was signalled (empty strings if nothing could be drained).
        """
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            start_new_session=True,
        )
        try:
            stdout, stderr = proc.communicate(timeout=timeout)
        except subprocess.TimeoutExpired as e:
            stdout, stderr = "", ""
            # Escalate SIGTERM -> SIGKILL, giving each signal 10s to take effect.
            for sig in (signal.SIGTERM, signal.SIGKILL):
                try:
                    os.killpg(proc.pid, sig)
                except ProcessLookupError:
                    # The group exited between the timeout and the signal.
                    pass
                try:
                    stdout, stderr = proc.communicate(timeout=10)
                    break
                except subprocess.TimeoutExpired:
                    continue
            raise subprocess.TimeoutExpired(cmd, timeout, output=stdout, stderr=stderr) from e
        return subprocess.CompletedProcess(cmd, proc.returncode, stdout, stderr)

    @classmethod
    def _parse_json_output(cls, output: str) -> list[dict]:
        """Extract subtest statuses from JSON output.

        The whole text is tried as JSON first; if that fails, the outermost
        ``{...}`` span is tried. A ``"DCGM Diagnostic"`` payload is parsed with
        the dedicated parser; otherwise the document is walked generically for
        nodes carrying a ``status``/``result``/``Result`` string.

        Returns:
            A list of ``{"name", "status", "raw"}`` dicts; empty when no JSON
            could be decoded or no recognised status was found.
        """
        text = output.strip()
        if not text:
            return []
        try:
            payload = json.loads(text)
        except json.JSONDecodeError:
            m = re.search(r"(\{.*\})", text, re.S)
            if not m:
                return []
            try:
                payload = json.loads(m.group(1))
            except json.JSONDecodeError:
                return []

        dcgm_payload = payload.get("DCGM Diagnostic") if isinstance(payload, dict) else None
        if isinstance(dcgm_payload, dict):
            parsed = cls._parse_dcgm_diagnostic_json(dcgm_payload)
            if parsed:
                return parsed

        subtests = []

        def walk(node, path: list[str]):
            if isinstance(node, dict):
                node_name = (
                    node.get("name")
                    or node.get("testName")
                    or node.get("test_name")
                    or node.get("category")
                    or node.get("category_name")
                )
                child_path = [*path, str(node_name)] if node_name else path
                status = node.get("status") or node.get("result") or node.get("Result")
                if isinstance(status, str):
                    # Unnamed nodes are labelled with the last three path parts.
                    name = node_name or " / ".join(path[-3:])
                    normalized = cls._normalize_status(status)
                    if normalized:
                        subtests.append({
                            "name": str(name)[:160],
                            "status": normalized,
                            "raw": json.dumps(node, default=str)[:1000],
                        })
                for key, value in node.items():
                    walk(value, [*child_path, str(key)])
            elif isinstance(node, list):
                for idx, item in enumerate(node):
                    walk(item, [*path, str(idx)])

        walk(payload, [])
        return subtests

    @classmethod
    def _parse_dcgm_diagnostic_json(cls, payload: dict) -> list[dict]:
        """Parse the ``test_categories`` -> ``tests`` -> ``results`` layout.

        Emits one entry per result with a recognised status, plus one
        ``.../summary`` entry per test whose ``test_summary`` has one.
        """
        subtests = []
        for category in payload.get("test_categories", []) or []:
            category_name = str(category.get("category") or "DCGM")
            for test in category.get("tests", []) or []:
                test_name = str(test.get("name") or "unnamed")
                for result in test.get("results", []) or []:
                    status = cls._normalize_status(str(result.get("status", "")))
                    if not status:
                        continue
                    entity_group = result.get("entity_group") or "entity"
                    entity_id = result.get("entity_id", "unknown")
                    name = f"{category_name}/{test_name}/{entity_group}{entity_id}"
                    subtests.append({
                        "name": name[:160],
                        "status": status,
                        "raw": json.dumps(result, default=str)[:1000],
                    })
                summary = test.get("test_summary") or {}
                status = cls._normalize_status(str(summary.get("status", "")))
                if status:
                    subtests.append({
                        "name": f"{category_name}/{test_name}/summary"[:160],
                        "status": status,
                        "raw": json.dumps(summary, default=str)[:1000],
                    })
        return subtests

    @staticmethod
    def _normalize_status(status: str) -> str:
        """Map a raw status word to PASS/FAIL/ERROR/WARN/SKIP, or "" if unknown."""
        s = status.strip().upper()
        aliases = {
            "PASS": "PASS",
            "PASSED": "PASS",
            "OK": "PASS",
            "FAIL": "FAIL",
            "FAILED": "FAIL",
            "ERROR": "ERROR",
            "WARN": "WARN",
            "WARNING": "WARN",
            "SKIP": "SKIP",
            "SKIPPED": "SKIP",
            "NOT_RUN": "SKIP",
            "NOT RUN": "SKIP",
        }
        return aliases.get(s, s if s in {"PASS", "FAIL", "ERROR", "WARN", "SKIP"} else "")

    @staticmethod
    def _parse_output(output: str) -> list[dict]:
        """Scrape ``name : STATUS`` or ``STATUS - name`` pairs from text output.

        Lines matching neither shape are ignored, as are matches whose name is
        empty or 160+ characters long.
        """
        subtests = []
        for line in output.splitlines():
            stripped = line.strip()
            if not stripped:
                continue
            m = re.search(r"(.+?)\s*[:|]\s*(PASS|FAIL|WARN|ERROR|SKIP)\b", stripped, re.I)
            if m:
                name = m.group(1).strip(" .|-")
                status = DCGMTest._normalize_status(m.group(2))
            else:
                m = re.search(r"\b(PASS|FAIL|WARN|ERROR|SKIP)\b\s*[-:|]\s*(.+)", stripped, re.I)
                if not m:
                    continue
                status = DCGMTest._normalize_status(m.group(1))
                name = m.group(2).strip()
            if name and len(name) < 160:
                subtests.append({"name": name, "status": status, "raw": stripped})
        return subtests

    @staticmethod
    def print_results(results: dict, console: "Optional[Console]" = None):
        """Print the verdict and, when available, a table of subtest statuses."""
        c = console or Console()
        if results.get("error"):
            c.print(f"[bold red]DCGM error: {results['error']}[/bold red]")
            return
        passed = results.get("passed", False)
        c.print("[bold green]✓ DCGM diag PASSED[/bold green]" if passed else "[bold red]✗ DCGM diag FAILED[/bold red]")
        subtests = results.get("subtests", [])
        if subtests:
            table = Table(box=None, padding=(0, 1))
            table.add_column("Subtest")
            table.add_column("Status", style="bold")
            for s in subtests:
                table.add_row(s.get("name", ""), s.get("status", ""))
            c.print(table)

View File

@ -171,6 +171,10 @@ class HealthCheck:
gpu_health.append({"index": i, "status": worst, "checks": checks})
system_health = self._check_system()
for key in ("fabricmanager", "retired_pages", "kernel_errors"):
item = system_health.get(key, {})
if isinstance(item, dict) and item.get("status") == "FAIL":
overall_pass = False
return {
"passed": overall_pass,
@ -228,6 +232,9 @@ class HealthCheck:
rdma_devs = os.listdir("/sys/class/infiniband_verbs")
nccl_env = {k: v for k, v in os.environ.items() if k.startswith("NCCL_")}
fabric = self._check_fabricmanager()
retired = self._check_retired_pages()
kernel_errors = self._check_kernel_errors()
return {
"nvidia_persistenced": {"installed": persistd, "running": persistd_running},
@ -238,6 +245,41 @@ class HealthCheck:
"infiniband_devices": ib_devs,
"rdma_devices": rdma_devs,
"nccl_env_vars": nccl_env,
"fabricmanager": fabric,
"retired_pages": retired,
"kernel_errors": kernel_errors,
}
def _check_fabricmanager(self) -> dict:
r = self._run_cmd(["systemctl", "is-active", "nvidia-fabricmanager"], timeout=5)
active = r == "active"
logs = self._run_cmd(["journalctl", "-u", "nvidia-fabricmanager", "-n", "200", "--no-pager"], timeout=10) or ""
has_error = "ERROR" in logs.upper() or "FAILED" in logs.upper()
return {
"active": active,
"has_error_logs": has_error,
"status": "PASS" if active and not has_error else "FAIL",
}
def _check_retired_pages(self) -> dict:
raw = self._run_cmd(["nvidia-smi", "-q", "-d", "PAGE_RETIREMENT"], timeout=30) or ""
nums = [int(x) for x in __import__("re").findall(r"Retired Pages.*?:\s*(\d+)", raw, flags=__import__("re").I)]
pending = "Pending Page Blacklist" in raw and "Yes" in raw
total = sum(nums)
return {
"retired_pages": total,
"pending_blacklist": pending,
"status": "PASS" if total == 0 and not pending else "FAIL",
}
def _check_kernel_errors(self) -> dict:
raw = self._run_cmd(["dmesg", "--ctime", "--level=err,crit,alert,emerg"], timeout=10) or ""
upper = raw.upper()
hits = [line for line in raw.splitlines() if any(k in line.upper() for k in ("XID", "AER", "PCIE", "NVRM"))]
return {
"count": len(hits),
"tail": hits[-20:],
"status": "PASS" if not hits else "FAIL",
}
@staticmethod

View File

@ -5,6 +5,8 @@ import os
import re
import shutil
import subprocess
import statistics
import sys
from datetime import datetime
from typing import Optional
@ -70,6 +72,38 @@ class NCCLTest:
return p
return None
def _message_sizes(self) -> list[str]:
return list(self.nccl_cfg.get("message_sizes") or ["1M", "256M", "2G"])
def _repeats(self) -> int:
return int(self.nccl_cfg.get("repeats", 3))
def _max_stddev_pct(self) -> float:
return float(self.nccl_cfg.get("max_stddev_pct", 3))
def _runtime_env(self) -> dict:
env = {**os.environ, "NCCL_DEBUG": "WARN"}
lib_dirs = []
nccl_home = env.get("NCCL_HOME") or self.nccl_cfg.get("nccl_home")
if nccl_home:
lib_dirs.append(os.path.join(str(nccl_home), "lib"))
for path in sys.path:
lib_dirs.append(os.path.join(path, "nvidia", "nccl", "lib"))
venv_root = os.path.dirname(os.path.dirname(sys.executable))
lib_dirs.extend(glob.glob(os.path.join(venv_root, "lib", "python*", "site-packages", "nvidia", "nccl", "lib")))
existing = env.get("LD_LIBRARY_PATH", "")
valid_dirs = []
for d in lib_dirs:
if d and os.path.isdir(d) and d not in valid_dirs:
valid_dirs.append(d)
if valid_dirs:
env["LD_LIBRARY_PATH"] = ":".join(valid_dirs + ([existing] if existing else []))
return env
def run(self) -> dict:
gpu_count = 0
if TORCH_AVAILABLE:
@ -89,7 +123,7 @@ class NCCLTest:
if self.nccl_cfg.get("test_reduce_scatter", False):
tests.append(("reduce_scatter_perf", "ReduceScatter"))
if self.nccl_cfg.get("test_allgather", False):
tests.append(("allgather_perf", "AllGather"))
tests.append(("all_gather_perf", "AllGather"))
if self.nccl_cfg.get("test_sendrecv", False):
tests.append(("sendrecv_perf", "SendRecv"))
@ -170,39 +204,7 @@ class NCCLTest:
if not binary:
return {"status": "SKIP", "error": f"{binary_name} not found"}
cmd = [
binary,
"-b", "8M",
"-e", "8G",
"-f", "2",
"-g", str(gpu_count),
"-w", "5",
"-n", "20",
]
try:
env = os.environ.copy()
env["NCCL_DEBUG"] = "WARN"
r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env)
combined = r.stdout + r.stderr
# Check for NCCL/CUDA compatibility errors
if "CUDA driver version is insufficient" in combined or \
"Test NCCL failure" in combined:
error_msg = "NCCL/CUDA driver version mismatch" \
if "CUDA driver version" in combined \
else "NCCL test failure (library incompatibility)"
return {"status": "FAIL", "error": error_msg}
if r.returncode != 0:
return {"status": "FAIL", "error": r.stderr[:300]}
return self._parse_nccl_output(r.stdout, min_bw)
except subprocess.TimeoutExpired:
return {"status": "FAIL", "error": "timeout"}
except Exception as e:
return {"status": "FAIL", "error": str(e)}
return self._run_nccl_matrix([binary, "-g", str(gpu_count)], min_bw)
def _run_one_nccl_test_mpirun(self, binary_name: str, label: str,
gpu_count: int, mpirun: str, min_bw: float) -> dict:
@ -218,37 +220,64 @@ class NCCLTest:
"-x", "NCCL_DEBUG=WARN",
"-x", "CUDA_VISIBLE_DEVICES=" + ",".join(str(i) for i in range(gpu_count)),
binary,
"-b", "8",
"-e", "256M",
"-f", "2",
"-g", "1",
"-w", "5",
"-n", "20",
]
return self._run_nccl_matrix(cmd, min_bw)
def _run_nccl_matrix(self, base_cmd: list[str], min_bw: float) -> dict:
size_results = []
failures = []
env = self._runtime_env()
try:
env = os.environ.copy()
env["NCCL_DEBUG"] = "WARN"
r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env)
combined = r.stdout + r.stderr
if "CUDA driver version is insufficient" in combined or \
"Test NCCL failure" in combined:
error_msg = "NCCL/CUDA driver version mismatch" \
if "CUDA driver version" in combined \
else "NCCL test failure (library incompatibility)"
return {"status": "FAIL", "error": error_msg}
if r.returncode != 0:
return {"status": "FAIL", "error": r.stderr[:300]}
return self._parse_nccl_output(r.stdout, min_bw)
for size in self._message_sizes():
runs = []
for _ in range(self._repeats()):
cmd = [*base_cmd, "-b", size, "-e", size, "-f", "2", "-w", "5", "-n", "20"]
r = subprocess.run(cmd, capture_output=True, text=True, timeout=300, env=env)
combined = r.stdout + r.stderr
if "CUDA driver version is insufficient" in combined or "Test NCCL failure" in combined:
failures.append({"size": size, "error": "NCCL/CUDA/library failure"})
continue
if r.returncode != 0:
failures.append({"size": size, "error": r.stderr[:300]})
continue
parsed = self._parse_nccl_output(r.stdout, min_bw)
runs.append(parsed.get("best_busbw_gbps", 0))
if runs:
worst = min(runs)
mean = sum(runs) / len(runs)
std_pct = (statistics.pstdev(runs) / mean * 100) if len(runs) > 1 and mean else 0
size_results.append({
"size": size,
"runs_busbw_gbps": [round(v, 1) for v in runs],
"worst_busbw_gbps": round(worst, 1),
"mean_busbw_gbps": round(mean, 1),
"stddev_pct": round(std_pct, 2),
"status": "PASS" if worst >= min_bw and std_pct <= self._max_stddev_pct() else "FAIL",
})
else:
size_results.append({"size": size, "status": "FAIL", "runs_busbw_gbps": []})
except subprocess.TimeoutExpired:
return {"status": "FAIL", "error": "timeout"}
except Exception as e:
return {"status": "FAIL", "error": str(e)}
best_bus = max((r.get("mean_busbw_gbps", 0) for r in size_results), default=0)
worst_bus = min((r.get("worst_busbw_gbps", 0) for r in size_results if r.get("runs_busbw_gbps")), default=0)
passed = bool(size_results) and all(r.get("status") == "PASS" for r in size_results) and not failures
return {
"status": "PASS" if passed else "FAIL",
"best_busbw_gbps": round(best_bus, 1),
"worst_busbw_gbps": round(worst_bus, 1),
"min_required_gbps": min_bw,
"max_stddev_pct": self._max_stddev_pct(),
"by_size": size_results,
"failures": failures,
}
@staticmethod
def _parse_nccl_output(stdout: str, min_bw: float) -> dict:
"""Parse nccl-tests tabular output and extract bandwidth results."""
@ -363,7 +392,7 @@ dist.destroy_process_group()
r = subprocess.run(
[torchrun_cmd, f"--nproc_per_node={gpu_count}", tmp.name],
capture_output=True, text=True, timeout=120,
env={**os.environ, "NCCL_DEBUG": "WARN"},
env=self._runtime_env(),
)
os.unlink(tmp.name)
@ -390,10 +419,15 @@ dist.destroy_process_group()
}
return {
"passed": all_passed,
# torchrun fallback is a functional smoke only. It never proves
# production bus bandwidth, so it must not satisfy acceptance.
"passed": False,
"functional_passed": all_passed,
"source": "torchrun_fallback",
"tests": tests,
"gpu_count": gpu_count,
"error": None if all_passed else "torchrun functional NCCL smoke failed",
"acceptance_gap": "nccl-tests bus bandwidth was not measured",
}
except Exception as e:
return {"passed": False, "source": "torchrun_fallback", "error": str(e)}
@ -410,7 +444,8 @@ dist.destroy_process_group()
if source == "torchrun_fallback":
# Connectivity check mode
verdict = "[bold green]✓ NCCL Connectivity OK[/bold green]" if passed else "[bold red]✗ NCCL Connectivity FAILED[/bold red]"
functional = results.get("functional_passed", passed)
verdict = "[bold yellow]⚠ NCCL bus BW NOT VERIFIED[/bold yellow]" if functional else "[bold red]✗ NCCL Connectivity FAILED[/bold red]"
c.print(f"{verdict} [dim](basic check via torchrun)[/dim]")
tests = results.get("tests", {})
@ -427,7 +462,7 @@ dist.destroy_process_group()
else:
c.print(f" [{s_color}]{op_name}[/{s_color}]")
c.print("\n[yellow]Note: functional connectivity test only (no performance data)[/yellow]")
c.print("\n[yellow]Note: functional connectivity test only (no bus bandwidth data; acceptance FAIL)[/yellow]")
else:
# nccl-tests mode
verdict = "[bold green]✓ NCCL tests PASSED[/bold green]" if passed else "[bold yellow]⚠ NCCL tests WARNING[/bold yellow]"
@ -448,12 +483,16 @@ dist.destroy_process_group()
if by_size:
t = Table(box=None, padding=(0, 1))
t.add_column("Size", style="bold", justify="right")
t.add_column("Time (us)", justify="right")
t.add_column("Alg BW (GB/s)", justify="right")
t.add_column("Bus BW (GB/s)", justify="right")
t.add_column("Worst Bus BW", justify="right")
t.add_column("Mean Bus BW", justify="right")
t.add_column("StdDev", justify="right")
t.add_column("Status", justify="right")
for r in by_size:
sz = r.get("size", 0)
sz_str = f"{sz/1024:.0f}K" if sz < 1048576 else f"{sz/1048576:.0f}M"
t.add_row(sz_str, f"{r.get('time_us',0):.1f}",
f"{r.get('algbw_gbps',0):.1f}", f"{r.get('busbw_gbps',0):.1f}")
t.add_row(
str(r.get("size", "")),
f"{r.get('worst_busbw_gbps', 0):.1f}",
f"{r.get('mean_busbw_gbps', 0):.1f}",
f"{r.get('stddev_pct', 0):.2f}%",
r.get("status", "?"),
)
c.print(t)

188
modules/nvlink_test.py Normal file
View File

@ -0,0 +1,188 @@
"""NVLink / NVSwitch production acceptance checks."""
import re
import shutil
import subprocess
from datetime import datetime
from typing import Optional
from rich.console import Console
from rich.table import Table
class NVLinkTest:
    """NVLink acceptance check built on ``nvidia-smi nvlink`` output.

    Reads its settings from the ``nvlink`` section of the config:
    ``expected_links_per_gpu`` (default 18), ``expected_link_speed_gbps``
    (default 25) and ``require_zero_errors`` (default True).
    """

    def __init__(self, config: dict):
        self.config = config
        self.console = Console()
        self.cfg = config.get("nvlink", {})

    def _run(self, args: list[str], timeout: int = 60) -> tuple[int, str, str]:
        """Run ``nvidia-smi <args>`` and return ``(returncode, stdout, stderr)``.

        Returns code 127 when nvidia-smi is not on PATH and 124 when the
        command exceeds ``timeout`` seconds, so callers always get a tuple
        instead of an exception for these two cases.
        """
        if not shutil.which("nvidia-smi"):
            return 127, "", "nvidia-smi not found"
        try:
            r = subprocess.run(["nvidia-smi", *args], capture_output=True, text=True, timeout=timeout)
        except subprocess.TimeoutExpired:
            # A hung nvidia-smi must produce a failed result, not crash run().
            return 124, "", f"nvidia-smi {' '.join(args)} timed out after {timeout}s"
        return r.returncode, r.stdout, r.stderr

    def run(self) -> dict:
        """Evaluate link count, link speed and error counters for every GPU.

        Returns:
            dict with ``passed`` and ``timestamp``. On an early failure
            (``nvlink -s`` failed or produced no parsable links) it carries
            ``error``; otherwise it carries the thresholds used, a per-GPU
            ``gpus`` list and the tails of the raw command outputs.
        """
        expected_links = int(self.cfg.get("expected_links_per_gpu", 18))
        expected_speed = float(self.cfg.get("expected_link_speed_gbps", 25))
        require_zero_errors = bool(self.cfg.get("require_zero_errors", True))

        rc_s, out_s, err_s = self._run(["nvlink", "-s"])
        rc_c, out_c, err_c = self._run(["nvlink", "-c"])
        rc_e, out_e, err_e = self._run(["nvlink", "-e"])

        if rc_s != 0:
            return {
                "passed": False,
                "error": (err_s or out_s or "nvidia-smi nvlink -s failed")[:1000],
                "timestamp": datetime.now().isoformat(),
            }

        links = self._parse_status(out_s)
        if not links:
            return {
                "passed": False,
                "error": "no NVLink status entries parsed from nvidia-smi nvlink -s",
                "raw_status": out_s[-4000:],
                "timestamp": datetime.now().isoformat(),
            }

        # Speeds parsed from `-c` take precedence; speeds found in the status
        # output only fill in links that `-c` did not report.
        speeds = self._parse_speeds(out_c) if rc_c == 0 else {}
        status_speeds = self._parse_speeds(out_s)
        for gpu, gpu_speeds in status_speeds.items():
            speeds.setdefault(gpu, {}).update({k: v for k, v in gpu_speeds.items() if k not in speeds.get(gpu, {})})
        errors = self._parse_errors(out_e) if rc_e == 0 else {}

        gpu_results = []
        overall = True
        for gpu, gpu_links in sorted(links.items(), key=lambda x: int(x[0])):
            active = sum(1 for l in gpu_links.values() if l.get("active"))
            inactive = [lid for lid, l in gpu_links.items() if not l.get("active")]
            speed_bad = []
            for lid in gpu_links:
                speed = speeds.get(gpu, {}).get(lid)
                # Links with no reported speed are not counted as slow.
                if speed is not None and speed < expected_speed:
                    speed_bad.append({"link": lid, "speed_gbps": speed})
            err_bad = []
            if require_zero_errors:
                for lid, counters in errors.get(gpu, {}).items():
                    total = sum(v for v in counters.values() if isinstance(v, int))
                    if total:
                        err_bad.append({"link": lid, "counters": counters})
            passed = active == expected_links and not inactive and not speed_bad and not err_bad
            if not passed:
                overall = False
            gpu_results.append({
                "gpu": int(gpu),
                "active_links": active,
                "expected_links": expected_links,
                "inactive_links": inactive,
                "speed_issues": speed_bad,
                "error_issues": err_bad,
                "passed": passed,
            })

        return {
            "passed": overall,
            "expected_links_per_gpu": expected_links,
            "expected_link_speed_gbps": expected_speed,
            "require_zero_errors": require_zero_errors,
            "gpus": gpu_results,
            "raw_status": out_s[-4000:],
            "raw_speed": out_c[-4000:] if out_c else "",
            "raw_errors": out_e[-4000:] if out_e else "",
            "timestamp": datetime.now().isoformat(),
        }

    @staticmethod
    def _parse_status(text: str) -> dict[str, dict[str, dict]]:
        """Parse link state per GPU into ``{gpu: {link: {state, active, raw}}}``.

        A line containing "GPU <n>" starts a new GPU section. A "Link <n>"
        line with an explicit state word is recorded with that state; a
        "Link <n>" line that only reports a "GB/s" figure is recorded as
        active.
        """
        result: dict[str, dict[str, dict]] = {}
        gpu = None
        for line in text.splitlines():
            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
            if m_gpu:
                gpu = m_gpu.group(1)
                result.setdefault(gpu, {})
                continue
            if gpu is None:
                continue
            m_link = re.search(r"Link\s+(\d+).*?(Active|Inactive|Disabled|Off|Down)", line, re.I)
            if m_link:
                state = m_link.group(2)
                result[gpu][m_link.group(1)] = {
                    "state": state,
                    "active": state.lower() == "active",
                    "raw": line.strip(),
                }
                continue
            m_speed = re.search(r"Link\s+(\d+).*?([0-9.]+)\s*GB/s", line, re.I)
            if m_speed:
                result[gpu][m_speed.group(1)] = {
                    "state": "Active",
                    "active": True,
                    "raw": line.strip(),
                }
        return result

    @staticmethod
    def _parse_speeds(text: str) -> dict[str, dict[str, float]]:
        """Parse "Link <n> ... <x> GB/s" lines into ``{gpu: {link: speed}}``."""
        result: dict[str, dict[str, float]] = {}
        gpu = None
        for line in text.splitlines():
            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
            if m_gpu:
                gpu = m_gpu.group(1)
                result.setdefault(gpu, {})
                continue
            if gpu is None:
                continue
            m_link = re.search(r"Link\s+(\d+).*?([0-9.]+)\s*GB/s", line, re.I)
            if m_link:
                result[gpu][m_link.group(1)] = float(m_link.group(2))
        return result

    @staticmethod
    def _parse_errors(text: str) -> dict[str, dict[str, dict[str, int]]]:
        """Parse CRC / Replay / Recovery counters into ``{gpu: {link: {name: n}}}``.

        Counter names are stored lower-cased. Counter lines that appear
        before any "Link <n>" line of the current GPU are ignored.
        """
        result: dict[str, dict[str, dict[str, int]]] = {}
        gpu = None
        link = None
        for line in text.splitlines():
            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
            if m_gpu:
                gpu = m_gpu.group(1)
                result.setdefault(gpu, {})
                # Forget the previous GPU's link: it has no entry under the new
                # GPU, so reusing it would raise KeyError on a counter line.
                link = None
                continue
            m_link = re.search(r"Link\s+(\d+)", line, re.I)
            if m_link and gpu is not None:
                link = m_link.group(1)
                result[gpu].setdefault(link, {})
            if gpu is None or link is None:
                continue
            for name in ("CRC", "Replay", "Recovery"):
                m = re.search(rf"{name}[^0-9]*(\d+)", line, re.I)
                if m:
                    result[gpu][link][name.lower()] = int(m.group(1))
        return result

    @staticmethod
    def print_results(results: dict, console: Optional[Console] = None):
        """Print the verdict and a per-GPU table of active links and issues.

        Only the error line is printed when ``results["error"]`` is truthy.
        """
        c = console or Console()
        if results.get("error"):
            c.print(f"[bold red]NVLink error: {results['error']}[/bold red]")
            return
        passed = results.get("passed", False)
        c.print("[bold green]✓ NVLink PASSED[/bold green]" if passed else "[bold red]✗ NVLink FAILED[/bold red]")
        table = Table(box=None, padding=(0, 1))
        table.add_column("GPU", style="bold")
        table.add_column("Active Links", justify="right")
        table.add_column("Issues")
        for g in results.get("gpus", []):
            issues = []
            if g.get("inactive_links"):
                issues.append("inactive=" + ",".join(g["inactive_links"]))
            if g.get("speed_issues"):
                issues.append(f"speed={len(g['speed_issues'])}")
            if g.get("error_issues"):
                issues.append(f"errors={len(g['error_issues'])}")
            table.add_row(str(g["gpu"]), f"{g['active_links']}/{g['expected_links']}", "; ".join(issues) or "OK")
        c.print(table)

View File

@ -93,8 +93,8 @@ class ReportGenerator:
def _generate_html(self, results: dict, output: str) -> str:
import socket
hostname = socket.gethostname()
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
hostname = results.get("hostname") or socket.gethostname()
timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
sections = []
@ -178,8 +178,8 @@ class ReportGenerator:
def _generate_markdown(self, results: dict, output: str) -> str:
import socket
hostname = socket.gethostname()
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
hostname = results.get("hostname") or socket.gethostname()
timestamp = results.get("timestamp") or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
lines: list[str] = []
@ -201,6 +201,21 @@ class ReportGenerator:
# --- Summary table ---
summary_items = self._build_summary(results)
if summary_items:
verdict, failures, missing = self._overall_acceptance_verdict(summary_items)
lines.append("## Overall Acceptance Verdict\n")
lines.append(f"**Result: {verdict}**")
lines.append("")
if failures:
lines.append("Failed or unverified items:")
for name, status in failures:
lines.append(f"- {name}: {status}")
lines.append("")
if missing:
lines.append("Missing required evidence:")
for name in missing:
lines.append(f"- {name}")
lines.append("")
lines.append("## Summary\n")
lines.append("| Test | Result |")
lines.append("|------|--------|")
@ -319,8 +334,6 @@ class ReportGenerator:
if use_abs and thr:
if val >= thr:
status = "PASS"
elif val >= thr * 0.9:
status = "WARN"
else:
status = "FAIL"
lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |")
@ -331,33 +344,194 @@ class ReportGenerator:
overall_status = status
lines.append("")
if use_abs:
if any(not row.get("passed", False) for row in (comp_data.get("consistency", {}) or {}).values()):
overall_status = "FAIL"
lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n")
else:
overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL")
lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n")
consistency = comp_data.get("consistency", {}) or {}
if consistency:
lines.append("### Compute Consistency\n")
lines.append("| DType | Min | Mean | Max | Spread | Limit | Status |")
lines.append("|-------|-----|------|-----|--------|-------|--------|")
for dt, row in consistency.items():
status = "PASS" if row.get("passed") else "FAIL"
lines.append(
f"| {dt.upper()} | {row.get('min_tflops', 0):.1f} | "
f"{row.get('mean_tflops', 0):.1f} | {row.get('max_tflops', 0):.1f} | "
f"{row.get('spread_pct', 0):.2f}% | <= {row.get('max_allowed_pct', 3)}% | {status} |"
)
lines.append("")
per_gpu = comp_data.get("per_gpu", []) or []
dtype_order = [dt for dt in per_dtype.keys() if not isinstance(per_dtype.get(dt), str)]
if per_gpu and dtype_order:
lines.append("### Compute Per-GPU TFLOPS\n")
headers = ["GPU", *[dt.upper() for dt in dtype_order]]
lines.append("| " + " | ".join(headers) + " |")
lines.append("|" + "|".join(["---"] * len(headers)) + "|")
for row in per_gpu:
cells = [str(row.get("index", ""))]
for dt in dtype_order:
val = row.get(dt, "")
cells.append(f"{val:.1f}" if isinstance(val, (int, float)) else str(val))
lines.append("| " + " | ".join(cells) + " |")
lines.append("")
# --- NCCL ---
nvlink = results.get("nvlink")
if nvlink and not nvlink.get("error"):
lines.append("## NVLink/NVSwitch\n")
lines.append(f"**Overall: {'PASS' if nvlink.get('passed') else 'FAIL'}**\n")
lines.append("| GPU | Active Links | Issues |")
lines.append("|-----|--------------|--------|")
for g in nvlink.get("gpus", []):
issues = []
if g.get("inactive_links"):
issues.append("inactive=" + ",".join(g["inactive_links"]))
if g.get("speed_issues"):
issues.append(f"speed issues={len(g['speed_issues'])}")
if g.get("error_issues"):
issues.append(f"errors={len(g['error_issues'])}")
lines.append(f"| {g.get('gpu')} | {g.get('active_links')}/{g.get('expected_links')} | {', '.join(issues) or 'OK'} |")
lines.append("")
elif nvlink and nvlink.get("error"):
lines.append("## NVLink/NVSwitch\n")
lines.append(f"**Overall: FAIL** ({nvlink.get('error')})\n")
dcgm = results.get("dcgm")
if dcgm and not dcgm.get("error"):
lines.append("## DCGM Diagnostic\n")
lines.append(f"**Overall: {'PASS' if dcgm.get('passed') else 'FAIL'}**\n")
if dcgm.get("subtests"):
lines.append("| Subtest | Status |")
lines.append("|---------|--------|")
for s in dcgm.get("subtests", []):
lines.append(f"| {s.get('name', '')} | {s.get('status', '')} |")
lines.append("")
elif dcgm and dcgm.get("error"):
lines.append("## DCGM Diagnostic\n")
lines.append(f"**Overall: FAIL** ({dcgm.get('error')})\n")
# --- NCCL ---
nccl = results.get("nccl")
if nccl and not nccl.get("error"):
lines.append("## NCCL Multi-GPU\n")
lines.append(f"Source: {nccl.get('source', 'unknown')} | "
f"GPUs: {nccl.get('gpu_count', '?')}\n")
if nccl.get("source") == "torchrun_fallback":
lines.append("> Functional NCCL smoke only: nccl-tests bus bandwidth was not measured, so this does not satisfy production acceptance.\n")
tests = nccl.get("tests", {})
if tests:
lines.append("| Operation | Bus BW (GB/s) | Threshold | Status |")
lines.append("|-----------|---------------|-----------|--------|")
lines.append("> Summary reports the best Bus BW observed for each operation. PASS/FAIL is evaluated across every tested message size and repeat run shown in the detail table below.\n")
lines.append("| Operation | Best Bus BW (GB/s) | Failed Sizes | Threshold | Status |")
lines.append("|-----------|--------------------|--------------|-----------|--------|")
for op, data in tests.items():
if isinstance(data, dict) and not data.get("error"):
bw = data.get("best_busbw_gbps", 0)
req = data.get("min_required_gbps", 0)
status = data.get("status", "?")
lines.append(f"| {op} | {bw:.1f} | >= {req:.0f} | {status} |")
failed_sizes = [
str(row.get("size", "?"))
for row in data.get("by_size", [])
if row.get("status") != "PASS"
]
failed_sizes_text = ", ".join(failed_sizes) if failed_sizes else "-"
lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {_format_gbps(req)} | {status} |")
elif isinstance(data, dict) and data.get("error"):
lines.append(f"| {op} | - | - | ERROR: {data['error']} |")
lines.append(f"| {op} | - | - | - | ERROR: {data['error']} |")
lines.append("")
for op, data in tests.items():
by_size = data.get("by_size", []) if isinstance(data, dict) else []
if not by_size:
continue
lines.append(f"### NCCL {op} by size\n")
lines.append("| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |")
lines.append("|------|---------------------|-------|------|--------|-----------|--------|")
for row in by_size:
runs = ", ".join(str(v) for v in row.get("runs_busbw_gbps", []))
lines.append(
f"| {row.get('size', '')} | {runs} | "
f"{row.get('worst_busbw_gbps', 0):.1f} | "
f"{row.get('mean_busbw_gbps', 0):.1f} | "
f"{row.get('stddev_pct', 0):.2f}% | "
f">= {_format_gbps(data.get('min_required_gbps', 0))} | "
f"{row.get('status', '?')} |"
)
lines.append("")
passed = nccl.get("passed", False)
lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")
multinode = results.get("multinode_nccl")
if multinode and not multinode.get("error"):
lines.append("## Multi-node NCCL / Cross Leaf\n")
lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n")
if multinode.get("artifact_dir"):
lines.append(f"- **Artifacts:** `{multinode.get('artifact_dir')}`")
hosts = multinode.get("hosts", [])
if hosts:
host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts)
lines.append(f"- **Hosts:** {host_text}")
preflight = multinode.get("preflight", {})
if preflight.get("checks"):
failed_checks = [c for c in preflight["checks"] if c.get("status") == "FAIL"]
warn_checks = [c for c in preflight["checks"] if c.get("status") == "WARN"]
lines.append(f"- **Preflight:** {'PASS' if not failed_checks else 'FAIL'}"
f"{f' ({len(warn_checks)} warnings)' if warn_checks else ''}")
lines.append("")
for op, data in (multinode.get("tests") or {}).items():
lines.append(f"### Multi-node NCCL {op}\n")
lines.append("| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |")
lines.append("|----------|----------------------|-------------|-----------|------------|-----------|--------|")
for topo in data.get("topologies", []):
threshold = topo.get("min_required_gbps", 0) or 0
threshold_text = f">= {_format_gbps(threshold)} GB/s" if threshold else "-"
cuda_visible = topo.get("cuda_visible_devices") or "-"
lines.append(
f"| {topo.get('label', '')} | {cuda_visible} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | "
f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | "
f"{threshold_text} | {topo.get('status', '?')} |"
)
lines.append("")
diag_rows = []
for topo in data.get("topologies", []):
net = topo.get("network") or {}
if net:
diag_rows.append((topo, net))
if diag_rows:
lines.append("| Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |")
lines.append("|----------|--------------|-----------------|------------------|-------------------|")
for topo, net in diag_rows:
networks = ", ".join(net.get("networks") or []) or "unknown"
gdr = net.get("gpu_direct_rdma", "UNKNOWN")
enabled = ", ".join(net.get("gdr_enabled_hcas") or []) or "-"
disabled = ", ".join(net.get("gdr_disabled_hcas") or []) or "-"
lines.append(f"| {topo.get('label', '')} | {networks} | {gdr} | {enabled} | {disabled} |")
lines.append("")
failed_topos = [topo for topo in data.get("topologies", []) if topo.get("status") == "FAIL"]
if failed_topos:
lines.append("| Topology | Return Code | Error / Output Tail |")
lines.append("|----------|-------------|---------------------|")
for topo in failed_topos:
tail = topo.get("error") or topo.get("stderr_tail") or topo.get("stdout_tail") or ""
tail = str(tail).replace("\n", " ").replace("|", "\\|")[-240:]
lines.append(f"| {topo.get('label', '')} | {topo.get('returncode', '')} | {tail} |")
lines.append("")
lines.append(f"**Overall: {'PASS' if multinode.get('passed') else 'FAIL'}**\n")
elif multinode and multinode.get("error"):
lines.append("## Multi-node NCCL / Cross Leaf\n")
lines.append(f"**Overall: FAIL** ({multinode.get('error')})\n")
preflight = multinode.get("preflight", {})
if preflight.get("checks"):
lines.append("| Check | Status | Detail |")
lines.append("|-------|--------|--------|")
for check in preflight["checks"]:
detail = str(check.get("detail", "")).replace("\n", " ")
lines.append(f"| {check.get('name', '')} | {check.get('status', '')} | {detail} |")
lines.append("")
# --- Stress Test ---
stress = results.get("stress")
if stress and not stress.get("error"):
@ -368,6 +542,21 @@ class ReportGenerator:
source = stress.get("source", "unknown")
lines.append(f"- **Source:** {source}")
lines.append(f"- **Duration:** {elapsed:.0f}s (requested {duration}s)")
telemetry = stress.get("telemetry") or {}
if telemetry:
lines.append(f"- **Telemetry samples:** {telemetry.get('samples', 0)}")
lines.append(f"- **Max temp:** {telemetry.get('max_temp_c', {})}")
lines.append(f"- **Avg power:** {telemetry.get('avg_power_w', {})}")
lines.append(f"- **Temp delta:** {telemetry.get('temp_delta_c', 'N/A')} C")
lines.append(f"- **TFLOPS jitter:** {telemetry.get('tflops_jitter_pct', 'N/A')}%")
lines.append(f"- **Steady TFLOPS samples:** {telemetry.get('steady_tflops_samples', 0)}")
lines.append(f"- **Throttle events:** {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}")
lines.append(f"- **XID events:** {len(telemetry.get('xid_events', []))}")
failures = telemetry.get("failures") or []
if failures:
lines.append("- **Failure reasons:**")
for reason in failures:
lines.append(f" - {reason}")
lines.append(f"- **Result: {'PASS' if passed else 'FAIL'}**")
lines.append("")
@ -378,26 +567,70 @@ class ReportGenerator:
lines.append(f"**Overall: SKIP** [{rdma.get('reason', 'no IB hardware detected')}]\n")
elif rdma and not rdma.get("error"):
lines.append("## RDMA/InfiniBand\n")
rdma_legacy_note = self._rdma_legacy_note(rdma)
if rdma_legacy_note:
lines.append(f"> {rdma_legacy_note}\n")
port_checks = rdma.get("port_checks", [])
if port_checks:
lines.append("### RDMA Port Checks\n")
lines.append("| Device | Port | State | Rate | Required | Status |")
lines.append("|--------|------|-------|------|----------|--------|")
for p in port_checks:
lines.append(
f"| {p.get('device', '')} | {p.get('port', '')} | "
f"{p.get('state', '')} | {p.get('rate', '')} | "
f">= {p.get('min_rate_gbps', 400):.0f}Gbps ACTIVE | {p.get('status', '?')} |"
)
lines.append("")
bw_tests = rdma.get("bandwidth_tests", [])
lat_tests = rdma.get("latency_tests", [])
if bw_tests or lat_tests:
ibping_tests = rdma.get("ibping_tests", [])
if bw_tests or lat_tests or ibping_tests:
lines.append("| Test | Value | Threshold | Status |")
lines.append("|------|-------|-----------|--------|")
for bt in bw_tests:
if not bt.get("error"):
if bt.get("error"):
lines.append(f"| {bt.get('test', 'ib_bw')} | {bt.get('error')} | required runnable test | {bt.get('status', 'FAIL')} |")
else:
threshold, status = self._rdma_bandwidth_verdict(bt)
lines.append(f"| {bt['test']} | {bt.get('bandwidth_gbps', 0):.1f} GB/s | "
f">= {bt.get('min_required_gbps', 0)} GB/s | {bt.get('status', '?')} |")
f">= {threshold:g} GB/s | {status} |")
for lt in lat_tests:
if not lt.get("error"):
if lt.get("error"):
lines.append(f"| {lt.get('test', 'ib_lat')} | {lt.get('error')} | required runnable test | {lt.get('status', 'FAIL')} |")
else:
threshold, status = self._rdma_latency_verdict(lt)
lines.append(f"| {lt['test']} | {lt.get('latency_us', 0):.2f} us | "
f"<= {lt.get('max_allowed_us', 0)} us | {lt.get('status', '?')} |")
f"<= {threshold:g} us | {status} |")
for it in ibping_tests:
direction = it.get("direction") or it.get("role", "N/A")
if it.get("error"):
lines.append(f"| {it.get('test', 'ibping')} | {it.get('error')} | bidirectional peer evidence | {it.get('status', 'FAIL')} |")
else:
lines.append(f"| {it['test']} | {direction} target={it.get('target', 'N/A')} count={it.get('count', 'N/A')} | "
f"0% packet loss | {it.get('status', '?')} |")
lines.append("")
fabric = rdma.get("fabric_counters") or {}
if fabric:
counters = fabric.get("counters", {})
lines.append(f"- **PFC/ECN/CNP/congestion counters checked:** {len(counters)}")
lines.append(f"- **PFC/ECN/CNP/congestion non-zero:** {'yes' if fabric.get('failed') else 'no'}")
if not counters:
lines.append("- **PFC/ECN/CNP/congestion evidence:** missing")
failures = rdma.get("failures") or []
if not failures:
failures = self._rdma_failure_reasons(rdma)
if failures:
lines.append("- **Failure reasons:**")
for reason in failures:
lines.append(f" - {reason}")
passed = rdma.get("passed", False)
lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n")
# --- Training ---
training = results.get("training")
if training and not training.get("error"):
training_status, training_detail, training_missing = self._training_verdict(training)
lines.append("## Training Simulation\n")
lines.append("| Metric | Value |")
lines.append("|--------|-------|")
@ -405,8 +638,14 @@ class ReportGenerator:
lines.append(f"| Params | {training.get('total_params_m', 0):.1f}M |")
lines.append(f"| Throughput | {training.get('throughput_tokens_per_sec', 0):.0f} tokens/sec |")
lines.append(f"| Avg Step Time | {training.get('avg_step_time_ms', 0):.1f} ms |")
lines.append(f"| Warmup Steps | {training.get('warmup_steps', 'N/A')} |")
lines.append(f"| Peak Memory | {training.get('peak_memory_gb', 0):.1f} GB |")
lines.append(f"| Final Loss | {training.get('final_loss', 'N/A')} |")
lines.append(f"| Step Jitter | {training.get('step_jitter_pct', 'N/A')}% |")
lines.append(f"| Distributed Mode | {training.get('distributed_mode', 'N/A')} |")
if training_missing:
lines.append(f"| Acceptance Gaps | missing {', '.join(training_missing)} |")
lines.append(f"| Verdict | {training_status} ({training_detail}) |")
lines.append("")
# --- Footer ---
@ -441,6 +680,114 @@ class ReportGenerator:
return bench["compute"]
return {}
@staticmethod
def _training_verdict(training: dict) -> tuple[str, str, list[str]]:
"""Return report status for both current and legacy training result schemas."""
tps = float(training.get("throughput_tokens_per_sec", 0) or 0)
if "passed" in training:
status = "PASS" if training.get("passed") else "FAIL"
return status, f"{tps:.0f} tokens/sec", []
required = ["passed", "step_jitter_pct", "distributed_mode", "loss_finite"]
missing = [k for k in required if k not in training]
return "UNVERIFIED", f"{tps:.0f} tokens/sec; legacy result lacks explicit acceptance verdict", missing
def _rdma_cfg_value(self, key: str, default: float) -> float:
try:
return float((self.config.get("rdma", {}) or {}).get(key, default))
except (TypeError, ValueError):
return default
def _rdma_bandwidth_verdict(self, row: dict) -> tuple[float, str]:
threshold = self._rdma_cfg_value("min_bandwidth_gbps", 47.0)
value = float(row.get("bandwidth_gbps", 0) or 0)
return threshold, "PASS" if value >= threshold else "FAIL"
def _rdma_latency_verdict(self, row: dict) -> tuple[float, str]:
name = row.get("test", "")
if name == "ib_write_lat":
threshold = self._rdma_cfg_value("max_write_latency_us", 2.0)
elif name == "ib_read_lat":
threshold = self._rdma_cfg_value("max_read_latency_us", 3.5)
else:
threshold = self._rdma_cfg_value("max_latency_us", 3.5)
value = float(row.get("latency_us", 0) or 0)
return threshold, "PASS" if 0 < value <= threshold else "FAIL"
def _rdma_legacy_note(self, rdma: dict) -> str:
"""Flag old RDMA result schemas whose embedded thresholds were looser."""
for row in rdma.get("bandwidth_tests", []) or []:
if row.get("min_required_gbps") != self._rdma_cfg_value("min_bandwidth_gbps", 47.0):
return (
"Legacy RDMA result re-evaluated with current PDF acceptance thresholds; "
"old WARN statuses and old 50GB/s/10us limits are not used for verdict."
)
for row in rdma.get("latency_tests", []) or []:
threshold, _ = self._rdma_latency_verdict(row)
if row.get("max_allowed_us") != threshold:
return (
"Legacy RDMA result re-evaluated with current PDF acceptance thresholds; "
"old WARN statuses and old 50GB/s/10us limits are not used for verdict."
)
return ""
def _rdma_failure_reasons(self, rdma: dict) -> list[str]:
failures = []
for row in rdma.get("bandwidth_tests", []) or []:
threshold, status = self._rdma_bandwidth_verdict(row)
if status != "PASS":
failures.append(
f"{row.get('test')} bandwidth {row.get('bandwidth_gbps', 0)}GB/s < {threshold:g}GB/s"
)
for row in rdma.get("latency_tests", []) or []:
threshold, status = self._rdma_latency_verdict(row)
if status != "PASS":
failures.append(
f"{row.get('test')} latency {row.get('latency_us', 0)}us > {threshold:g}us"
)
for row in rdma.get("ibping_tests", []) or []:
if row.get("status") != "PASS":
failures.append(f"{row.get('test')} failed")
return failures
@staticmethod
def _overall_acceptance_verdict(summary_items: list[tuple[str, str]]) -> tuple[str, list[tuple[str, str]], list[str]]:
"""PDF-style verdict for the report scope.
Full-suite reports require every single-node acceptance item. Standalone
reports, such as `--test multinode-nccl`, should only judge the items
that were actually requested instead of reporting unrelated evidence as
missing.
"""
single_node_required = [
"GPU Info",
"Health Check",
"Memory Bandwidth",
"Compute Throughput",
"NVLink/NVSwitch",
"NCCL",
"Stress Test",
"RDMA",
"DCGM",
"Training",
]
status_by_name = dict(summary_items)
present_single_node = [name for name in single_node_required if name in status_by_name]
if len(present_single_node) >= 3:
required = list(single_node_required)
if "Multi-node NCCL" in status_by_name:
required.append("Multi-node NCCL")
else:
required = list(status_by_name)
missing = [name for name in required if name not in status_by_name]
failures = [
(name, status)
for name, status in summary_items
if name in required and not str(status).startswith("PASS")
]
verdict = "PASS" if not missing and not failures else "FAIL"
return verdict, failures, missing
def _build_summary(self, results: dict) -> list[tuple[str, str]]:
"""Build summary verdict list from results."""
items = []
@ -473,7 +820,7 @@ class ReportGenerator:
d2d = mem.get("d2d_bandwidth_gbps") or 0
items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)"))
else:
eff = mem.get("efficiency_pct") or 0
eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0
verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL")
items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)"))
@ -491,25 +838,43 @@ class ReportGenerator:
rank = {"PASS": 0, "WARN": 1, "FAIL": 2}
worst_status = "PASS"
worst_dt = None
lowest_margin = None
for dt, thr in pass_thresholds.items():
val = per_dtype.get(dt)
if not isinstance(val, (int, float)):
continue
if val >= thr:
st = "PASS"
elif val >= thr * 0.9:
st = "WARN"
else:
st = "FAIL"
margin = val / thr if thr else 0
if lowest_margin is None or margin < lowest_margin:
lowest_margin = margin
worst_dt = dt
if rank[st] > rank[worst_status]:
worst_status = st
worst_dt = dt
if worst_dt:
items.append((
"Compute Throughput",
f"{worst_status} (worst {worst_dt.upper()} "
f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
))
consistency = comp.get("consistency", {}) or {}
failed_consistency = [
(dt, row)
for dt, row in consistency.items()
if not row.get("passed", False)
]
if failed_consistency:
worst_status = "FAIL"
fail_dt, fail_row = failed_consistency[0]
items.append((
"Compute Throughput",
f"FAIL ({fail_dt.upper()} spread "
f"{fail_row.get('spread_pct', 0):.2f}% > "
f"{fail_row.get('max_allowed_pct', 3)}%)"
))
else:
items.append((
"Compute Throughput",
f"{worst_status} (worst {worst_dt.upper()} "
f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})"
))
else:
items.append(("Compute Throughput", f"{worst_status}"))
else:
@ -521,16 +886,46 @@ class ReportGenerator:
else:
items.append(("Compute Throughput", "N/A"))
# NCCL
if "nvlink" in results:
nvl = results["nvlink"]
if nvl.get("error"):
items.append(("NVLink/NVSwitch", f"ERROR: {nvl['error']}"))
elif nvl.get("passed"):
items.append(("NVLink/NVSwitch", "PASS"))
else:
items.append(("NVLink/NVSwitch", "FAIL"))
if "dcgm" in results:
d = results["dcgm"]
if d.get("error"):
items.append(("DCGM", f"ERROR: {d['error']}"))
elif d.get("passed"):
items.append(("DCGM", "PASS"))
else:
items.append(("DCGM", "FAIL"))
# NCCL
if "nccl" in results:
n = results["nccl"]
if n.get("error"):
items.append(("NCCL", f"ERROR: {n['error']}"))
elif n.get("source") == "torchrun_fallback":
items.append(("NCCL", "FAIL (no nccl-tests bus BW)"))
elif n.get("passed"):
items.append(("NCCL", "PASS"))
else:
items.append(("NCCL", "FAIL"))
if "multinode_nccl" in results:
mn = results["multinode_nccl"]
if mn.get("error"):
items.append(("Multi-node NCCL", f"ERROR: {mn['error']}"))
elif mn.get("passed"):
items.append(("Multi-node NCCL", "PASS"))
else:
items.append(("Multi-node NCCL", "FAIL"))
# Stress
if "stress" in results:
s = results["stress"]
@ -559,7 +954,17 @@ class ReportGenerator:
if t.get("error"):
items.append(("Training", f"ERROR: {t['error']}"))
else:
tps = t.get("throughput_tokens_per_sec", 0)
items.append(("Training", f"PASS ({tps:.0f} tokens/sec)"))
status, detail, _missing = self._training_verdict(t)
items.append(("Training", f"{status} ({detail})"))
return items
def _format_gbps(value) -> str:
try:
numeric = float(value)
except (TypeError, ValueError):
return str(value)
if numeric.is_integer():
return f"{numeric:.0f}"
return f"{numeric:.2f}"

View File

@ -1,9 +1,10 @@
"""GPU stress test module — wraps gpu-burn for long-running stability tests."""
"""GPU stress test module — gpu-burn or PyTorch GEMM with telemetry."""
import glob
import os
import shutil
import subprocess
import threading
import time
from datetime import datetime
@ -46,7 +47,7 @@ class StressTest:
memory_pct = cfg.get("memory_pct", 90)
target_gpus = cfg.get("gpus", "all")
gpu_burn = self._find_gpu_burn()
gpu_burn = self._find_gpu_burn() if cfg.get("use_gpu_burn", False) else ""
if gpu_burn:
# Try gpu-burn first
@ -60,7 +61,7 @@ class StressTest:
return result
self.console.print("[yellow]gpu_burn not found, using PyTorch stress test[/yellow]")
self.console.print("[yellow]Using PyTorch stress test[/yellow]")
return self._run_pytorch_stress(duration_sec, memory_pct)
def _run_gpu_burn(self, gpu_burn: str, duration: int,
@ -77,12 +78,26 @@ class StressTest:
cmd.append(str(duration))
t0 = time.time()
xid_before = self._collect_xid_events()
interval = int(self.stress_cfg.get("telemetry_interval_sec", 1))
telemetry = []
stop_sampling = threading.Event()
sampler = threading.Thread(
target=self._sample_telemetry,
args=(telemetry, stop_sampling, interval),
daemon=True,
)
sampler.start()
try:
r = subprocess.run(cmd, capture_output=True, text=True, timeout=duration + 120)
elapsed = round(time.time() - t0, 1)
stop_sampling.set()
sampler.join(timeout=interval + 1)
output = r.stdout + r.stderr
passed = r.returncode == 0
xid_events = self._new_xid_events(xid_before, self._collect_xid_events())
telemetry_summary = self._evaluate_telemetry(telemetry, [], xid_events)
passed = r.returncode == 0 and telemetry_summary.get("passed", False)
gpu_results = []
for line in output.split("\n"):
@ -96,25 +111,36 @@ class StressTest:
"duration_sec": duration,
"elapsed_sec": elapsed,
"gpu_results": gpu_results,
"telemetry": telemetry_summary,
"raw_output_tail": output[-500:] if output else "",
"timestamp": datetime.now().isoformat(),
}
except subprocess.TimeoutExpired:
stop_sampling.set()
return {
"source": "gpu-burn",
"passed": False,
"duration_sec": duration,
"error": "timeout",
"telemetry": self._evaluate_telemetry(
telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events())
),
"timestamp": datetime.now().isoformat(),
}
except Exception as e:
stop_sampling.set()
return {
"source": "gpu-burn",
"passed": False,
"error": str(e),
"telemetry": self._evaluate_telemetry(
telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events())
),
"timestamp": datetime.now().isoformat(),
}
finally:
stop_sampling.set()
def _run_pytorch_stress(self, duration: int, memory_pct: int = 90) -> dict:
try:
@ -127,58 +153,79 @@ class StressTest:
gpu_count = torch.cuda.device_count()
self.console.print(f"[cyan]PyTorch Stress Test ({duration}s, {gpu_count} GPUs, target {memory_pct}% memory)[/cyan]")
dtype_name = self.stress_cfg.get("dtype", "bf16")
matrix_size = int(self.stress_cfg.get("matrix_size", 8192))
interval = int(self.stress_cfg.get("telemetry_interval_sec", 1))
dtype_map = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
dtype = dtype_map.get(dtype_name, torch.bfloat16)
gpu_status = {}
telemetry = []
stop_sampling = threading.Event()
t0 = time.time()
xid_before = self._collect_xid_events()
try:
sampler = threading.Thread(
target=self._sample_telemetry,
args=(telemetry, stop_sampling, interval),
daemon=True,
)
sampler.start()
tensors = {}
ballast = {}
pass_tflops = []
for i in range(gpu_count):
with torch.cuda.device(i):
# Get actual free memory (accounting for other processes)
free_mem, total_mem = torch.cuda.mem_get_info(i)
# Calculate allocation from configured memory_pct
target_mem = int(total_mem * memory_pct / 100)
# Cap at actual free memory with 5% safety margin
alloc_bytes = min(target_mem, int(free_mem * 0.95))
# matmul(A, A.T) needs 2x input memory (input + output)
mem_side = int((alloc_bytes / 4 / 2) ** 0.5)
# Cap compute matrix so a single matmul completes in ~2s on H100/H200
# (FP32 ≈ 67 TFLOPS → 2*4096³/67e12 ≈ 2s). Without this cap, a 141GB
# HBM yields side ≈ 131K → single matmul ~68s × 8 GPUs serial → loop
# overshoots a 60s duration request by 10×+.
MAX_COMPUTE_SIDE = 4096
side = min(mem_side, MAX_COMPUTE_SIDE)
actual_mem_mb = side * side * 4 / 1024 / 1024
side = matrix_size
elem = torch.tensor([], dtype=dtype).element_size()
compute_bytes = side * side * elem * 3
target_mem = min(int(total_mem * memory_pct / 100), int(free_mem * 0.90))
ballast_bytes = max(0, target_mem - compute_bytes)
if ballast_bytes:
ballast_elems = ballast_bytes // 2
ballast[i] = torch.empty(ballast_elems, device=f"cuda:{i}", dtype=torch.float16)
actual_mem_mb = (compute_bytes + ballast_bytes) / 1024 / 1024
total_mem_mb = total_mem / 1024 / 1024
free_mem_mb = free_mem / 1024 / 1024
self.console.print(
f" [dim]GPU {i}: total {total_mem_mb:.0f}MB, free {free_mem_mb:.0f}MB, "
f"alloc {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - "
f"matrix {side}x{side}[/dim]"
f"{dtype_name} matrix {side}x{side}[/dim]"
)
tensors[i] = (
torch.randn(side, side, device=f"cuda:{i}", dtype=dtype),
torch.randn(side, side, device=f"cuda:{i}", dtype=dtype),
torch.empty(side, side, device=f"cuda:{i}", dtype=dtype),
)
tensors[i] = torch.randn(side, side, device=f"cuda:{i}", dtype=torch.float32)
self.console.print(f"\n[cyan]Starting stress test for {duration} seconds...[/cyan]")
elapsed_check = 0
while time.time() - t0 < duration:
loop_start = time.perf_counter()
# Dispatch matmul on all GPUs in parallel — do NOT synchronize between
# GPUs, otherwise the 8 GPUs run serially and overshoot the duration.
for i in range(gpu_count):
with torch.cuda.device(i):
tensors[i] = torch.matmul(tensors[i], tensors[i].T)
a, b, out = tensors[i]
torch.matmul(a, b, out=out)
# Single sync per pass — waits for all 8 streams concurrently
for i in range(gpu_count):
with torch.cuda.device(i):
torch.cuda.synchronize()
loop_elapsed = time.perf_counter() - loop_start
current_elapsed = time.time() - t0
if loop_elapsed > 0:
flops = gpu_count * 2 * (matrix_size ** 3)
pass_tflops.append({
"elapsed_sec": current_elapsed,
"tflops": flops / loop_elapsed / 1e12,
})
# Show progress every 10 seconds
current_elapsed = time.time() - t0
if int(current_elapsed) != int(elapsed_check) and int(current_elapsed) % 10 == 0:
self.console.print(f" [dim]Running {int(current_elapsed)}s / {duration}s[/dim]")
elapsed_check = current_elapsed
@ -198,21 +245,196 @@ class StressTest:
"duration_sec": duration,
"error": error_msg,
"gpu_status": gpu_status,
"telemetry": self._evaluate_telemetry(
telemetry, pass_tflops if "pass_tflops" in locals() else [],
self._new_xid_events(xid_before, self._collect_xid_events()),
),
}
finally:
stop_sampling.set()
tensors.clear()
ballast.clear()
torch.cuda.empty_cache()
elapsed = round(time.time() - t0, 1)
xid_events = self._new_xid_events(xid_before, self._collect_xid_events())
telemetry_summary = self._evaluate_telemetry(telemetry, pass_tflops, xid_events)
passed = all(v == "PASS" for v in gpu_status.values()) and telemetry_summary.get("passed", False)
return {
"source": "pytorch",
"passed": True,
"passed": passed,
"duration_sec": duration,
"elapsed_sec": elapsed,
"gpu_status": gpu_status,
"telemetry": telemetry_summary,
"timestamp": datetime.now().isoformat(),
}
def _sample_telemetry(self, telemetry: list, stop_event: threading.Event, interval: int):
query = "index,temperature.gpu,power.draw,clocks_throttle_reasons.active"
while not stop_event.is_set():
try:
r = subprocess.run(
["nvidia-smi", f"--query-gpu={query}", "--format=csv,noheader,nounits"],
capture_output=True, text=True, timeout=10,
)
if r.returncode == 0:
sample = {"time": time.time(), "gpus": []}
for line in r.stdout.splitlines():
parts = [p.strip() for p in line.split(",")]
if len(parts) >= 4:
sample["gpus"].append({
"index": int(parts[0]),
"temp_c": float(parts[1]),
"power_w": float(parts[2]),
"throttle": parts[3],
})
telemetry.append(sample)
except Exception:
pass
stop_event.wait(interval)
def _collect_xid_events(self) -> list[str]:
try:
r = subprocess.run(
["dmesg", "--color=never"],
capture_output=True, text=True, timeout=10,
)
if r.returncode != 0:
return []
return [
line.strip()
for line in r.stdout.splitlines()
if any(token in line.upper() for token in ("XID", "NVRM: XID"))
]
except Exception:
return []
@staticmethod
def _new_xid_events(before: list[str], after: list[str]) -> list[str]:
seen = set(before)
return [line for line in after if line not in seen]
def _evaluate_telemetry(self, telemetry: list, pass_tflops: list, xid_events: list[str] | None = None) -> dict:
cfg = self.stress_cfg
max_temp = float(cfg.get("max_temp_c", 80))
max_delta = float(cfg.get("max_temp_delta_c", 5))
min_power = float(cfg.get("min_power_watts", 630))
max_jitter = float(cfg.get("max_tflops_jitter_pct", 5))
require_jitter = bool(cfg.get("require_tflops_jitter", True))
duration = float(cfg.get("duration_sec", 60))
requested_warmup = float(cfg.get("warmup_sec", 60))
warmup_sec = min(requested_warmup, max(0.0, duration * 0.2))
min_steady_samples = int(cfg.get("min_steady_samples", 10))
temps = {}
powers = {}
throttle_bad = []
xid_events = xid_events or []
steady_telemetry = [
sample for sample in telemetry
if sample.get("time", 0) - telemetry[0].get("time", 0) >= warmup_sec
] if telemetry else []
evaluation_samples = steady_telemetry if len(steady_telemetry) >= min_steady_samples else telemetry
for sample in evaluation_samples:
for g in sample.get("gpus", []):
idx = g["index"]
temps.setdefault(idx, []).append(g["temp_c"])
powers.setdefault(idx, []).append(g["power_w"])
try:
bitmask = int(str(g["throttle"]), 16)
except ValueError:
bitmask = 0
real_throttle = bitmask & ~0x1
if real_throttle:
throttle_bad.append({
"gpu": idx,
"throttle": g["throttle"],
"real_throttle": f"0x{real_throttle:x}",
})
max_temps = {idx: max(vals) for idx, vals in temps.items() if vals}
avg_powers = {idx: sum(vals) / len(vals) for idx, vals in powers.items() if vals}
temp_delta = (max(max_temps.values()) - min(max_temps.values())) if len(max_temps) >= 2 else 0
jitter = 0
steady_tflops = []
for item in pass_tflops:
if isinstance(item, dict):
if float(item.get("elapsed_sec", 0)) >= warmup_sec:
steady_tflops.append(float(item.get("tflops", 0)))
else:
steady_tflops.append(float(item))
if len(steady_tflops) < 2 and pass_tflops:
steady_tflops = [
float(item.get("tflops", 0)) if isinstance(item, dict) else float(item)
for item in pass_tflops
]
if steady_tflops:
mean = sum(steady_tflops) / len(steady_tflops)
jitter = max(abs(v - mean) / mean * 100 for v in steady_tflops) if mean else 0
failures = []
temp_failures = {idx: v for idx, v in max_temps.items() if v > max_temp}
power_failures = {idx: v for idx, v in avg_powers.items() if v < min_power}
if not evaluation_samples:
failures.append("no telemetry samples available for evaluation")
if temp_failures:
failures.append(
"max temperature above threshold: "
+ ", ".join(f"GPU {idx} {val:.1f}C" for idx, val in sorted(temp_failures.items()))
)
if temp_delta > max_delta:
failures.append(f"GPU temperature delta {temp_delta:.1f}C exceeds {max_delta:.1f}C")
if power_failures:
failures.append(
"average steady-state power below threshold: "
+ ", ".join(f"GPU {idx} {val:.1f}W" for idx, val in sorted(power_failures.items()))
)
if throttle_bad:
failures.append(
f"non-idle throttle reasons observed in {len(throttle_bad)} samples "
f"(first: GPU {throttle_bad[0]['gpu']} {throttle_bad[0]['real_throttle']})"
)
if xid_events:
failures.append(f"{len(xid_events)} new XID/NVRM XID events observed")
if require_jitter and len(steady_tflops) < 2:
failures.append(
f"insufficient steady TFLOPS samples for jitter evaluation: {len(steady_tflops)} < 2"
)
if jitter > max_jitter:
failures.append(f"TFLOPS jitter {jitter:.2f}% exceeds {max_jitter:.2f}%")
passed = (
bool(evaluation_samples)
and all(v <= max_temp for v in max_temps.values())
and temp_delta <= max_delta
and all(v >= min_power for v in avg_powers.values())
and not throttle_bad
and not xid_events
and (not require_jitter or len(steady_tflops) >= 2)
and jitter <= max_jitter
)
return {
"passed": passed,
"samples": len(telemetry),
"steady_samples": len(evaluation_samples),
"warmup_sec": round(warmup_sec, 1),
"max_temp_c": {k: round(v, 1) for k, v in max_temps.items()},
"avg_power_w": {k: round(v, 1) for k, v in avg_powers.items()},
"temp_delta_c": round(temp_delta, 1),
"throttle_events": throttle_bad[:20],
"throttle_event_count": len(throttle_bad),
"xid_events": xid_events[-20:],
"tflops_jitter_pct": round(jitter, 2),
"steady_tflops_samples": len(steady_tflops),
"failures": failures,
"thresholds": {
"max_temp_c": max_temp,
"max_temp_delta_c": max_delta,
"min_power_w": min_power,
"max_tflops_jitter_pct": max_jitter,
"require_tflops_jitter": require_jitter,
"warmup_sec": requested_warmup,
"min_steady_samples": min_steady_samples,
},
}
@staticmethod
def print_results(results: dict, console: Console = None):
c = console or Console()
@ -245,5 +467,21 @@ class StressTest:
color = "green" if status == "PASS" else "red"
c.print(f" GPU {gid}: [{color}]{status}[/{color}]")
telemetry = results.get("telemetry") or {}
if telemetry:
c.print("\n Telemetry:")
c.print(f" Samples: {telemetry.get('samples', 0)} total, {telemetry.get('steady_samples', 0)} evaluated after {telemetry.get('warmup_sec', 0)}s warmup")
c.print(f" Avg steady power: {telemetry.get('avg_power_w', {})}")
c.print(f" Max steady temp: {telemetry.get('max_temp_c', {})}")
c.print(f" Temp delta: {telemetry.get('temp_delta_c', 'N/A')} C")
c.print(f" TFLOPS jitter: {telemetry.get('tflops_jitter_pct', 'N/A')}%")
c.print(f" Throttle events: {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}")
c.print(f" XID events: {len(telemetry.get('xid_events', []))}")
failures = telemetry.get("failures", [])
if failures:
c.print(" [red]Failure reasons:[/red]")
for reason in failures:
c.print(f" [red]- {reason}[/red]")
if results.get("error"):
c.print(f" [red]Error: {results['error']}[/red]")

View File

@ -1,8 +1,13 @@
"""Training simulation module - LLM training workload with PyTorch."""
import json
import os
import sys
import tempfile
import time
import subprocess
import shutil
import math
from datetime import datetime
from typing import Optional
@ -36,6 +41,7 @@ class TrainingSim:
batch_size = self.train_cfg.get("batch_size", 8)
seq_length = self.train_cfg.get("seq_length", 2048)
num_steps = self.train_cfg.get("num_steps", 50)
warmup_steps = int(self.train_cfg.get("warmup_steps", 5))
dtype_str = self.train_cfg.get("dtype", "bf16")
dtype_map = {
@ -47,7 +53,13 @@ class TrainingSim:
self.console.print(f"[cyan]Training Simulation[/cyan]")
self.console.print(f" Model: {model_name} | Batch: {batch_size} | Seq: {seq_length} | "
f"DType: {dtype_str} | Steps: {num_steps} | GPUs: {gpu_count}")
f"DType: {dtype_str} | Steps: {num_steps} | Warmup: {warmup_steps} | GPUs: {gpu_count}")
if self.train_cfg.get("mode", "ddp") == "ddp" and gpu_count > 1:
ddp_result = self._run_synthetic_ddp(gpu_count, batch_size, seq_length, num_steps, dtype_str)
if ddp_result.get("passed") or not self.train_cfg.get("allow_fallback", False):
return ddp_result
self.console.print("[yellow]DDP synthetic training failed, falling back to single-process synthetic path[/yellow]")
try:
from transformers import AutoModelForCausalLM, AutoTokenizer
@ -87,9 +99,10 @@ class TrainingSim:
BarColumn(), TextColumn("{task.completed}/{task.total}"),
TimeElapsedColumn(), console=self.console,
) as progress:
task = progress.add_task("Training steps...", total=num_steps)
total_steps = num_steps + warmup_steps
task = progress.add_task("Training steps...", total=total_steps)
for step in range(num_steps):
for step in range(total_steps):
torch.cuda.synchronize()
t0 = time.perf_counter()
@ -119,8 +132,15 @@ class TrainingSim:
progress.advance(task)
avg_step_time = sum(step_times) / len(step_times)
measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
avg_step_time = sum(measured_steps) / len(measured_steps)
throughput = batch_size * seq_length / avg_step_time
jitter = self._jitter_pct(measured_steps)
peak_mem = round(max(mem_usage) if mem_usage else 0, 2)
final_loss = float(loss.item()) if hasattr(loss, "item") else float("nan")
passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss)
if self.train_cfg.get("require_distributed", True):
passed = False
return {
"model": model_name,
@ -130,11 +150,18 @@ class TrainingSim:
"batch_size": batch_size,
"seq_length": seq_length,
"num_steps": num_steps,
"warmup_steps": warmup_steps,
"total_steps": total_steps,
"avg_step_time_ms": round(avg_step_time * 1000, 1),
"throughput_tokens_per_sec": round(throughput, 0),
"throughput_samples_per_sec": round(batch_size / avg_step_time, 2),
"peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2),
"final_loss": round(loss.item(), 4) if hasattr(loss, 'item') else None,
"peak_memory_gb": peak_mem,
"final_loss": round(final_loss, 4),
"step_jitter_pct": round(jitter, 2),
"distributed_mode": "device_map",
"loss_finite": math.isfinite(final_loss),
"passed": passed,
"acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "",
"timestamp": datetime.now().isoformat(),
}
@ -142,6 +169,196 @@ class TrainingSim:
self.console.print(f"[yellow]Model loading failed: {e}[/yellow]")
return self._run_synthetic(gpu_count, batch_size, seq_length, num_steps, dtype)
def _run_synthetic_ddp(self, gpu_count: int, batch_size: int, seq_length: int,
num_steps: int, dtype_str: str) -> dict:
"""Run the 1.5B synthetic Transformer with one process per GPU."""
torchrun = os.path.join(os.path.dirname(sys.executable), "torchrun")
if not os.path.isfile(torchrun):
torchrun = shutil.which("torchrun") or ""
if not torchrun:
return {
"model": "synthetic_transformer_1.5b",
"gpu_count": gpu_count,
"distributed_mode": "ddp",
"passed": False,
"error": "torchrun not found",
"timestamp": datetime.now().isoformat(),
}
script = r'''
import json
import math
import os
import time
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
def main():
local_rank = int(os.environ["LOCAL_RANK"])
world_size = int(os.environ["WORLD_SIZE"])
torch.cuda.set_device(local_rank)
dist.init_process_group("nccl")
global_batch = int(os.environ["TRAIN_BATCH_SIZE"])
local_batch = max(1, global_batch // world_size)
seq_length = int(os.environ["TRAIN_SEQ_LENGTH"])
num_steps = int(os.environ["TRAIN_NUM_STEPS"])
warmup_steps = int(os.environ.get("TRAIN_WARMUP_STEPS", "5"))
total_steps = num_steps + warmup_steps
dtype_name = os.environ.get("TRAIN_DTYPE", "bf16")
dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}.get(dtype_name, torch.bfloat16)
hidden_size = 4096
num_layers = 6
num_heads = 32
vocab_size = 32000
class SyntheticTransformer(torch.nn.Module):
def __init__(self):
super().__init__()
self.embed = torch.nn.Embedding(vocab_size, hidden_size)
self.layers = torch.nn.ModuleList([
torch.nn.TransformerEncoderLayer(
d_model=hidden_size,
nhead=num_heads,
dim_feedforward=hidden_size * 4,
batch_first=True,
dtype=dtype,
) for _ in range(num_layers)
])
self.head = torch.nn.Linear(hidden_size, vocab_size, dtype=dtype)
def forward(self, x):
h = self.embed(x).to(dtype)
for layer in self.layers:
h = layer(h)
return self.head(h)
model = SyntheticTransformer().cuda()
total_params = sum(p.numel() for p in model.parameters())
model = DDP(model, device_ids=[local_rank], output_device=local_rank)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
input_ids = torch.randint(0, vocab_size, (local_batch, seq_length), device="cuda")
step_times = []
last_loss = torch.tensor(float("nan"), device="cuda")
torch.cuda.reset_peak_memory_stats(local_rank)
for _ in range(total_steps):
torch.cuda.synchronize()
t0 = time.perf_counter()
with torch.amp.autocast("cuda", dtype=dtype, enabled=dtype in (torch.float16, torch.bfloat16)):
logits = model(input_ids)
loss = torch.nn.functional.cross_entropy(logits.reshape(-1, vocab_size), input_ids.reshape(-1))
loss.backward()
optimizer.step()
optimizer.zero_grad(set_to_none=True)
torch.cuda.synchronize()
step_times.append(time.perf_counter() - t0)
last_loss = loss.detach()
peak_mem = torch.tensor(torch.cuda.max_memory_allocated(local_rank) / 1024**3, device="cuda")
dist.all_reduce(peak_mem, op=dist.ReduceOp.MAX)
finite = torch.tensor(1 if math.isfinite(float(last_loss.item())) else 0, device="cuda")
dist.all_reduce(finite, op=dist.ReduceOp.MIN)
if dist.get_rank() == 0:
measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
avg_step = sum(measured_steps) / len(measured_steps)
mean = avg_step
jitter = max(abs(v - mean) / mean * 100 for v in measured_steps) if mean else 0.0
throughput = global_batch * seq_length / avg_step if avg_step else 0.0
print("TRAINING_DDP_JSON=" + json.dumps({
"model": "synthetic_transformer_1.5b",
"total_params_m": round(total_params / 1e6, 1),
"num_layers": num_layers,
"hidden_size": hidden_size,
"gpu_count": world_size,
"dtype": dtype_name,
"batch_size": global_batch,
"local_batch_size": local_batch,
"seq_length": seq_length,
"num_steps": num_steps,
"warmup_steps": warmup_steps,
"total_steps": total_steps,
"avg_step_time_ms": round(avg_step * 1000, 1),
"throughput_tokens_per_sec": round(throughput, 0),
"throughput_samples_per_sec": round(global_batch / avg_step, 2) if avg_step else 0,
"peak_memory_gb": round(float(peak_mem.item()), 2),
"final_loss": round(float(last_loss.item()), 4),
"step_jitter_pct": round(jitter, 2),
"distributed_mode": "ddp",
"loss_finite": bool(int(finite.item())),
}), flush=True)
dist.destroy_process_group()
if __name__ == "__main__":
main()
'''
tmp = tempfile.NamedTemporaryFile("w", suffix="_training_ddp.py", delete=False)
tmp.write(script)
tmp.close()
env = {
**os.environ,
"TRAIN_BATCH_SIZE": str(batch_size),
"TRAIN_SEQ_LENGTH": str(seq_length),
"TRAIN_NUM_STEPS": str(num_steps),
"TRAIN_WARMUP_STEPS": str(int(self.train_cfg.get("warmup_steps", 5))),
"TRAIN_DTYPE": dtype_str,
"NCCL_DEBUG": os.environ.get("NCCL_DEBUG", "WARN"),
}
cmd = [torchrun, f"--nproc_per_node={gpu_count}", tmp.name]
self.console.print(f" Running synthetic 1.5B DDP via torchrun ({gpu_count} processes)...")
try:
timeout = int(self.train_cfg.get("timeout_sec", max(600, num_steps * 180)))
r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=env)
except subprocess.TimeoutExpired:
os.unlink(tmp.name)
return {
"model": "synthetic_transformer_1.5b",
"gpu_count": gpu_count,
"distributed_mode": "ddp",
"passed": False,
"error": "training_ddp_timeout",
"timestamp": datetime.now().isoformat(),
}
finally:
if os.path.exists(tmp.name):
try:
os.unlink(tmp.name)
except OSError:
pass
marker = "TRAINING_DDP_JSON="
payload = None
for line in (r.stdout + "\n" + r.stderr).splitlines():
if marker in line:
payload = line.split(marker, 1)[1].strip()
if r.returncode != 0 or not payload:
return {
"model": "synthetic_transformer_1.5b",
"gpu_count": gpu_count,
"distributed_mode": "ddp",
"passed": False,
"error": (r.stderr or r.stdout or "training_ddp_failed")[-1000:],
"timestamp": datetime.now().isoformat(),
}
result = json.loads(payload)
loss_value = float(result.get("final_loss", "nan"))
passed = self._acceptance_pass(
float(result.get("throughput_tokens_per_sec", 0)),
float(result.get("step_jitter_pct", 999)),
float(result.get("peak_memory_gb", 999)),
loss_value,
) and bool(result.get("loss_finite", False)) and result.get("gpu_count") == gpu_count
result.update({
"passed": passed,
"timestamp": datetime.now().isoformat(),
})
return result
def _run_synthetic(self, gpu_count, batch_size, seq_length, num_steps, dtype) -> dict:
self.console.print(" Running synthetic training benchmark...")
@ -170,11 +387,17 @@ class TrainingSim:
h = layer(h)
return self.head(h)
model = SyntheticTransformer().cuda()
model = SyntheticTransformer()
total_params = sum(p.numel() for p in model.parameters())
self.console.print(f" Synthetic params: {total_params / 1e6:.1f}M")
distributed_mode = "single_gpu"
if gpu_count > 1:
model = torch.nn.DataParallel(model).cuda()
distributed_mode = "data_parallel"
else:
model = model.cuda()
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
@ -183,14 +406,17 @@ class TrainingSim:
step_times = []
mem_usage = []
warmup_steps = int(self.train_cfg.get("warmup_steps", 5))
total_steps = num_steps + warmup_steps
with Progress(
SpinnerColumn(), TextColumn("[progress.description]{task.description}"),
BarColumn(), TextColumn("{task.completed}/{task.total}"),
TimeElapsedColumn(), console=self.console,
) as progress:
task = progress.add_task("Synthetic training...", total=num_steps)
task = progress.add_task("Synthetic training...", total=total_steps)
for step in range(num_steps):
for step in range(total_steps):
torch.cuda.synchronize()
t0 = time.perf_counter()
@ -206,14 +432,22 @@ class TrainingSim:
elapsed = time.perf_counter() - t0
step_times.append(elapsed)
mem_used = torch.cuda.max_memory_allocated() / 1024**3
mem_used = max(torch.cuda.max_memory_allocated(i) for i in range(gpu_count)) / 1024**3
mem_usage.append(mem_used)
torch.cuda.reset_peak_memory_stats()
for i in range(gpu_count):
torch.cuda.reset_peak_memory_stats(i)
progress.advance(task)
avg_step_time = sum(step_times) / len(step_times)
measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times
avg_step_time = sum(measured_steps) / len(measured_steps)
throughput = batch_size * seq_length / avg_step_time
jitter = self._jitter_pct(measured_steps)
peak_mem = round(max(mem_usage) if mem_usage else 0, 2)
final_loss = float(loss.item())
passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss)
if self.train_cfg.get("require_distributed", True):
passed = False
return {
"model": "synthetic_transformer",
@ -225,14 +459,36 @@ class TrainingSim:
"batch_size": batch_size,
"seq_length": seq_length,
"num_steps": num_steps,
"warmup_steps": warmup_steps,
"total_steps": total_steps,
"avg_step_time_ms": round(avg_step_time * 1000, 1),
"throughput_tokens_per_sec": round(throughput, 0),
"throughput_samples_per_sec": round(batch_size / avg_step_time, 2),
"peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2),
"final_loss": round(loss.item(), 4),
"peak_memory_gb": peak_mem,
"final_loss": round(final_loss, 4),
"step_jitter_pct": round(jitter, 2),
"distributed_mode": distributed_mode,
"loss_finite": math.isfinite(final_loss),
"passed": passed,
"acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "",
"timestamp": datetime.now().isoformat(),
}
@staticmethod
def _jitter_pct(step_times: list[float]) -> float:
if not step_times:
return 0.0
mean = sum(step_times) / len(step_times)
return max(abs(v - mean) / mean * 100 for v in step_times) if mean else 0.0
def _acceptance_pass(self, throughput: float, jitter: float, peak_mem: float, loss_value: float) -> bool:
return (
throughput >= float(self.train_cfg.get("min_tokens_per_sec", 45000))
and jitter <= float(self.train_cfg.get("max_step_jitter_pct", 3))
and peak_mem <= float(self.train_cfg.get("max_peak_memory_gb", 70))
and math.isfinite(loss_value)
)
@staticmethod
def print_results(results: dict, console: Console = None):
c = console or Console()
@ -254,11 +510,15 @@ class TrainingSim:
("Batch Size", str(results.get("batch_size", "N/A"))),
("Seq Length", str(results.get("seq_length", "N/A"))),
("Steps", str(results.get("num_steps", "N/A"))),
("Warmup Steps", str(results.get("warmup_steps", "N/A"))),
("Avg Step Time", f"{results.get('avg_step_time_ms', 'N/A')} ms"),
("Throughput", f"{results.get('throughput_tokens_per_sec', 'N/A')} tokens/s"),
("Samples/sec", f"{results.get('throughput_samples_per_sec', 'N/A')}"),
("Peak Memory", f"{results.get('peak_memory_gb', 'N/A')} GB"),
("Final Loss", str(results.get("final_loss", "N/A"))),
("Step Jitter", f"{results.get('step_jitter_pct', 'N/A')}%"),
("Distributed Mode", results.get("distributed_mode", "N/A")),
("Verdict", "PASS" if results.get("passed") else "FAIL"),
]
for label, val in metrics:
table.add_row(label, str(val))

View File

@ -0,0 +1,291 @@
#include <cublasLt.h>
#include <cuda_bf16.h>
#include <cuda_fp8.h>
#include <cuda_runtime.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <numeric>
#include <string>
#include <vector>
// Evaluates a CUDA runtime call exactly once. On any result other than
// cudaSuccess it prints the source location and the CUDA error string to
// stderr and terminates the process with exit code 1.
#define CHECK_CUDA(call)                                                  \
  do {                                                                    \
    const cudaError_t cuda_status_ = (call);                              \
    if (cuda_status_ != cudaSuccess) {                                    \
      std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                   cudaGetErrorString(cuda_status_));                     \
      std::exit(1);                                                       \
    }                                                                     \
  } while (0)
// Evaluates a cuBLASLt call exactly once. On any result other than
// CUBLAS_STATUS_SUCCESS it prints the source location and the numeric status
// to stderr and terminates the process with exit code 1.
#define CHECK_CUBLAS(call)                                                  \
  do {                                                                      \
    const cublasStatus_t cublas_status_ = (call);                           \
    if (cublas_status_ != CUBLAS_STATUS_SUCCESS) {                          \
      std::fprintf(stderr, "cuBLASLt error %s:%d: status=%d\n", __FILE__,   \
                   __LINE__, static_cast<int>(cublas_status_));             \
      std::exit(1);                                                         \
    }                                                                       \
  } while (0)
// Fills `count` FP8 (E4M3) elements starting at `ptr` with `value` converted
// to FP8. Each thread starts at its global index and advances by the total
// number of launched threads, so any 1-D launch shape covers the whole buffer.
__global__ void fill_fp8(__nv_fp8_e4m3 *ptr, size_t count, float value) {
  // Widen to size_t before multiplying: the built-in index variables are
  // 32-bit unsigned, so the product would wrap for launches with more than
  // 2^32 threads if it were computed first and widened afterwards.
  const size_t tid =
      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  // The converted value is loop-invariant; compute it once per thread.
  const __nv_fp8_e4m3 fill = __nv_fp8_e4m3(value);
  for (size_t i = tid; i < count; i += stride) {
    ptr[i] = fill;
  }
}
// Command-line options for the cuBLASLt FP8 GEMM benchmark.
struct Args {
  int matrix_size = 8192;     // M = N = K of the square GEMM
  int warmup = 20;            // untimed GEMM launches before measurement
  int iterations = 200;       // timed GEMM launches
  int first_gpu = 0;          // index of the first GPU to benchmark
  int gpu_count = -1;         // negative: main() uses every device from first_gpu on
  size_t workspace_mb = 256;  // cuBLASLt workspace size in MiB
  int fast_accum = 1;         // non-zero enables CUBLASLT_MATMUL_DESC_FAST_ACCUM
};

// Parses the command line into Args.
//
// Every numeric option must be a complete base-10 integer inside its valid
// range. Anything else (non-numeric text, trailing garbage, overflow, a
// non-positive matrix size or iteration count, a zero GPU count) is reported
// on stderr and ends the process with exit code 2. The previous atoi-based
// parsing silently turned such input into 0, which produced an infinite
// TFLOPS figure and a passing exit status.
//
// --help / -h prints the usage line and exits with code 0; a missing option
// value or an unknown argument exits with code 2.
static Args parse_args(int argc, char **argv) {
  const long long kIntMin = -2147483647LL - 1;
  const long long kIntMax = 2147483647LL;

  Args args;
  for (int i = 1; i < argc; ++i) {
    const char *flag = argv[i];

    // Returns the value that follows the current option, consuming it.
    auto need = [&](const char *name) {
      if (i + 1 >= argc) {
        std::fprintf(stderr, "Missing value for %s\n", name);
        std::exit(2);
      }
      return argv[++i];
    };

    // Parses the value of the current option as an integer in [lo, hi].
    // strtoll saturates on overflow, so out-of-range text fails the hi check.
    auto parse_ll = [&](long long lo, long long hi) -> long long {
      const char *text = need(flag);
      char *end = nullptr;
      const long long value = std::strtoll(text, &end, 10);
      if (end == text || *end != '\0' || value < lo || value > hi) {
        std::fprintf(stderr, "Invalid value for %s: %s\n", flag, text);
        std::exit(2);
      }
      return value;
    };

    if (!std::strcmp(flag, "--matrix-size")) {
      args.matrix_size = static_cast<int>(parse_ll(1, kIntMax));
    } else if (!std::strcmp(flag, "--warmup")) {
      args.warmup = static_cast<int>(parse_ll(0, kIntMax));
    } else if (!std::strcmp(flag, "--iterations")) {
      args.iterations = static_cast<int>(parse_ll(1, kIntMax));
    } else if (!std::strcmp(flag, "--first-gpu")) {
      args.first_gpu = static_cast<int>(parse_ll(0, kIntMax));
    } else if (!std::strcmp(flag, "--gpu-count")) {
      // Negative values keep their "use all remaining devices" meaning.
      args.gpu_count = static_cast<int>(parse_ll(kIntMin, kIntMax));
      if (args.gpu_count == 0) {
        std::fprintf(stderr, "Invalid value for %s: 0\n", flag);
        std::exit(2);
      }
    } else if (!std::strcmp(flag, "--workspace-mb")) {
      args.workspace_mb = static_cast<size_t>(parse_ll(0, kIntMax));
    } else if (!std::strcmp(flag, "--fast-accum")) {
      args.fast_accum = static_cast<int>(parse_ll(kIntMin, kIntMax));
    } else if (!std::strcmp(flag, "--help") || !std::strcmp(flag, "-h")) {
      std::puts("Usage: cublaslt_fp8_gemm_bench [--matrix-size N] [--warmup N] "
                "[--iterations N] [--first-gpu N] [--gpu-count N] "
                "[--workspace-mb N] [--fast-accum 0|1]");
      std::exit(0);
    } else {
      std::fprintf(stderr, "Unknown argument: %s\n", flag);
      std::exit(2);
    }
  }
  return args;
}
// Benchmarks one square FP8 (E4M3) GEMM with BF16 output on device `gpu`.
//
// Allocates the operands, a workspace and two scale factors, asks cuBLASLt for
// a single heuristic algorithm, runs `args.warmup` untimed and
// `args.iterations` timed matmuls, prints one JSON object for this GPU to
// stdout and returns the measured throughput in TFLOPS. Any CUDA or cuBLASLt
// failure terminates the process through CHECK_CUDA / CHECK_CUBLAS.
static double run_one_gpu(int gpu, const Args &args) {
CHECK_CUDA(cudaSetDevice(gpu));
// Square problem: M = N = K = matrix_size.
const int64_t m = args.matrix_size;
const int64_t n = args.matrix_size;
const int64_t k = args.matrix_size;
const size_t a_elems = static_cast<size_t>(m) * k;
const size_t b_elems = static_cast<size_t>(k) * n;
const size_t d_elems = static_cast<size_t>(m) * n;
__nv_fp8_e4m3 *d_a = nullptr;
__nv_fp8_e4m3 *d_b = nullptr;
__nv_bfloat16 *d_d = nullptr;
void *workspace = nullptr;
float *d_scale_a = nullptr;
float *d_scale_b = nullptr;
// Both input scale factors are 1.0.
const float scale = 1.0f;
// workspace_mb is interpreted as MiB.
const size_t workspace_bytes = args.workspace_mb * 1024ULL * 1024ULL;
CHECK_CUDA(cudaMalloc(&d_a, a_elems * sizeof(__nv_fp8_e4m3)));
CHECK_CUDA(cudaMalloc(&d_b, b_elems * sizeof(__nv_fp8_e4m3)));
CHECK_CUDA(cudaMalloc(&d_d, d_elems * sizeof(__nv_bfloat16)));
CHECK_CUDA(cudaMalloc(&workspace, workspace_bytes));
CHECK_CUDA(cudaMalloc(&d_scale_a, sizeof(float)));
CHECK_CUDA(cudaMalloc(&d_scale_b, sizeof(float)));
CHECK_CUDA(cudaMemcpy(d_scale_a, &scale, sizeof(scale), cudaMemcpyHostToDevice));
CHECK_CUDA(cudaMemcpy(d_scale_b, &scale, sizeof(scale), cudaMemcpyHostToDevice));
// Fill both inputs with the constant 0.01 (converted to FP8) and zero the output.
const int threads = 256;
const int blocks = 4096;
fill_fp8<<<blocks, threads>>>(d_a, a_elems, 0.01f);
fill_fp8<<<blocks, threads>>>(d_b, b_elems, 0.01f);
CHECK_CUDA(cudaMemset(d_d, 0, d_elems * sizeof(__nv_bfloat16)));
// Picks up launch errors from the two fill kernels above.
CHECK_CUDA(cudaGetLastError());
CHECK_CUDA(cudaDeviceSynchronize());
cublasLtHandle_t lt;
cublasLtMatmulDesc_t op_desc;
cublasLtMatrixLayout_t a_desc, b_desc, d_desc;
cublasLtMatmulPreference_t preference;
CHECK_CUBLAS(cublasLtCreate(&lt));
// FP32 compute type and FP32 scale type.
CHECK_CUBLAS(cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F));
// cuBLASLt FP8 kernels require TN format: A is transposed, B is non-transposed.
// With square GEMMs this keeps the benchmark FLOP count identical to the PDF
// acceptance shape while satisfying the library's FP8 kernel constraints.
cublasOperation_t transa = CUBLAS_OP_T;
cublasOperation_t transb = CUBLAS_OP_N;
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
// The scale attributes take the device pointers themselves, hence the
// address-of on the pointer variables.
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a,
sizeof(d_scale_a)));
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b,
sizeof(d_scale_b)));
// Any non-zero --fast-accum value is normalized to 1.
int8_t fast_accum = args.fast_accum ? 1 : 0;
CHECK_CUBLAS(cublasLtMatmulDescSetAttribute(
op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fast_accum,
sizeof(fast_accum)));
// Layout arguments are (rows, cols, leading dimension): A is k x m (ld k),
// B is k x n (ld k), D is m x n (ld m).
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8F_E4M3, k, m, k));
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8F_E4M3, k, n, k));
CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16BF, m, n, m));
CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference));
CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute(
preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_bytes,
sizeof(workspace_bytes)));
// Request exactly one candidate algorithm; d_desc is passed for both the C
// and D layouts.
cublasLtMatmulHeuristicResult_t heuristic;
int returned = 0;
CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic(
lt, op_desc, a_desc, b_desc, d_desc, d_desc, preference, 1, &heuristic,
&returned));
if (returned == 0) {
std::fprintf(stderr, "No cuBLASLt heuristic returned for GPU %d\n", gpu);
std::exit(1);
}
// Readers for the selected algorithm's configuration attributes, one per
// storage width used below; each returns the value as int for printing.
auto get_algo_attr_i32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
int32_t value = -1;
size_t written = 0;
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
&heuristic.algo, attr, &value, sizeof(value), &written));
return static_cast<int>(value);
};
auto get_algo_attr_u32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
uint32_t value = 0;
size_t written = 0;
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
&heuristic.algo, attr, &value, sizeof(value), &written));
return static_cast<int>(value);
};
auto get_algo_attr_u16 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) {
uint16_t value = 0;
size_t written = 0;
CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute(
&heuristic.algo, attr, &value, sizeof(value), &written));
return static_cast<int>(value);
};
const int algo_id = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_ID);
const int tile_id = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_TILE_ID);
const int splitk = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_SPLITK_NUM);
const int stages = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_STAGES_ID);
const int inner_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID);
const int cluster_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID);
// alpha = 1, beta = 0; d_d is passed as both the C input and the D output.
const float alpha = 1.0f;
const float beta = 0.0f;
// One GEMM launch on stream 0 with the heuristic's algorithm.
auto matmul = [&]() {
CHECK_CUBLAS(cublasLtMatmul(lt, op_desc, &alpha, d_a, a_desc, d_b, b_desc,
&beta, d_d, d_desc, d_d, d_desc,
&heuristic.algo, workspace, workspace_bytes, 0));
};
// Untimed warmup launches, fully drained before timing starts.
for (int i = 0; i < args.warmup; ++i) {
matmul();
}
CHECK_CUDA(cudaDeviceSynchronize());
// Time all measured launches as one interval between two CUDA events.
cudaEvent_t start, stop;
CHECK_CUDA(cudaEventCreate(&start));
CHECK_CUDA(cudaEventCreate(&stop));
CHECK_CUDA(cudaEventRecord(start));
for (int i = 0; i < args.iterations; ++i) {
matmul();
}
CHECK_CUDA(cudaEventRecord(stop));
CHECK_CUDA(cudaEventSynchronize(stop));
float elapsed_ms = 0.0f;
CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop));
// 2*M*N*K floating-point operations per GEMM, summed over all timed launches.
const double flops =
2.0 * static_cast<double>(m) * static_cast<double>(n) *
static_cast<double>(k) * static_cast<double>(args.iterations);
const double tflops = flops / (static_cast<double>(elapsed_ms) / 1000.0) / 1e12;
// One JSON array element per GPU; the trailing comma is omitted only for the
// last GPU of the range [first_gpu, first_gpu + gpu_count).
// NOTE(review): assumes the caller has already resolved a negative
// args.gpu_count to a real count - confirm.
std::printf(
" {\"index\": %d, \"fp8_tflops\": %.1f, \"algo_id\": %d, "
"\"tile_id\": %d, \"splitk\": %d, \"stages_id\": %d, "
"\"inner_shape_id\": %d, \"cluster_shape_id\": %d}%s\n",
gpu, tflops, algo_id, tile_id, splitk, stages, inner_shape, cluster_shape,
(gpu + 1 == args.first_gpu + args.gpu_count) ? "" : ",");
std::fflush(stdout);
// Release events, cuBLASLt objects and device memory for this GPU.
CHECK_CUDA(cudaEventDestroy(start));
CHECK_CUDA(cudaEventDestroy(stop));
CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference));
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(a_desc));
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(b_desc));
CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(d_desc));
CHECK_CUBLAS(cublasLtMatmulDescDestroy(op_desc));
CHECK_CUBLAS(cublasLtDestroy(lt));
CHECK_CUDA(cudaFree(d_a));
CHECK_CUDA(cudaFree(d_b));
CHECK_CUDA(cudaFree(d_d));
CHECK_CUDA(cudaFree(workspace));
CHECK_CUDA(cudaFree(d_scale_a));
CHECK_CUDA(cudaFree(d_scale_b));
CHECK_CUDA(cudaDeviceSynchronize());
return tflops;
}
// Runs the FP8 GEMM benchmark on a contiguous range of GPUs and prints one
// JSON document with per-GPU results plus mean, min, max and spread.
//
// Exit status: 0 when the mean throughput reaches 1400 TFLOPS, 1 when it does
// not, 2 when the requested GPU range is empty or outside the visible devices.
int main(int argc, char **argv) {
  Args args = parse_args(argc, argv);
  int device_count = 0;
  CHECK_CUDA(cudaGetDeviceCount(&device_count));
  if (args.gpu_count < 0) {
    // Negative count means "every device from first_gpu onwards".
    args.gpu_count = device_count - args.first_gpu;
  }
  // The range must contain at least one GPU. Without the gpu_count check a
  // first_gpu beyond the last device (or a count of 0) passed validation,
  // after which the statistics below divided by zero and dereferenced the end
  // iterator of an empty vector. The upper bound is written as a subtraction
  // so that it cannot overflow.
  if (args.first_gpu < 0 || args.gpu_count < 1 ||
      args.gpu_count > device_count - args.first_gpu) {
    std::fprintf(stderr, "Invalid GPU range first=%d count=%d device_count=%d\n",
                 args.first_gpu, args.gpu_count, device_count);
    return 2;
  }

  std::vector<double> values;
  values.reserve(static_cast<size_t>(args.gpu_count));
  std::printf("{\n");
  std::printf(" \"source\": \"cuBLASLt\",\n");
  std::printf(" \"dtype\": \"fp8_e4m3_inputs_bf16_output_fp32_accum\",\n");
  std::printf(" \"matrix_size\": %d,\n", args.matrix_size);
  std::printf(" \"warmup\": %d,\n", args.warmup);
  std::printf(" \"iterations\": %d,\n", args.iterations);
  std::printf(" \"fast_accum\": %d,\n", args.fast_accum ? 1 : 0);
  std::printf(" \"per_gpu\": [\n");
  // Each call prints its own array element and returns the measured TFLOPS.
  for (int i = 0; i < args.gpu_count; ++i) {
    const int gpu = args.first_gpu + i;
    values.push_back(run_one_gpu(gpu, args));
  }

  // values is non-empty here because gpu_count >= 1 was enforced above.
  const double mean =
      std::accumulate(values.begin(), values.end(), 0.0) / values.size();
  const auto minmax = std::minmax_element(values.begin(), values.end());
  // Spread is (max - min) relative to the mean, in percent.
  const double spread = ((*minmax.second - *minmax.first) / mean) * 100.0;
  std::printf(" ],\n");
  std::printf(" \"mean_tflops\": %.1f,\n", mean);
  std::printf(" \"min_tflops\": %.1f,\n", *minmax.first);
  std::printf(" \"max_tflops\": %.1f,\n", *minmax.second);
  std::printf(" \"spread_pct\": %.2f\n", spread);
  std::printf("}\n");
  return mean >= 1400.0 ? 0 : 1;
}

View File

@ -0,0 +1,425 @@
#!/usr/bin/env bash
set -euo pipefail
# Deep-diagnose multi-node NCCL behavior from the coordinator node.
# Default values match the current 2-node H100 cross-leaf investigation.
# Run mode (first positional argument); see usage() for the accepted values.
MODE="${1:-all}"
# Launcher, benchmark binaries and cluster layout. Every value can be
# overridden from the environment.
MPI_BIN="${MPI_BIN:-/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun}"
NCCL_TESTS_DIR="${NCCL_TESTS_DIR:-/data/nccl-tests-latest/build}"
HOSTS="${HOSTS:-172.72.8.12:8,172.72.8.16:8}"
PEER_HOST="${PEER_HOST:-172.72.8.16}"
SSH_USER="${SSH_USER:-root}"
# HCAS is the space-separated list used for the sysfs counter snapshots.
HCAS="${HCAS:-mlx5_0 mlx5_1 mlx5_6 mlx5_7}"
# HCA_CSV is the comma-separated list exported as NCCL_IB_HCA. Unless it is
# set explicitly it is derived from HCAS, so overriding HCAS alone can no
# longer leave NCCL running on a different HCA set than the counters describe.
read -r -a _hca_names <<<"$HCAS"
HCA_CSV="${HCA_CSV:-$(IFS=,; echo "${_hca_names[*]}")}"
OUT_DIR="${OUT_DIR:-/tmp/nccl_deep_diagnose_$(date +%Y%m%d_%H%M%S)}"
# Message size range and iteration counts for the counter, graph and sweep runs.
BEGIN_SIZE="${BEGIN_SIZE:-16G}"
END_SIZE="${END_SIZE:-16G}"
WARMUP_ITERS="${WARMUP_ITERS:-10}"
ITERS="${ITERS:-10}"
GRAPH_WARMUP_ITERS="${GRAPH_WARMUP_ITERS:-1}"
GRAPH_ITERS="${GRAPH_ITERS:-1}"
SWEEP_WARMUP_ITERS="${SWEEP_WARMUP_ITERS:-3}"
SWEEP_ITERS="${SWEEP_ITERS:-5}"
NCCL_LD_LIBRARY_PATH="${NCCL_LD_LIBRARY_PATH:-/usr/mpi/gcc/openmpi-4.1.9a1/lib:/tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu:/usr/local/cuda-12.4/targets/x86_64-linux/lib}"
# NCCL_DEBUG as it was when the script started; restored before every case.
DEFAULT_NCCL_DEBUG="${NCCL_DEBUG:-WARN}"
# Per-port counters read from sysfs "counters" and "hw_counters".
COUNTERS="${COUNTERS:-port_xmit_data port_rcv_data port_xmit_packets port_rcv_packets port_xmit_wait port_xmit_discards port_rcv_errors port_rcv_remote_physical_errors port_rcv_switch_relay_errors port_xmit_constraint_errors port_rcv_constraint_errors symbol_error link_error_recovery link_downed local_link_integrity_errors excessive_buffer_overrun_errors VL15_dropped}"
HW_COUNTERS="${HW_COUNTERS:-roce_adp_retrans roce_adp_retrans_to roce_slow_restart roce_slow_restart_cnps roce_slow_restart_trans packet_seq_err out_of_sequence out_of_buffer duplicate_request implied_nak_seq_err local_ack_timeout_err req_transport_retries_exceeded rnr_nak_retry_err rx_write_requests rx_read_requests}"
mkdir -p "$OUT_DIR"
# Rank layout and control-plane interface for mpirun. The defaults reproduce
# the original fixed 2 x 8 layout on bond0; the total rank count now follows
# HOSTS, so overriding HOSTS no longer leaves "-np 16" behind.
# NOTE(review): counter snapshots elsewhere in this script still cover only
# the local node and PEER_HOST - confirm before using more than two nodes.
MPI_RANKS_PER_NODE="${MPI_RANKS_PER_NODE:-8}"
IFS=',' read -r -a _mpi_host_entries <<<"$HOSTS"
MPI_NP="${MPI_NP:-$(( ${#_mpi_host_entries[@]} * MPI_RANKS_PER_NODE ))}"
MPI_IFNAME="${MPI_IFNAME:-bond0}"
# Fixed mpirun prefix shared by every NCCL run; run_nccl appends the -x
# export flags and the benchmark command line.
mpi_base=(
  "$MPI_BIN"
  --allow-run-as-root
  --mca btl_openib_warn_no_device_params_found 0
  --mca btl_tcp_if_include "$MPI_IFNAME"
  --mca oob_tcp_if_include "$MPI_IFNAME"
  --mca plm_rsh_args "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes -o ConnectTimeout=10"
  -H "$HOSTS"
  --map-by "ppr:${MPI_RANKS_PER_NODE}:node"
  -np "$MPI_NP"
)
# Names of the environment variables that may be forwarded to the MPI ranks.
# mpi_xargs emits "-x NAME" only for the entries that are currently set.
base_exports=(
  LD_LIBRARY_PATH
  NCCL_IB_GID_INDEX
  NCCL_IB_SL
  NCCL_IB_TC
  NCCL_SOCKET_IFNAME
  NCCL_DEBUG
  NCCL_DEBUG_SUBSYS
  NCCL_IB_TIMEOUT
  NCCL_IB_HCA
  NCCL_NET_PLUGIN
  NCCL_NVLS_ENABLE
  NCCL_NET_GDR_LEVEL
  NCCL_NET_GDR_READ
  NCCL_DMABUF_ENABLE
  NCCL_PXN_DISABLE
  NCCL_IB_QPS_PER_CONNECTION
  NCCL_IB_SPLIT_DATA_ON_QPS
  NCCL_MIN_NCHANNELS
  NCCL_MAX_NCHANNELS
  NCCL_BUFFSIZE
  NCCL_P2P_NET_CHUNKSIZE
  NCCL_NCHANNELS_PER_NET_PEER
  NCCL_IB_AR_THRESHOLD
)
# Resets the NCCL environment to the baseline used by every test case.
#
# Case-specific tuning variables are unset, and the common variables are
# exported from a baseline captured on the first call (the caller's value if
# set, otherwise the built-in default). Capturing once matters: callers export
# per-case overrides after this function returns, and re-reading the live
# environment on the next call would keep such an override (for example
# NCCL_NVLS_ENABLE=0 from the "nvls_off" sweep case) for all later cases.
set_common_env() {
  if [[ -z "${_NCCL_BASELINE_CAPTURED:-}" ]]; then
    _BASE_NCCL_IB_GID_INDEX="${NCCL_IB_GID_INDEX:-3}"
    _BASE_NCCL_IB_SL="${NCCL_IB_SL:-5}"
    _BASE_NCCL_IB_TC="${NCCL_IB_TC:-136}"
    _BASE_NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME:-bond0}"
    _BASE_NCCL_IB_TIMEOUT="${NCCL_IB_TIMEOUT:-22}"
    _BASE_NCCL_NET_PLUGIN="${NCCL_NET_PLUGIN:-none}"
    _BASE_NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-1}"
    _BASE_NCCL_NET_GDR_LEVEL="${NCCL_NET_GDR_LEVEL:-5}"
    _BASE_NCCL_NET_GDR_READ="${NCCL_NET_GDR_READ:-1}"
    _BASE_NCCL_DMABUF_ENABLE="${NCCL_DMABUF_ENABLE:-0}"
    _NCCL_BASELINE_CAPTURED=1
  fi

  # Tuning knobs that only individual cases set; always start without them.
  unset NCCL_DEBUG_SUBSYS NCCL_PXN_DISABLE NCCL_IB_QPS_PER_CONNECTION
  unset NCCL_IB_SPLIT_DATA_ON_QPS NCCL_MIN_NCHANNELS NCCL_MAX_NCHANNELS
  unset NCCL_BUFFSIZE NCCL_P2P_NET_CHUNKSIZE NCCL_NCHANNELS_PER_NET_PEER
  unset NCCL_IB_AR_THRESHOLD

  export LD_LIBRARY_PATH="$NCCL_LD_LIBRARY_PATH"
  export NCCL_IB_GID_INDEX="$_BASE_NCCL_IB_GID_INDEX"
  export NCCL_IB_SL="$_BASE_NCCL_IB_SL"
  export NCCL_IB_TC="$_BASE_NCCL_IB_TC"
  export NCCL_SOCKET_IFNAME="$_BASE_NCCL_SOCKET_IFNAME"
  export NCCL_DEBUG="$DEFAULT_NCCL_DEBUG"
  export NCCL_IB_TIMEOUT="$_BASE_NCCL_IB_TIMEOUT"
  export NCCL_IB_HCA="$HCA_CSV"
  export NCCL_NET_PLUGIN="$_BASE_NCCL_NET_PLUGIN"
  export NCCL_NVLS_ENABLE="$_BASE_NCCL_NVLS_ENABLE"
  export NCCL_NET_GDR_LEVEL="$_BASE_NCCL_NET_GDR_LEVEL"
  export NCCL_NET_GDR_READ="$_BASE_NCCL_NET_GDR_READ"
  export NCCL_DMABUF_ENABLE="$_BASE_NCCL_DMABUF_ENABLE"
}
# Prints, one token per line, an "-x" / NAME pair for every entry of
# base_exports that is currently set (an empty value still counts as set).
mpi_xargs() {
  local var
  for var in "${base_exports[@]}"; do
    [[ -n "${!var+x}" ]] || continue
    printf -- '-x\n%s\n' "$var"
  done
}
# Runs one nccl-tests binary under mpirun and prints its bandwidth summary.
#   $1 label prefixed to the summary line   $2 benchmark binary
#   $3 log file (stdout and stderr)         $4 warmup iterations
#   $5 measured iterations
run_nccl() {
  local op="$1" bin="$2" log="$3" warmup="$4" iters="$5"
  local -a export_flags
  mapfile -t export_flags < <(mpi_xargs)
  local -a cmd=(
    "${mpi_base[@]}" "${export_flags[@]}"
    "$bin" -b "$BEGIN_SIZE" -e "$END_SIZE" -g 1 -f 2 -w "$warmup" -n "$iters"
  )
  "${cmd[@]}" >"$log" 2>&1
  # Echo every "Avg bus bandwidth" line from the log, tagged with the label.
  awk -v op="$op" '/Avg bus bandwidth/ {print op, $0}' "$log"
}
# Writes a counter snapshot of the local node to a file.
#   $1 host label used as the first column   $2 output file (truncated first)
# For every HCA in HCAS, each readable file under ports/1/counters and
# ports/1/hw_counters becomes one line: "<label> <hca> <kind> <counter> <value>".
# A value that cannot be read is recorded as 0.
read_one_snapshot() {
  local host_label="$1" out="$2"
  local hca counter path
  : >"$out"
  for hca in $HCAS; do
    for counter in $COUNTERS; do
      path="/sys/class/infiniband/$hca/ports/1/counters/$counter"
      [[ -r "$path" ]] || continue
      printf '%s %s counters %s %s\n' "$host_label" "$hca" "$counter" "$(cat "$path" 2>/dev/null || echo 0)" >>"$out"
    done
    for counter in $HW_COUNTERS; do
      path="/sys/class/infiniband/$hca/ports/1/hw_counters/$counter"
      [[ -r "$path" ]] || continue
      printf '%s %s hw_counters %s %s\n' "$host_label" "$hca" "$counter" "$(cat "$path" 2>/dev/null || echo 0)" >>"$out"
    done
  done
}
# Writes a counter snapshot of PEER_HOST to the file given as $1.
# The HCA and counter lists are passed to the remote shell as variable
# assignments; the quoted heredoc is the script that runs there, so its
# variables expand on the peer and its $HOSTNAME is the peer's host name.
# Output lines use the same five-column format as read_one_snapshot.
read_remote_snapshot() {
local out="$1"
# Non-interactive SSH: host keys are not checked or stored, and BatchMode
# makes the call fail instead of prompting.
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o BatchMode=yes -o ConnectTimeout=5 "${SSH_USER}@${PEER_HOST}" \
"HCAS='$HCAS' COUNTERS='$COUNTERS' HW_COUNTERS='$HW_COUNTERS' bash -s" <<'EOS' >"$out"
for hca in $HCAS; do
for c in $COUNTERS; do
f="/sys/class/infiniband/$hca/ports/1/counters/$c"
if [ -r "$f" ]; then
printf '%s %s counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)"
fi
done
for c in $HW_COUNTERS; do
f="/sys/class/infiniband/$hca/ports/1/hw_counters/$c"
if [ -r "$f" ]; then
printf '%s %s hw_counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)"
fi
done
done
EOS
}
# Compares counter snapshots taken before and after a run.
#   $1 before (node A)   $2 before (node B)
#   $3 after (node A)    $4 after (node B)
#   $5 report file
# The embedded Python merges both "before" files and both "after" files,
# then writes two sections: NONZERO_DELTAS lists every counter whose value
# changed (data counters are also converted to GiB at 4 bytes per unit), and
# ERROR_OR_CONGESTION_DELTAS repeats only the counters named in its
# "interesting" set, or prints "none". Lines that do not have exactly five
# fields or whose value is not an integer are ignored, and a counter missing
# from one side is treated as 0 on that side.
summarize_counter_delta() {
local before_a="$1"
local before_b="$2"
local after_a="$3"
local after_b="$4"
local out="$5"
python3 - "$before_a" "$before_b" "$after_a" "$after_b" >"$out" <<'PY'
import pathlib
import sys
interesting = {
"port_xmit_wait", "port_xmit_discards", "port_rcv_errors",
"port_rcv_remote_physical_errors", "port_rcv_switch_relay_errors",
"port_xmit_constraint_errors", "port_rcv_constraint_errors",
"symbol_error", "link_error_recovery", "link_downed",
"local_link_integrity_errors", "excessive_buffer_overrun_errors",
"VL15_dropped", "roce_adp_retrans", "roce_adp_retrans_to",
"roce_slow_restart", "roce_slow_restart_cnps", "roce_slow_restart_trans",
"packet_seq_err", "out_of_sequence", "out_of_buffer",
"duplicate_request", "implied_nak_seq_err", "local_ack_timeout_err",
"req_transport_retries_exceeded", "rnr_nak_retry_err",
}
def load(path):
data = {}
for line in pathlib.Path(path).read_text().splitlines():
parts = line.split()
if len(parts) != 5:
continue
host, hca, kind, counter, value = parts
try:
data[(host, hca, kind, counter)] = int(value)
except ValueError:
pass
return data
before = {}
after = {}
before.update(load(sys.argv[1]))
before.update(load(sys.argv[2]))
after.update(load(sys.argv[3]))
after.update(load(sys.argv[4]))
print("NONZERO_DELTAS")
for key in sorted(set(before) | set(after)):
delta = after.get(key, 0) - before.get(key, 0)
if not delta:
continue
host, hca, kind, counter = key
if counter in {"port_xmit_data", "port_rcv_data"}:
gib = delta * 4 / (1024 ** 3)
print(f"{host} {hca} {kind} {counter} {delta} words4B {gib:.2f} GiB")
else:
print(f"{host} {hca} {kind} {counter} {delta}")
print("ERROR_OR_CONGESTION_DELTAS")
seen = False
for key in sorted(set(before) | set(after)):
delta = after.get(key, 0) - before.get(key, 0)
if delta and key[3] in interesting:
seen = True
print(*key, delta)
if not seen:
print("none")
PY
}
# Runs one benchmark bracketed by local and remote counter snapshots.
#   $1 case label   $2 benchmark binary
#   $3 optional space-separated VAR=value assignments exported for this case
# Results go to "$OUT_DIR/<label>_counter"; the directory name is printed last.
run_counter_case() {
  local op="$1" bin="$2" extra="${3:-}"
  set_common_env
  [[ -z "$extra" ]] || eval "export $extra"

  local dir="$OUT_DIR/${op}_counter"
  local before_local="$dir/before.local" before_remote="$dir/before.remote"
  local after_local="$dir/after.local" after_remote="$dir/after.remote"
  mkdir -p "$dir"

  read_one_snapshot "$(hostname)" "$before_local"
  read_remote_snapshot "$before_remote"
  run_nccl "$op" "$bin" "$dir/${op}.log" "$WARMUP_ITERS" "$ITERS"
  read_one_snapshot "$(hostname)" "$after_local"
  read_remote_snapshot "$after_remote"

  summarize_counter_delta "$before_local" "$before_remote" "$after_local" "$after_remote" "$dir/counter_delta.txt"
  echo "$dir"
}
# Condenses an NCCL debug log ($1) into a short text summary ($2).
# The embedded Python extracts by regular expression or substring count:
# the last reported average bus bandwidth, the NCCL version strings, how
# often the "none" net plugin lookup failed, GPU Direct RDMA lines, the HCA
# sets in use, graph search patterns, channel summaries, P2P chunk sizes and
# types, per-transport token counts, and the number of channel edge lines.
summarize_graph_log() {
local log="$1"
local out="$2"
python3 - "$log" >"$out" <<'PY'
from pathlib import Path
import collections
import re
import sys
text = Path(sys.argv[1]).read_text(errors="ignore")
print("avg_busbw", (re.findall(r"Avg bus bandwidth\s*:\s*([0-9.]+)", text) or ["NA"])[-1])
print("nccl_version", sorted(set(re.findall(r"NCCL version ([^\s]+)", text))))
print("plugin_missing", len(re.findall(r"Could not find: none libnccl-net-none\.so", text)))
print("gdr_enabled_lines", len(re.findall(r"GPU Direct RDMA Enabled", text)))
print("using_hca")
for value, count in collections.Counter(re.findall(r"NET/IB : Using \[(.*?)\]; OOB", text)).most_common(4):
print(f" {count} {value}")
print("pattern_counts")
patterns = re.findall(
r"Pattern (\d+), crossNic (\d+), nChannels (\d+), bw ([0-9.]+)/([0-9.]+), type ([^,]+), sameChannels (\d+)",
text,
)
for key, count in collections.Counter(patterns).most_common():
print(f" {count} {key}")
print("channel_summary")
for value, count in collections.Counter(
re.findall(r"(\d+ coll channels, \d+ collnet channels, \d+ nvls channels, \d+ p2p channels, \d+ p2p channels per peer)", text)
).most_common():
print(f" {count} {value}")
print("p2p_chunks", collections.Counter(re.findall(r"P2P Chunksize set to (\d+)", text)))
print("check_p2p", collections.Counter(re.findall(r"Check P2P Type ([^\n]+)", text)))
for token in ["NET/IB/0/GDRDMA", "NET/IB/1/GDRDMA", "NET/IB/2/GDRDMA", "NET/IB/3/GDRDMA", "P2P/CUMEM", "P2P/IPC", "SHM"]:
print(token, text.count(token))
print("channel_edge_lines", len([line for line in text.splitlines() if "Channel " in line and ("via NET/IB" in line or "via P2P" in line)]))
PY
}
# Runs one benchmark with verbose NCCL logging and summarizes the log.
#   $1 case label   $2 benchmark binary
#   $3 optional space-separated VAR=value assignments exported for this case
# The log and "<label>_summary.txt" are written to "$OUT_DIR/graph"; the
# summary path is printed last.
run_graph_case() {
  local op="$1" bin="$2" extra="${3:-}"
  local dir="$OUT_DIR/graph"
  local log="$dir/${op}.log"
  local summary="$dir/${op}_summary.txt"

  set_common_env
  # INFO-level output for the subsystems that summarize_graph_log parses.
  export NCCL_DEBUG=INFO
  export NCCL_DEBUG_SUBSYS=INIT,NET,GRAPH,TUNING,COLL
  [[ -z "$extra" ]] || eval "export $extra"

  mkdir -p "$dir"
  run_nccl "$op" "$bin" "$log" "$GRAPH_WARMUP_ITERS" "$GRAPH_ITERS"
  summarize_graph_log "$log" "$summary"
  echo "$summary"
}
# Runs alltoall_perf with NCCL_PXN_DISABLE=1 once per tuning case and appends
# each case header and bandwidth line to "$OUT_DIR/pxn_sweep/summary.txt".
# Every entry of the case table is "<name>|<space-separated VAR=value list>".
run_pxn_sweep() {
  local dir="$OUT_DIR/pxn_sweep"
  local summary="$dir/summary.txt"
  local -a cases=(
    "baseline|"
    "nvls_off|NCCL_NVLS_ENABLE=0"
    "qps4_split1|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1"
    "qps8_split1|NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1"
    "qps4_split0|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0"
    "channels16|NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16"
    "buff8m|NCCL_BUFFSIZE=8388608"
    "p2pchunk4m|NCCL_P2P_NET_CHUNKSIZE=4194304"
    "netpeer8|NCCL_NCHANNELS_PER_NET_PEER=8"
    "ar0|NCCL_IB_AR_THRESHOLD=0"
  )
  local item name extra log

  mkdir -p "$dir"
  : >"$summary"
  for item in "${cases[@]}"; do
    # Split the entry at its first "|" into case name and assignments.
    name="${item%%|*}"
    extra="${item#*|}"
    log="$dir/${name}.log"

    set_common_env
    export NCCL_PXN_DISABLE=1
    [[ -z "$extra" ]] || eval "export $extra"

    {
      echo "===== CASE $name ====="
      echo "extra: ${extra:-none}"
      run_nccl "alltoall" "$NCCL_TESTS_DIR/alltoall_perf" "$log" "$SWEEP_WARMUP_ITERS" "$SWEEP_ITERS"
      awk '/Avg bus bandwidth/ {print}' "$log" | tail -1
    } | tee -a "$summary"
  done
  echo "$summary"
}
# Checks the prerequisites on the local node and on PEER_HOST and writes the
# result to "$OUT_DIR/preflight.txt" (also shown on stdout via tee).
# On each side it reports the host name, whether MPI_BIN is executable (with
# the first line of its --version output), whether the all_reduce_perf and
# alltoall_perf binaries are executable, and the port 1 state and rate of
# every HCA in HCAS. Missing items are reported as "MISSING ..." lines; they
# do not by themselves make the function fail.
run_preflight() {
set_common_env
local out="$OUT_DIR/preflight.txt"
{
echo "===== LOCAL ====="
echo "hostname: $(hostname)"
echo "mpirun: $MPI_BIN"
if [[ -x "$MPI_BIN" ]]; then
"$MPI_BIN" --version 2>&1 | sed -n '1p'
else
echo "MISSING executable: $MPI_BIN"
fi
for bin in "$NCCL_TESTS_DIR/all_reduce_perf" "$NCCL_TESTS_DIR/alltoall_perf"; do
if [[ -x "$bin" ]]; then
echo "OK executable: $bin"
else
echo "MISSING executable: $bin"
fi
done
for hca in $HCAS; do
local state="/sys/class/infiniband/$hca/ports/1/state"
local rate="/sys/class/infiniband/$hca/ports/1/rate"
if [[ -r "$state" ]]; then
echo "OK HCA: $hca state=$(cat "$state") rate=$(cat "$rate" 2>/dev/null || echo unknown)"
else
echo "MISSING HCA path: $hca"
fi
done
echo "===== REMOTE ====="
# The same checks run on the peer: the paths and HCA list are passed as
# variable assignments and the quoted heredoc is executed by the remote bash.
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
-o BatchMode=yes -o ConnectTimeout=5 "${SSH_USER}@${PEER_HOST}" \
"MPI_BIN='$MPI_BIN' NCCL_TESTS_DIR='$NCCL_TESTS_DIR' HCAS='$HCAS' bash -s" <<'EOS'
echo "hostname: $(hostname)"
echo "mpirun: $MPI_BIN"
if [ -x "$MPI_BIN" ]; then
"$MPI_BIN" --version 2>&1 | sed -n '1p'
else
echo "MISSING executable: $MPI_BIN"
fi
for bin in "$NCCL_TESTS_DIR/all_reduce_perf" "$NCCL_TESTS_DIR/alltoall_perf"; do
if [ -x "$bin" ]; then
echo "OK executable: $bin"
else
echo "MISSING executable: $bin"
fi
done
for hca in $HCAS; do
state="/sys/class/infiniband/$hca/ports/1/state"
rate="/sys/class/infiniband/$hca/ports/1/rate"
if [ -r "$state" ]; then
echo "OK HCA: $hca state=$(cat "$state") rate=$(cat "$rate" 2>/dev/null || echo unknown)"
else
echo "MISSING HCA path: $hca"
fi
done
EOS
} | tee "$out"
echo "$out"
}
# Prints the accepted modes, the output directory and the most common
# environment overrides to stdout.
usage() {
  printf 'Usage: %s [preflight|all|allreduce-counter|alltoall-counter|graph|pxn-sweep]\n' "$0"
  printf 'Outputs are written to: %s\n' "$OUT_DIR"
  printf '%s\n' \
    'Common overrides:' \
    'HOSTS, PEER_HOST, HCAS, HCA_CSV, MPI_BIN, NCCL_TESTS_DIR,' \
    'NCCL_LD_LIBRARY_PATH, BEGIN_SIZE, END_SIZE, WARMUP_ITERS, ITERS'
}
# Dispatch on the requested mode. "all" runs the preflight, both counter
# cases, both graph cases and the PXN sweep in that order; an unknown mode
# prints the usage text and exits with status 2.
allreduce_bin="$NCCL_TESTS_DIR/all_reduce_perf"
alltoall_bin="$NCCL_TESTS_DIR/alltoall_perf"
case "$MODE" in
  preflight)
    run_preflight
    ;;
  all)
    run_preflight
    run_counter_case allreduce "$allreduce_bin" ""
    run_counter_case alltoall_pxn "$alltoall_bin" "NCCL_PXN_DISABLE=1"
    run_graph_case allreduce "$allreduce_bin" ""
    run_graph_case alltoall_pxn "$alltoall_bin" "NCCL_PXN_DISABLE=1"
    run_pxn_sweep
    ;;
  allreduce-counter)
    run_counter_case allreduce "$allreduce_bin" ""
    ;;
  alltoall-counter)
    run_counter_case alltoall_pxn "$alltoall_bin" "NCCL_PXN_DISABLE=1"
    ;;
  graph)
    run_graph_case allreduce "$allreduce_bin" ""
    run_graph_case alltoall_pxn "$alltoall_bin" "NCCL_PXN_DISABLE=1"
    ;;
  pxn-sweep)
    run_pxn_sweep
    ;;
  -h | --help | help)
    usage
    ;;
  *)
    usage
    exit 2
    ;;
esac
echo "OUT_DIR=$OUT_DIR"

View File

@ -0,0 +1,169 @@
#!/usr/bin/env bash
set -euo pipefail
# Collect a lightweight NCCL/RDMA environment snapshot on one node.
# This script does not run NCCL workloads and is safe to use before deeper tests.
# Host name and timestamp used in the default report file name.
HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
# Report path precedence: first argument, then $OUT_FILE, then a file in /tmp.
OUT_FILE="${OUT_FILE:-/tmp/nccl_environment_snapshot_${HOST}_${TS}.md}"
OUT_FILE="${1:-$OUT_FILE}"
# Reference bus-bandwidth targets (GB/s) and directories searched for plugins.
PDF_ALLREDUCE_BUSBW="${PDF_ALLREDUCE_BUSBW:-491.84}"
PDF_ALLTOALL_BUSBW="${PDF_ALLTOALL_BUSBW:-76.54}"
PLUGIN_SEARCH_ROOTS="${PLUGIN_SEARCH_ROOTS:-/usr /opt /tmp /root}"
mkdir -p "$(dirname "$OUT_FILE")"
# Globs that match nothing expand to nothing instead of staying literal.
shopt -s nullglob
# Succeeds when "$1" resolves to a command in the current shell; prints nothing.
have_cmd() {
  command -v "$1" &>/dev/null
}
# Appends a "### <title>" section to OUT_FILE containing the combined stdout
# and stderr of the given command inside a fenced text block.
#   $1 section title   $2... command and arguments
# A failing command does not abort the script.
emit_cmd() {
  local title="$1"
  shift
  {
    printf '\n### %s\n\n' "$title"
    printf '%s\n' '```text'
    "$@" 2>&1 || true
    printf '%s\n' '```'
  } >>"$OUT_FILE"
}
# Accumulators filled by the HCA scan: names of ports that qualify as active
# 400G InfiniBand rails, and one descriptive row for every other port.
active_400g_hcas=()
non_400g_rows=()
# Start the report (truncating any existing file): title, host facts and the
# header of the HCA/port table.
{
  printf '%s\n\n' "# NCCL/RDMA 环境快照"
  printf -- '- Host: `%s`\n' "$HOST"
  printf -- '- Time: `%s`\n' "$(date -Is 2>/dev/null || date)"
  printf -- '- Kernel: `%s`\n' "$(uname -r 2>/dev/null || echo unknown)"
  printf '\n%s\n\n' "## HCA / Port 状态"
  printf '%s\n' "| HCA | Port | State | Phys State | Rate | Link Layer | 400G IB Rail |"
  printf '%s\n' "|---|---:|---|---|---:|---|---|"
} >"$OUT_FILE"
# Emit one table row per port of every mlx5_* device found in sysfs. A port
# counts as a 400G IB rail when its state contains ACTIVE, its rate starts
# with "400 Gb/sec" and its link layer is InfiniBand; all other ports are
# remembered in non_400g_rows. Attributes that cannot be read show as NA.
hca_paths=(/sys/class/infiniband/mlx5_*)
if ((${#hca_paths[@]} == 0)); then
  printf '| N/A | N/A | `%s` | N/A | N/A | N/A | NO |\n' "/sys/class/infiniband/mlx5_* not found" >>"$OUT_FILE"
else
  for hca_path in "${hca_paths[@]}"; do
    hca="${hca_path##*/}"
    for port_path in "$hca_path"/ports/*; do
      if [[ ! -d "$port_path" ]]; then
        continue
      fi
      port="${port_path##*/}"
      state="$(cat "$port_path/state" 2>/dev/null || echo NA)"
      phys_state="$(cat "$port_path/phys_state" 2>/dev/null || echo NA)"
      rate="$(cat "$port_path/rate" 2>/dev/null || echo NA)"
      layer="$(cat "$port_path/link_layer" 2>/dev/null || echo NA)"
      if [[ "$state" != *"ACTIVE"* || "$rate" != 400\ Gb/sec* || "$layer" != "InfiniBand" ]]; then
        is_400g="NO"
        non_400g_rows+=("$hca port=$port state=$state rate=$rate layer=$layer")
      else
        is_400g="YES"
        active_400g_hcas+=("$hca")
      fi
      printf '| `%s` | `%s` | `%s` | `%s` | `%s` | `%s` | `%s` |\n' \
        "$hca" "$port" "$state" "$phys_state" "$rate" "$layer" "$is_400g" >>"$OUT_FILE"
    done
  done
fi
# Append the rail summary (count, candidate NCCL_IB_HCA list and raw one-way
# bandwidth at 50 GB/s per rail), the list of non-qualifying ports, and the
# reference bus-bandwidth targets.
{
  printf '\n%s\n\n' "## Rail 摘要"
  rail_count="${#active_400g_hcas[@]}"
  if ((rail_count > 0)); then
    hca_csv="$(IFS=,; echo "${active_400g_hcas[*]}")"
    echo "- Active 400G IB rail count: \`${rail_count}\`"
    echo "- Candidate \`NCCL_IB_HCA\`: \`$hca_csv\`"
    echo "- Theoretical one-way raw bandwidth: \`${rail_count} * 400Gb/s / 8 = $((rail_count * 50)) GB/s\`"
  else
    echo "- Active 400G IB rail count: \`0\`"
    echo "- Candidate \`NCCL_IB_HCA\`: \`N/A\`"
  fi
  printf '\n%s\n\n' "Non-400G / non-IB / down ports:"
  if ((${#non_400g_rows[@]} == 0)); then
    echo "- none"
  else
    for row in "${non_400g_rows[@]}"; do
      echo "- \`$row\`"
    done
  fi
  printf '\n%s\n\n' "## PDF 目标换算"
  echo "- PDF allreduce busbw target: \`${PDF_ALLREDUCE_BUSBW} GB/s\`"
  echo "- PDF alltoall busbw target: \`${PDF_ALLTOALL_BUSBW} GB/s\`"
} >>"$OUT_FILE"
# Append the algorithm bandwidth implied by the allreduce bus-bandwidth
# target (busbw / 1.875) and compare it with the raw bandwidth of the
# detected rails (50 GB/s each). Arguments: target busbw, rail count.
# A failure of the Python step is ignored so the rest of the report is
# still written.
python3 - "$PDF_ALLREDUCE_BUSBW" "${#active_400g_hcas[@]}" >>"$OUT_FILE" <<'PY' || true
import sys
busbw = float(sys.argv[1])
rail_count = int(sys.argv[2])
algbw = busbw / 1.875
raw = rail_count * 50.0
print(f"- 16-rank allreduce implied algbw: `{algbw:.2f} GB/s`")
if rail_count:
pct = algbw / raw * 100
print(f"- Implied algbw / current raw 400G rail bandwidth: `{pct:.1f}%`")
if algbw > raw:
print("- Interpretation: PDF allreduce target is above current 400G rail one-way raw bandwidth.")
else:
print("- Interpretation: PDF allreduce target is within current 400G rail one-way raw bandwidth.")
else:
print("- Interpretation: no active 400G IB rail was detected.")
PY
# Append the list of NCCL net plugin and SHARP libraries found under
# PLUGIN_SEARCH_ROOTS. The "none found" decision is taken from the search
# result itself; previously it grepped the whole report, so any earlier
# mention of "libnccl-net" or "libsharp" in the file would have suppressed it.
read -r -a plugin_roots <<<"$PLUGIN_SEARCH_ROOTS"
# find exits non-zero for unreadable or missing roots; keep what it printed.
plugin_hits="$(find "${plugin_roots[@]}" \( -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' \) \
  2>/dev/null | sort || true)"
{
  echo
  echo "## NCCL Net Plugin / SHARP 文件"
  echo
  echo '```text'
  if [[ -n "$plugin_hits" ]]; then
    printf '%s\n' "$plugin_hits"
  else
    echo "none found under $PLUGIN_SEARCH_ROOTS"
  fi
  echo '```'
} >>"$OUT_FILE"
# Append package and tool sections. Each section is produced by emit_cmd, so
# a failing command leaves an empty or partial block instead of aborting.
if have_cmd dpkg; then
  # "grep -E" replaces the obsolescent "egrep": recent GNU grep prints a
  # deprecation warning on stderr, which emit_cmd would capture into the report.
  emit_cmd "Relevant Debian packages" bash -lc "dpkg -l | grep -Ei 'nccl|sharp|hcoll|ucx|ofed|mlnx' | sed -n '1,160p'"
else
  emit_cmd "Relevant packages" bash -lc "echo 'dpkg not found'"
fi
if have_cmd nvidia-smi; then
  emit_cmd "nvidia-smi topo -m" nvidia-smi topo -m
else
  emit_cmd "nvidia-smi topo -m" bash -lc "echo 'nvidia-smi not found'"
fi
# ibstat is optional; its section is simply omitted when the tool is absent.
if have_cmd ibstat; then
  emit_cmd "ibstat" ibstat
fi
# Append the closing guidance section and the report path, then print the
# path on stdout for the caller. Item 2 gains the comma that was missing
# between the library names and the following clause.
{
  echo
  echo "## 建议判断"
  echo
  echo "1. 如果 Active 400G IB rail 少于 PDF 参考环境，不能直接按 PDF 阈值判断等价。"
  echo "2. 如果没有 \`libnccl-net*.so*\` / \`libsharp*.so*\`，NCCL 可能只能走 internal IB plugin。"
  echo "3. 若要追 PDF 2x8 目标，请先确认 rail 数量、SHARP/NCCL net plugin、跨 Leaf 交换策略是否与 PDF 环境一致。"
  echo
  echo "Snapshot written to: \`$OUT_FILE\`"
} >>"$OUT_FILE"
echo "$OUT_FILE"

277
scripts/pytorch_fp8_path_bench.py Executable file
View File

@ -0,0 +1,277 @@
#!/usr/bin/env python3
"""Compare FP8 GEMM paths used for H100/H200 acceptance debugging.
Paths:
A. torch._scaled_mm eager, default accumulation
B. torch._scaled_mm eager, use_fast_accum=True
C. CUDA Graph replay of torch._scaled_mm(out=..., use_fast_accum=True)
D. Transformer Engine Linear under fp8_autocast, when installed
"""
from __future__ import annotations
import argparse
import json
import statistics
import sys
import time
from typing import Any, Callable
import torch
def tflops_from_ms(matrix_size: int, iterations: int, elapsed_ms: float) -> float:
    """Convert a total elapsed time in milliseconds into TFLOP/s.

    Each square GEMM is counted as ``2 * matrix_size**3`` floating-point
    operations, and ``iterations`` of them are taken to have run within
    ``elapsed_ms``.
    """
    total_flops = 2.0 * matrix_size * matrix_size * matrix_size * iterations
    elapsed_sec = elapsed_ms / 1000.0
    flops_per_sec = total_flops / elapsed_sec
    return flops_per_sec / 1e12
def cuda_event_bench(
    name: str,
    matrix_size: int,
    iterations: int,
    warmup: int,
    func: Callable[[int], Any],
) -> dict[str, Any]:
    """Time ``iterations`` calls of ``func`` with CUDA events.

    ``func`` is called with the zero-based call index, first ``warmup`` times
    untimed and then ``iterations`` times between two recorded events. The
    returned dict holds the inputs, the total event time, the per-iteration
    event time in microseconds, the host wall-clock time for the timed section
    and the TFLOP/s derived from the event time.
    """
    for warm_idx in range(warmup):
        func(warm_idx)
    torch.cuda.synchronize()

    ev_begin = torch.cuda.Event(enable_timing=True)
    ev_finish = torch.cuda.Event(enable_timing=True)
    # Wall clock starts before the first event so it also covers launch overhead.
    wall_t0 = time.perf_counter()
    ev_begin.record()
    for call_idx in range(iterations):
        func(call_idx)
    ev_finish.record()
    torch.cuda.synchronize()
    wall_elapsed = time.perf_counter() - wall_t0
    event_ms = ev_begin.elapsed_time(ev_finish)

    report: dict[str, Any] = {"name": name, "status": "ok"}
    report["matrix_size"] = matrix_size
    report["iterations"] = iterations
    report["warmup"] = warmup
    report["event_ms_total"] = round(event_ms, 3)
    report["event_us_per_iter"] = round(event_ms * 1000.0 / iterations, 3)
    report["wall_ms_total"] = round(wall_elapsed * 1000.0, 3)
    report["tflops"] = round(tflops_from_ms(matrix_size, iterations, event_ms), 1)
    return report
def make_fp8_inputs(matrix_size: int, pools: int, device: str) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
a = [
torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
for _ in range(pools)
]
b = [
torch.randn(matrix_size, matrix_size, device=device, dtype=torch.float32).to(torch.float8_e4m3fn)
for _ in range(pools)
]
torch.cuda.synchronize()
return a, b
def bench_scaled_mm(args: argparse.Namespace) -> list[dict[str, Any]]:
    """Benchmark three ways of driving ``torch._scaled_mm`` on one GPU.

    Paths, in result order:
        A. eager call with default accumulation,
        B. eager call with ``use_fast_accum=True``,
        C. CUDA Graph replay of the fast-accum call writing into a fixed
           output buffer.

    A and B rotate through ``args.pools`` input pairs; C always uses the first
    pair. Any exception while warming up, capturing or replaying the graph is
    recorded as an ``"unavailable"`` row instead of being raised.

    Args:
        args: Parsed CLI namespace; reads ``gpu_index``, ``matrix_size``,
            ``pools``, ``iterations`` and ``warmup``.

    Returns:
        One result dict per path (three entries).
    """
    size = args.matrix_size
    device = f"cuda:{args.gpu_index}"
    torch.cuda.set_device(args.gpu_index)
    scale_a = torch.tensor(1.0, device=device)
    scale_b = torch.tensor(1.0, device=device)
    lhs_pool, rhs_pool = make_fp8_inputs(size, args.pools, device)

    def run_default_accum(i: int) -> torch.Tensor:
        slot = i % args.pools
        return torch._scaled_mm(
            lhs_pool[slot],
            rhs_pool[slot].T,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
        )

    def run_fast_accum(i: int) -> torch.Tensor:
        slot = i % args.pools
        return torch._scaled_mm(
            lhs_pool[slot],
            rhs_pool[slot].T,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
            use_fast_accum=True,
        )

    eager_paths = (
        ("A_eager_scaled_mm_default", run_default_accum),
        ("B_eager_scaled_mm_fast_accum", run_fast_accum),
    )
    results: list[dict[str, Any]] = [
        cuda_event_bench(label, size, args.iterations, args.warmup, fn)
        for label, fn in eager_paths
    ]

    # Path C: fixed operands and output buffer, reused by warmup, capture and
    # every replay.
    graph_label = "C_cuda_graph_scaled_mm_fast_accum"
    graph_out = torch.empty((size, size), device=device, dtype=torch.bfloat16)
    static_a = lhs_pool[0]
    static_b_t = rhs_pool[0].T

    def launch_static() -> None:
        torch._scaled_mm(
            static_a,
            static_b_t,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
            use_fast_accum=True,
            out=graph_out,
        )

    try:
        # Warm the kernel up on a side stream before capture.
        side_stream = torch.cuda.Stream()
        side_stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(side_stream):
            for _ in range(max(3, args.warmup // 2)):
                launch_static()
        torch.cuda.current_stream().wait_stream(side_stream)
        torch.cuda.synchronize()

        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph):
            launch_static()

        def graph_replay(_: int) -> None:
            graph.replay()

        results.append(cuda_event_bench(graph_label, size, args.iterations, 3, graph_replay))
    except Exception as exc:  # noqa: BLE001
        results.append(
            {
                "name": graph_label,
                "status": "unavailable",
                "reason": f"{type(exc).__name__}: {exc}",
            }
        )
    return results
def bench_transformer_engine(args: argparse.Namespace) -> dict[str, Any]:
    """Benchmark a Transformer Engine ``Linear`` forward under ``fp8_autocast``.

    The function never raises for Transformer Engine problems; it reports them
    in the returned row so the results of the other paths are still emitted:

    * import failure            -> ``status: "unavailable"``
    * layer/recipe setup failure -> ``status: "error"``
    * failure during the run     -> ``status: "error"``

    A CUDA out-of-memory error raised while allocating the input or building
    the layer is re-raised so the caller's OOM handling still applies.

    Args:
        args: Parsed CLI namespace; reads ``gpu_index``, ``matrix_size``,
            ``iterations`` and ``warmup``.

    Returns:
        A single result dict named ``D_transformer_engine_fp8_linear``.
    """
    label = "D_transformer_engine_fp8_linear"

    def failure(status: str, exc: BaseException) -> dict[str, Any]:
        return {
            "name": label,
            "status": status,
            "reason": f"{type(exc).__name__}: {exc}",
        }

    try:
        import transformer_engine.pytorch as te  # type: ignore[import-not-found]
        from transformer_engine.common.recipe import DelayedScaling, Format  # type: ignore[import-not-found]
    except Exception as exc:  # noqa: BLE001
        return failure("unavailable", exc)

    device = f"cuda:{args.gpu_index}"
    try:
        x = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=torch.bfloat16)
        layer = te.Linear(
            args.matrix_size,
            args.matrix_size,
            bias=False,
            params_dtype=torch.bfloat16,
            device=device,
        )
        recipe = DelayedScaling(fp8_format=Format.HYBRID)
    except torch.cuda.OutOfMemoryError:
        # Keep OOM during setup visible to the caller, as before.
        raise
    except Exception as exc:  # noqa: BLE001
        # An importable but unusable TE build must not abort the whole script.
        return failure("error", exc)

    def run(_: int) -> torch.Tensor:
        with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
            return layer(x)

    try:
        result = cuda_event_bench(
            label,
            args.matrix_size,
            args.iterations,
            args.warmup,
            run,
        )
    except Exception as exc:  # noqa: BLE001
        return failure("error", exc)
    result["note"] = "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
    return result
def main() -> int:
    """CLI entry point: run every FP8 path and print one JSON document.

    Numeric options are range-checked right after parsing, so a bad value
    produces an argparse usage error (exit status 2) instead of a
    ``ZeroDivisionError``/``IndexError`` from inside a benchmark.

    Returns:
        0 when the benchmarks ran (individual paths may still report
        ``unavailable``/``error``); 1 when CUDA or the FP8 ops are missing,
        the GPU index does not exist, or a CUDA out-of-memory error occurred.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--matrix-size", type=int, default=8192)
    parser.add_argument("--warmup", type=int, default=20)
    parser.add_argument("--iterations", type=int, default=100)
    parser.add_argument("--gpu-index", type=int, default=0)
    parser.add_argument("--pools", type=int, default=4)
    args = parser.parse_args()

    # iterations is a divisor and pools a modulus further down; both must be
    # at least 1. The remaining options only need to be non-negative/positive.
    lower_bounds = (
        ("--matrix-size", args.matrix_size, 1),
        ("--warmup", args.warmup, 0),
        ("--iterations", args.iterations, 1),
        ("--gpu-index", args.gpu_index, 0),
        ("--pools", args.pools, 1),
    )
    for flag, value, minimum in lower_bounds:
        if value < minimum:
            parser.error(f"{flag} must be >= {minimum} (got {value})")

    if not torch.cuda.is_available():
        print(json.dumps({"error": "cuda unavailable"}, indent=2))
        return 1
    if not hasattr(torch, "_scaled_mm") or not hasattr(torch, "float8_e4m3fn"):
        print(json.dumps({"error": "torch FP8 _scaled_mm unavailable"}, indent=2))
        return 1
    device_count = torch.cuda.device_count()
    if args.gpu_index >= device_count:
        message = f"gpu index {args.gpu_index} out of range; {device_count} CUDA device(s) visible"
        print(json.dumps({"error": message}, indent=2))
        return 1

    torch.cuda.set_device(args.gpu_index)
    props = torch.cuda.get_device_properties(args.gpu_index)
    payload = {
        "source": "pytorch_fp8_path_bench",
        "torch": torch.__version__,
        "cuda": torch.version.cuda,
        "gpu_index": args.gpu_index,
        "gpu_name": props.name,
        "matrix_size": args.matrix_size,
        "warmup": args.warmup,
        "iterations": args.iterations,
        "results": [],
    }
    try:
        payload["results"].extend(bench_scaled_mm(args))
        payload["results"].append(bench_transformer_engine(args))
    except torch.cuda.OutOfMemoryError as exc:
        # Still emit whatever was collected before memory ran out.
        payload["error"] = f"CUDA OOM: {exc}"
        print(json.dumps(payload, indent=2))
        return 1

    # Summary covers only the paths that actually produced a measurement.
    ok_values = [r["tflops"] for r in payload["results"] if r.get("status") == "ok"]
    if ok_values:
        payload["summary"] = {
            "max_tflops": round(max(ok_values), 1),
            "min_tflops": round(min(ok_values), 1),
            "mean_tflops": round(statistics.mean(ok_values), 1),
        }
    print(json.dumps(payload, indent=2))
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@ -0,0 +1,45 @@
#!/usr/bin/env bash
# Compile scripts/cublaslt_fp8_gemm_bench.cu with nvcc, run the resulting
# binary, and tee its stdout into a host- and time-stamped .json file in OUT_DIR.
#
# Every setting is an environment override (default in parentheses):
#   CUDA_HOME (/usr/local/cuda)   NVCC ($CUDA_HOME/bin/nvcc)   CUDA_ARCH (sm_90)
#   OUT_DIR (<project>/reports)   MATRIX_SIZE (8192)   WARMUP (20)
#   ITERATIONS (200)   GPU_COUNT (8)   FIRST_GPU (0)   WORKSPACE_MB (256)
#
# Exit status: non-zero when nvcc is missing, a directory cannot be created or
# the build fails; otherwise the benchmark's own exit status.
#
# errexit is off so the benchmark's status can be captured and returned after
# the summary line; each preparatory step is therefore checked explicitly.
set -uo pipefail

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
CUDA_ARCH="${CUDA_ARCH:-sm_90}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
WARMUP="${WARMUP:-20}"
ITERATIONS="${ITERATIONS:-200}"
GPU_COUNT="${GPU_COUNT:-8}"
FIRST_GPU="${FIRST_GPU:-0}"
WORKSPACE_MB="${WORKSPACE_MB:-256}"

if [[ ! -x "$NVCC" ]]; then
    echo "nvcc not found: $NVCC" >&2
    exit 1
fi

if ! mkdir -p "$OUT_DIR" "$PROJECT_DIR/build"; then
    echo "Cannot create $OUT_DIR or $PROJECT_DIR/build" >&2
    exit 1
fi

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
BIN="$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
REPORT="$OUT_DIR/cublaslt_fp8_gemm_${HOST}_${TS}.json"

"$NVCC" -O3 -std=c++17 "-arch=$CUDA_ARCH" \
    "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
    -lcublasLt -lcublas -o "$BIN"
build_status=$?
if ((build_status != 0)); then
    # Without this check a binary left over from an earlier build would be
    # benchmarked and reported as if it were the current source.
    echo "Build failed (nvcc exit code $build_status); not running $BIN" >&2
    exit "$build_status"
fi

"$BIN" \
    --matrix-size "$MATRIX_SIZE" \
    --warmup "$WARMUP" \
    --iterations "$ITERATIONS" \
    --first-gpu "$FIRST_GPU" \
    --gpu-count "$GPU_COUNT" \
    --workspace-mb "$WORKSPACE_MB" \
    | tee "$REPORT"
# PIPESTATUS[0] is the benchmark's status, not tee's.
status=${PIPESTATUS[0]}

echo "Report written to: $REPORT"
exit "$status"

View File

@ -0,0 +1,93 @@
#!/usr/bin/env bash
# Compare FP8 GEMM paths on one GPU and merge the results into one report:
#   1. run scripts/pytorch_fp8_path_bench.py            -> fp8_paths_pytorch_*.json
#   2. build and run scripts/cublaslt_fp8_gemm_bench.cu  -> fp8_paths_cublaslt_*.json
#   3. merge both into                                   -> fp8_paths_combined_*.json
# The path of the combined report is printed as the last line of stdout.
#
# Environment overrides (default in parentheses):
#   PYTHON (/root/gpu-test-venv/bin/python)   CUDA_HOME (/usr/local/cuda-12.4)
#   NVCC ($CUDA_HOME/bin/nvcc)   CUDA_ARCH (sm_90)   OUT_DIR (<project>/reports)
#   MATRIX_SIZE (8192)   WARMUP (20)   ITERATIONS (100)   GPU_INDEX (0)
#   WORKSPACE_MB (256)
#
# errexit and pipefail are on: the first failing step aborts the script.
set -euo pipefail

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON="${PYTHON:-/root/gpu-test-venv/bin/python}"
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}"
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
CUDA_ARCH="${CUDA_ARCH:-sm_90}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
WARMUP="${WARMUP:-20}"
ITERATIONS="${ITERATIONS:-100}"
GPU_INDEX="${GPU_INDEX:-0}"
WORKSPACE_MB="${WORKSPACE_MB:-256}"

# Locate the interpreter's site-packages so the cuDNN/NCCL libraries shipped
# as pip wheels can be found by the dynamic loader.
VENV_SITE_PACKAGES="$("$PYTHON" - <<'PY'
import site
print(site.getsitepackages()[0])
PY
)"
# Append the inherited value only when it is non-empty: a trailing ':' would
# add an empty entry, which the loader interprets as the current directory.
export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json"
CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json"
COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json"

# Step 1: PyTorch paths.
"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \
    --matrix-size "$MATRIX_SIZE" \
    --warmup "$WARMUP" \
    --iterations "$ITERATIONS" \
    --gpu-index "$GPU_INDEX" | tee "$PY_REPORT"

# Step 2: direct cuBLASLt path on the same single GPU with fast accumulation.
"$NVCC" -O3 -std=c++17 "-arch=$CUDA_ARCH" \
    "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
    -lcublasLt -lcublas -o "$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
"$PROJECT_DIR/build/cublaslt_fp8_gemm_bench" \
    --matrix-size "$MATRIX_SIZE" \
    --warmup "$WARMUP" \
    --iterations "$ITERATIONS" \
    --first-gpu "$GPU_INDEX" \
    --gpu-count 1 \
    --workspace-mb "$WORKSPACE_MB" \
    --fast-accum 1 | tee "$CUBLAS_REPORT"

# Step 3: merge. The first per_gpu row of the cuBLASLt report is appended to
# the PyTorch results as path "E", with fp8_tflops renamed to tflops.
"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY'
import json
import pathlib
import sys

py_report = pathlib.Path(sys.argv[1])
cublas_report = pathlib.Path(sys.argv[2])
combined_report = pathlib.Path(sys.argv[3])

with py_report.open() as f:
    py_payload = json.load(f)
with cublas_report.open() as f:
    cublas_payload = json.load(f)

combined = {
    "source": "fp8_path_comparison",
    "host": cublas_payload.get("host"),
    "matrix_size": py_payload.get("matrix_size"),
    "gpu_index": py_payload.get("gpu_index"),
    "pytorch": py_payload,
    "cublaslt": cublas_payload,
    "results": [],
}
combined["results"].extend(py_payload.get("results", []))

per_gpu = cublas_payload.get("per_gpu", [])
if per_gpu:
    row = dict(per_gpu[0])
    row.update({
        "name": "E_direct_cublaslt_fast_accum",
        "status": "ok",
        "tflops": row.pop("fp8_tflops"),
        "matrix_size": cublas_payload.get("matrix_size"),
        "iterations": cublas_payload.get("iterations"),
        "warmup": cublas_payload.get("warmup"),
        "fast_accum": cublas_payload.get("fast_accum"),
        "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
    })
    combined["results"].append(row)

combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
print(f"Combined report written to: {combined_report}")
PY

echo "$COMBINED_REPORT"

View File

@ -0,0 +1,134 @@
#!/usr/bin/env bash
set -uo pipefail

# Run the single-node H100 acceptance suite and keep the raw report paths stable.
# The suite itself still lives in gpu_tester.py; this wrapper only standardizes
# snapshot/report naming for repeated machine-level runs.
#
# Defaults can also be supplied through the environment: PYTHON_BIN,
# CONFIG_FILE, OUT_DIR, FORMAT. Command-line options take precedence.
#
# Exit status: 2 for usage errors, 1 when no Python interpreter is found or
# the output directory cannot be created, otherwise gpu_tester.py's status.
# errexit is off, so a failed environment snapshot does not stop the test run.

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/default.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"
DRY_RUN=0
SNAPSHOT=1

# Print the command-line help on stdout.
usage() {
    cat <<'EOF'
Usage: run_h100_single_node_all.sh [options]
Options:
--python PATH Python executable (default: /root/gpu-test-venv/bin/python)
--config PATH gpu_tester config file (default: configs/default.yaml)
--out-dir PATH Report output directory (default: reports)
--format FORMAT Report format: md, json, or html (default: md)
--no-snapshot Do not run nccl_environment_snapshot.sh first
--dry-run Print commands without running them
-h, --help Show this help
EOF
}

while (($#)); do
    case "$1" in
        --python|--config|--out-dir|--format)
            # Under `set -u` a missing value would otherwise abort with a bare
            # "unbound variable" message; report it as a usage error instead.
            if (($# < 2)); then
                echo "Missing value for $1" >&2
                usage >&2
                exit 2
            fi
            case "$1" in
                --python) PYTHON_BIN="$2" ;;
                --config) CONFIG_FILE="$2" ;;
                --out-dir) OUT_DIR="$2" ;;
                --format) FORMAT="$2" ;;
            esac
            shift 2
            ;;
        --no-snapshot)
            SNAPSHOT=0
            shift
            ;;
        --dry-run)
            DRY_RUN=1
            shift
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $1" >&2
            usage >&2
            exit 2
            ;;
    esac
done

if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
    echo "Unsupported format: $FORMAT" >&2
    exit 2
fi

# Fall back to whatever python3 is on PATH when the configured one is unusable.
if [[ ! -x "$PYTHON_BIN" ]]; then
    PYTHON_BIN="$(command -v python3 || true)"
fi
if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
    echo "Python executable not found. Set --python or PYTHON_BIN." >&2
    exit 1
fi

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
SNAPSHOT_FILE="$OUT_DIR/nccl_environment_snapshot_${HOST}_${TS}.md"
REPORT_FILE="$OUT_DIR/h100_single_node_all_${HOST}_${TS}.${FORMAT}"

snapshot_cmd=(bash "$PROJECT_DIR/scripts/nccl_environment_snapshot.sh" "$SNAPSHOT_FILE")
test_cmd=(
    "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
    --config "$CONFIG_FILE"
    --test all
    --report
    --format "$FORMAT"
    --output "$REPORT_FILE"
)

echo "Project: $PROJECT_DIR"
echo "Host: $HOST"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
if ((SNAPSHOT)); then
    echo "Snapshot: $SNAPSHOT_FILE"
fi

if ((DRY_RUN)); then
    if ((SNAPSHOT)); then
        printf 'DRY RUN snapshot:'
        printf ' %q' "${snapshot_cmd[@]}"
        printf '\n'
    fi
    printf 'DRY RUN test:'
    printf ' %q' "${test_cmd[@]}"
    printf '\n'
    exit 0
fi

# Created only for real runs so that --dry-run has no side effects.
if ! mkdir -p "$OUT_DIR"; then
    echo "Cannot create output directory: $OUT_DIR" >&2
    exit 1
fi

snapshot_ok=0
if ((SNAPSHOT)); then
    if "${snapshot_cmd[@]}"; then
        snapshot_ok=1
    else
        echo "WARNING: environment snapshot failed; continuing with the test run" >&2
    fi
fi

"${test_cmd[@]}"
status=$?

echo "Report written to: $REPORT_FILE"
if ((snapshot_ok)); then
    echo "Snapshot written to: $SNAPSHOT_FILE"
fi
exit "$status"

View File

@ -0,0 +1,147 @@
#!/usr/bin/env bash
set -uo pipefail

# Run a two-node, eight-GPU-per-node NCCL evidence pass across the six
# collectives used by the single-node H100 acceptance flow.
#
# Steps: optional preflight (scripts/multinode_nccl_deep_diagnose.sh preflight),
# then gpu_tester.py --test multinode-nccl with MULTINODE_NCCL_ARTIFACT_DIR
# pointing at a per-run artifact directory next to the report.
#
# Defaults can also be supplied through the environment: PYTHON_BIN,
# CONFIG_FILE, OUT_DIR, FORMAT. Command-line options take precedence.
#
# Exit status: 2 for usage errors, 1 when no Python interpreter is found or a
# directory cannot be created, the preflight's status when it fails, otherwise
# gpu_tester.py's status.

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"
DRY_RUN=0
RUN_PREFLIGHT=1
PREFLIGHT_ONLY=0

# Print the command-line help on stdout.
usage() {
    cat <<'EOF'
Usage: run_multinode_nccl_all_collectives.sh [options]
Options:
--python PATH Python executable (default: /root/gpu-test-venv/bin/python)
--config PATH Config file (default: configs/multinode_nccl_nccl227_all_collectives_2x8.yaml)
--out-dir PATH Report output directory (default: reports)
--format FORMAT Report format: md, json, or html (default: md)
--no-preflight Skip scripts/multinode_nccl_deep_diagnose.sh preflight
--preflight-only Run only the preflight check, not the workload
--dry-run Print commands without running them
-h, --help Show this help
EOF
}

while (($#)); do
    case "$1" in
        --python|--config|--out-dir|--format)
            # Under `set -u` a missing value would otherwise abort with a bare
            # "unbound variable" message; report it as a usage error instead.
            if (($# < 2)); then
                echo "Missing value for $1" >&2
                usage >&2
                exit 2
            fi
            case "$1" in
                --python) PYTHON_BIN="$2" ;;
                --config) CONFIG_FILE="$2" ;;
                --out-dir) OUT_DIR="$2" ;;
                --format) FORMAT="$2" ;;
            esac
            shift 2
            ;;
        --no-preflight)
            RUN_PREFLIGHT=0
            shift
            ;;
        --preflight-only)
            PREFLIGHT_ONLY=1
            shift
            ;;
        --dry-run)
            DRY_RUN=1
            shift
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $1" >&2
            usage >&2
            exit 2
            ;;
    esac
done

if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
    echo "Unsupported format: $FORMAT" >&2
    exit 2
fi

# Fall back to whatever python3 is on PATH when the configured one is unusable.
if [[ ! -x "$PYTHON_BIN" ]]; then
    PYTHON_BIN="$(command -v python3 || true)"
fi
if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
    echo "Python executable not found. Set --python or PYTHON_BIN." >&2
    exit 1
fi

TS="$(date +%Y%m%d_%H%M%S)"
REPORT_FILE="$OUT_DIR/multinode_nccl_all_collectives_${TS}.${FORMAT}"
ARTIFACT_DIR="$OUT_DIR/multinode_nccl_all_collectives_${TS}_artifacts"

PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
RUN_CMD=(
    "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
    --config "$CONFIG_FILE"
    --test multinode-nccl
    --report
    --format "$FORMAT"
    --output "$REPORT_FILE"
)

echo "Project: $PROJECT_DIR"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
echo "Artifacts: $ARTIFACT_DIR"
echo "Collectives: allreduce, alltoall, broadcast, reducescatter, allgather, sendrecv"
echo "Topology: 2 nodes x 8 GPUs per node; 16G"

if ((DRY_RUN)); then
    if ((RUN_PREFLIGHT)); then
        printf 'DRY RUN preflight:'
        printf ' %q' "${PREFLIGHT_CMD[@]}"
        printf '\n'
    fi
    if ((PREFLIGHT_ONLY)); then
        exit 0
    fi
    printf 'DRY RUN workload:'
    printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
    printf ' %q' "${RUN_CMD[@]}"
    printf '\n'
    exit 0
fi

# Created only for real runs so that --dry-run has no side effects.
if ! mkdir -p "$OUT_DIR"; then
    echo "Cannot create output directory: $OUT_DIR" >&2
    exit 1
fi

if ((RUN_PREFLIGHT)); then
    "${PREFLIGHT_CMD[@]}"
    preflight_status=$?
    if ((preflight_status != 0)); then
        echo "Preflight failed with exit code $preflight_status" >&2
        exit "$preflight_status"
    fi
fi
if ((PREFLIGHT_ONLY)); then
    exit 0
fi

if ! mkdir -p "$ARTIFACT_DIR"; then
    echo "Cannot create artifact directory: $ARTIFACT_DIR" >&2
    exit 1
fi

# The artifact directory is passed to the workload through its environment only.
MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${RUN_CMD[@]}"
status=$?

echo "Report written to: $REPORT_FILE"
echo "Artifacts written to: $ARTIFACT_DIR"
exit "$status"

View File

@ -0,0 +1,147 @@
#!/usr/bin/env bash
set -uo pipefail

# Run the formal cross-node NCCL PDF matrix for the current two-node H100 pair.
# This wrapper standardizes the command, output naming, and preflight hook; the
# actual benchmark implementation remains in gpu_tester.py / MultiNodeNCCLTest.
#
# Steps: optional preflight (scripts/multinode_nccl_deep_diagnose.sh preflight),
# then gpu_tester.py --test multinode-nccl with MULTINODE_NCCL_ARTIFACT_DIR
# pointing at a per-run artifact directory next to the report.
#
# Defaults can also be supplied through the environment: PYTHON_BIN,
# CONFIG_FILE, OUT_DIR, FORMAT. Command-line options take precedence.
#
# Exit status: 2 for usage errors, 1 when no Python interpreter is found or a
# directory cannot be created, the preflight's status when it fails, otherwise
# gpu_tester.py's status.

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_pdf_matrix.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"
DRY_RUN=0
RUN_PREFLIGHT=1
PREFLIGHT_ONLY=0

# Print the command-line help on stdout.
usage() {
    cat <<'EOF'
Usage: run_multinode_nccl_pdf_matrix.sh [options]
Options:
--python PATH Python executable (default: /root/gpu-test-venv/bin/python)
--config PATH Matrix config file (default: configs/multinode_nccl_nccl227_pdf_matrix.yaml)
--out-dir PATH Report output directory (default: reports)
--format FORMAT Report format: md, json, or html (default: md)
--no-preflight Skip scripts/multinode_nccl_deep_diagnose.sh preflight
--preflight-only Run only the preflight check, not the matrix workload
--dry-run Print commands without running them
-h, --help Show this help
EOF
}

while (($#)); do
    case "$1" in
        --python|--config|--out-dir|--format)
            # Under `set -u` a missing value would otherwise abort with a bare
            # "unbound variable" message; report it as a usage error instead.
            if (($# < 2)); then
                echo "Missing value for $1" >&2
                usage >&2
                exit 2
            fi
            case "$1" in
                --python) PYTHON_BIN="$2" ;;
                --config) CONFIG_FILE="$2" ;;
                --out-dir) OUT_DIR="$2" ;;
                --format) FORMAT="$2" ;;
            esac
            shift 2
            ;;
        --no-preflight)
            RUN_PREFLIGHT=0
            shift
            ;;
        --preflight-only)
            PREFLIGHT_ONLY=1
            shift
            ;;
        --dry-run)
            DRY_RUN=1
            shift
            ;;
        -h|--help)
            usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $1" >&2
            usage >&2
            exit 2
            ;;
    esac
done

if [[ "$FORMAT" != "md" && "$FORMAT" != "json" && "$FORMAT" != "html" ]]; then
    echo "Unsupported format: $FORMAT" >&2
    exit 2
fi

# Fall back to whatever python3 is on PATH when the configured one is unusable.
if [[ ! -x "$PYTHON_BIN" ]]; then
    PYTHON_BIN="$(command -v python3 || true)"
fi
if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
    echo "Python executable not found. Set --python or PYTHON_BIN." >&2
    exit 1
fi

TS="$(date +%Y%m%d_%H%M%S)"
REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}"
ARTIFACT_DIR="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}_artifacts"

PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
MATRIX_CMD=(
    "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
    --config "$CONFIG_FILE"
    --test multinode-nccl
    --report
    --format "$FORMAT"
    --output "$REPORT_FILE"
)

echo "Project: $PROJECT_DIR"
echo "Config: $CONFIG_FILE"
echo "Report: $REPORT_FILE"
echo "Artifacts: $ARTIFACT_DIR"
echo "Matrix: 2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G"

if ((DRY_RUN)); then
    if ((RUN_PREFLIGHT)); then
        printf 'DRY RUN preflight:'
        printf ' %q' "${PREFLIGHT_CMD[@]}"
        printf '\n'
    fi
    if ((PREFLIGHT_ONLY)); then
        exit 0
    fi
    printf 'DRY RUN matrix:'
    printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
    printf ' %q' "${MATRIX_CMD[@]}"
    printf '\n'
    exit 0
fi

# Created only for real runs so that --dry-run has no side effects.
if ! mkdir -p "$OUT_DIR"; then
    echo "Cannot create output directory: $OUT_DIR" >&2
    exit 1
fi

if ((RUN_PREFLIGHT)); then
    "${PREFLIGHT_CMD[@]}"
    preflight_status=$?
    if ((preflight_status != 0)); then
        echo "Preflight failed with exit code $preflight_status" >&2
        exit "$preflight_status"
    fi
fi
if ((PREFLIGHT_ONLY)); then
    exit 0
fi

if ! mkdir -p "$ARTIFACT_DIR"; then
    echo "Cannot create artifact directory: $ARTIFACT_DIR" >&2
    exit 1
fi

# The artifact directory is passed to the workload through its environment only.
MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${MATRIX_CMD[@]}"
status=$?

echo "Report written to: $REPORT_FILE"
echo "Artifacts written to: $ARTIFACT_DIR"
exit "$status"