diff --git a/.gitignore b/.gitignore index 934bb96..2347ffb 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,12 @@ __pycache__/ dist/ build/ reports/ +reports_* +H100*.md +test_all*.md +docs/h100_test_all_metrics_guide_cn.md +docs/multinode_nccl_concepts.md +docs/multinode_nccl_deep_diagnose_runbook.md *.egg .eggs/ *.log @@ -14,4 +20,6 @@ reports/ .venv/ venv/ .qoder/* +.playwright-mcp/ .claude/settings.local.json +.omx/ diff --git a/configs/default.yaml b/configs/default.yaml index a432c11..cd214e4 100644 --- a/configs/default.yaml +++ b/configs/default.yaml @@ -48,6 +48,55 @@ nccl: test_allgather: false test_sendrecv: false +multinode_nccl: + enabled: false + mode: sweep + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: null # null = tools.install_dir/nccl-tests/build + tests: + - all_reduce_perf + - alltoall_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + begin_size: 1k + end_size: 16g + step_factor: 2 + warmup_iters: 10 + gpus_per_rank: 1 + timeout_sec: 1800 + socket_ifname: bond0 + oob_tcp_ifname: bond0 + plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30" + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: null + min_nchannels: null + net_plugin: none + nvls_enable: 1 + split_data_on_qps: null + extra_env: {} + min_peak_busbw_gbps: + allreduce: 480 + alltoall: 75 + stress: duration_sec: 600 # 10 min — reaches thermal steady state, validates throttle/jitter beyond warmup use_doubles: false diff --git a/configs/multinode_nccl_diagnostic.yaml 
b/configs/multinode_nccl_diagnostic.yaml new file mode 100644 index 0000000..0e6479d --- /dev/null +++ b/configs/multinode_nccl_diagnostic.yaml @@ -0,0 +1,62 @@ +tools: + install_dir: /opt/gpu-test-tools + +report: + output_dir: ./reports + format: md + +multinode_nccl: + enabled: true + mode: diagnostic + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: null + tests: + - all_reduce_perf + - alltoall_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + label: 2 nodes x 8 GPUs diagnostic + begin_size: 256M + end_size: 256M + step_factor: 2 + warmup_iters: 1 + iters: 3 + gpus_per_rank: 1 + timeout_sec: 600 + debug: INFO + socket_ifname: bond0 + oob_tcp_ifname: bond0 + plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30" + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: null + min_nchannels: null + net_plugin: none + nvls_enable: 1 + split_data_on_qps: null + extra_env: + NCCL_DEBUG_SUBSYS: INIT,NET + NCCL_NET_GDR_LEVEL: 5 + NCCL_NET_GDR_READ: 1 + NCCL_DMABUF_ENABLE: 0 + min_peak_busbw_gbps: + allreduce: 480 + alltoall: 75 diff --git a/configs/multinode_nccl_nccl227_16g.yaml b/configs/multinode_nccl_nccl227_16g.yaml new file mode 100644 index 0000000..5f57a4b --- /dev/null +++ b/configs/multinode_nccl_nccl227_16g.yaml @@ -0,0 +1,62 @@ +tools: + install_dir: /opt/gpu-test-tools + +report: + output_dir: ./reports + format: md + +multinode_nccl: + enabled: true + mode: large-message-nccl-2.27.7 + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 
172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: /data/nccl-tests-latest/build + tests: + - all_reduce_perf + - alltoall_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + label: 2 nodes x 8 GPUs NCCL 2.27.7 16G + begin_size: 16G + end_size: 16G + step_factor: 2 + warmup_iters: 1 + iters: 3 + gpus_per_rank: 1 + timeout_sec: 1200 + debug: INFO + socket_ifname: bond0 + oob_tcp_ifname: bond0 + plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30" + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: null + min_nchannels: null + net_plugin: none + nvls_enable: 1 + split_data_on_qps: null + extra_env: + NCCL_DEBUG_SUBSYS: INIT,NET + NCCL_NET_GDR_LEVEL: 5 + NCCL_NET_GDR_READ: 1 + NCCL_DMABUF_ENABLE: 0 + min_peak_busbw_gbps: + allreduce: 480 + alltoall: 75 diff --git a/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml b/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml new file mode 100644 index 0000000..1e5d464 --- /dev/null +++ b/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml @@ -0,0 +1,72 @@ +tools: + install_dir: /opt/gpu-test-tools + +report: + output_dir: ./reports + format: md + +multinode_nccl: + enabled: true + mode: cross-leaf-all-collectives-nccl-2.27.7 + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: 
/data/nccl-tests-latest/build + tests: + - all_reduce_perf + - alltoall_perf + - broadcast_perf + - reduce_scatter_perf + - all_gather_perf + - sendrecv_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + label: 2 nodes x 8 GPUs (all collectives evidence run) + op_env: + alltoall: + NCCL_PXN_DISABLE: 1 + begin_size: 16G + end_size: 16G + step_factor: 2 + warmup_iters: 10 + gpus_per_rank: 1 + timeout_sec: 1800 + debug: INFO + socket_ifname: bond0 + oob_tcp_ifname: bond0 + plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30" + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: null + min_nchannels: null + net_plugin: none + nvls_enable: 1 + split_data_on_qps: null + extra_env: + NCCL_DEBUG_SUBSYS: INIT,NET + NCCL_NET_GDR_LEVEL: 5 + NCCL_NET_GDR_READ: 1 + NCCL_DMABUF_ENABLE: 0 + min_peak_busbw_gbps: + allreduce: 491.84 + alltoall: 76.54 + broadcast: 0 + reducescatter: 0 + allgather: 0 + sendrecv: 0 diff --git a/configs/multinode_nccl_nccl227_auto_16g.yaml b/configs/multinode_nccl_nccl227_auto_16g.yaml new file mode 100644 index 0000000..f547bff --- /dev/null +++ b/configs/multinode_nccl_nccl227_auto_16g.yaml @@ -0,0 +1,62 @@ +tools: + install_dir: /opt/gpu-test-tools + +report: + output_dir: ./reports + format: md + +multinode_nccl: + enabled: true + mode: large-message-nccl-2.27.7-auto + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: /data/nccl-tests-latest/build + tests: + - all_reduce_perf + - alltoall_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + label: 2 nodes x 8 GPUs NCCL 2.27.7 auto 
16G + begin_size: 16G + end_size: 16G + step_factor: 2 + warmup_iters: 1 + iters: 3 + gpus_per_rank: 1 + timeout_sec: 1200 + debug: INFO + socket_ifname: bond0 + oob_tcp_ifname: bond0 + plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30" + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: null + min_nchannels: null + net_plugin: none + nvls_enable: 1 + split_data_on_qps: null + extra_env: + NCCL_DEBUG_SUBSYS: INIT,NET + NCCL_NET_GDR_LEVEL: 5 + NCCL_NET_GDR_READ: 1 + NCCL_DMABUF_ENABLE: 0 + min_peak_busbw_gbps: + allreduce: 480 + alltoall: 75 diff --git a/configs/multinode_nccl_nccl227_diagnostic.yaml b/configs/multinode_nccl_nccl227_diagnostic.yaml new file mode 100644 index 0000000..64c0479 --- /dev/null +++ b/configs/multinode_nccl_nccl227_diagnostic.yaml @@ -0,0 +1,62 @@ +tools: + install_dir: /opt/gpu-test-tools + +report: + output_dir: ./reports + format: md + +multinode_nccl: + enabled: true + mode: diagnostic-nccl-2.27.7 + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: /data/nccl-tests-latest/build + tests: + - all_reduce_perf + - alltoall_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + label: 2 nodes x 8 GPUs NCCL 2.27.7 + begin_size: 256M + end_size: 256M + step_factor: 2 + warmup_iters: 1 + iters: 3 + gpus_per_rank: 1 + timeout_sec: 600 + debug: INFO + socket_ifname: bond0 + oob_tcp_ifname: bond0 + plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30" + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + 
ib_timeout: 22 + qps_per_connection: null + min_nchannels: null + net_plugin: none + nvls_enable: 1 + split_data_on_qps: null + extra_env: + NCCL_DEBUG_SUBSYS: INIT,NET + NCCL_NET_GDR_LEVEL: 5 + NCCL_NET_GDR_READ: 1 + NCCL_DMABUF_ENABLE: 0 + min_peak_busbw_gbps: + allreduce: 480 + alltoall: 75 diff --git a/configs/multinode_nccl_nccl227_pdf_matrix.yaml b/configs/multinode_nccl_nccl227_pdf_matrix.yaml new file mode 100644 index 0000000..2c33573 --- /dev/null +++ b/configs/multinode_nccl_nccl227_pdf_matrix.yaml @@ -0,0 +1,91 @@ +tools: + install_dir: /opt/gpu-test-tools + +report: + output_dir: ./reports + format: md + +multinode_nccl: + enabled: true + mode: cross-leaf-pdf-matrix-nccl-2.27.7 + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: /data/nccl-tests-latest/build + tests: + - all_reduce_perf + - alltoall_perf + topologies: + - nodes: 2 + gpus_per_node: 1 + label: 2 nodes x 1 GPU (PDF 2 machines 2 GPUs) + min_peak_busbw_gbps: + allreduce: 48.90 + alltoall: 27.25 + - nodes: 2 + gpus_per_node: 2 + label: 2 nodes x 2 GPUs (PDF 2 machines 4 GPUs) + min_peak_busbw_gbps: + allreduce: 136.93 + alltoall: 54.41 + - nodes: 2 + gpus_per_node: 4 + label: 2 nodes x 4 GPUs (PDF 2 machines 8 GPUs) + cuda_visible_devices: 0,1,4,5 + op_env: + alltoall: + NCCL_IB_QPS_PER_CONNECTION: 4 + NCCL_MIN_NCHANNELS: 4 + NCCL_IB_SPLIT_DATA_ON_QPS: 1 + min_peak_busbw_gbps: + allreduce: 335.48 + alltoall: 73.73 + - nodes: 2 + gpus_per_node: 8 + label: 2 nodes x 8 GPUs (PDF 2 machines 16 GPUs) + op_env: + alltoall: + NCCL_PXN_DISABLE: 1 + min_peak_busbw_gbps: + allreduce: 491.84 + alltoall: 76.54 + begin_size: 16G + end_size: 16G + 
step_factor: 2 + warmup_iters: 10 + gpus_per_rank: 1 + timeout_sec: 1800 + debug: INFO + socket_ifname: bond0 + oob_tcp_ifname: bond0 + plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30" + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: null + min_nchannels: null + net_plugin: none + nvls_enable: 1 + split_data_on_qps: null + extra_env: + NCCL_DEBUG_SUBSYS: INIT,NET + NCCL_NET_GDR_LEVEL: 5 + NCCL_NET_GDR_READ: 1 + NCCL_DMABUF_ENABLE: 0 + min_peak_busbw_gbps: + allreduce: 0 + alltoall: 0 diff --git a/configs/multinode_nccl_nccl227_sweep.yaml b/configs/multinode_nccl_nccl227_sweep.yaml new file mode 100644 index 0000000..f46a4ab --- /dev/null +++ b/configs/multinode_nccl_nccl227_sweep.yaml @@ -0,0 +1,62 @@ +tools: + install_dir: /opt/gpu-test-tools + +report: + output_dir: ./reports + format: md + +multinode_nccl: + enabled: true + mode: sweep-nccl-2.27.7 + hosts: + - name: nccl-gpu-1 + addr: 172.72.8.12 + slots: 8 + - name: nccl-gpu-2 + addr: 172.72.8.16 + slots: 8 + ssh_user: root + ssh_preflight: true + mpirun_path: /usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun + mpi_ld_preload: null + extra_ld_library_path: + - /usr/mpi/gcc/openmpi-4.1.9a1/lib + - /tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu + - /usr/local/cuda-12.4/targets/x86_64-linux/lib + nccl_tests_dir: /data/nccl-tests-latest/build + tests: + - all_reduce_perf + - alltoall_perf + topologies: + - nodes: 2 + gpus_per_node: 8 + label: 2 nodes x 8 GPUs NCCL 2.27.7 sweep + begin_size: 1M + end_size: 4G + step_factor: 4 + warmup_iters: 2 + iters: 5 + gpus_per_rank: 1 + timeout_sec: 1200 + debug: INFO + socket_ifname: bond0 + oob_tcp_ifname: bond0 + plm_rsh_args: "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o ServerAliveInterval=30" + ib_gid_index: 3 + ib_sl: 5 + ib_tc: 136 + ib_hca: mlx5_0,mlx5_1,mlx5_6,mlx5_7 + ib_timeout: 22 + qps_per_connection: null + min_nchannels: null + 
net_plugin: none + nvls_enable: 1 + split_data_on_qps: null + extra_env: + NCCL_DEBUG_SUBSYS: INIT,NET + NCCL_NET_GDR_LEVEL: 5 + NCCL_NET_GDR_READ: 1 + NCCL_DMABUF_ENABLE: 0 + min_peak_busbw_gbps: + allreduce: 480 + alltoall: 75 diff --git a/gpu_tester.py b/gpu_tester.py index 4cfa47c..35d89de 100644 --- a/gpu_tester.py +++ b/gpu_tester.py @@ -5,6 +5,7 @@ import argparse import json import os import signal +import socket import sys import time from datetime import datetime @@ -25,6 +26,9 @@ from modules.nccl_test import NCCLTest from modules.training_sim import TrainingSim from modules.stress_test import StressTest from modules.rdma_test import RDMATest +from modules.nvlink_test import NVLinkTest +from modules.dcgm_test import DCGMTest +from modules.multinode_nccl_test import MultiNodeNCCLTest from modules.report import ReportGenerator from modules.gpu_specs import detect_gpu_type, get_gpu_specs, get_gpu_label, get_supported_gpus, validate_driver_compatibility @@ -32,43 +36,125 @@ DEFAULT_CONFIG = { "benchmark": { "memory": {"size_mb": 4096, "iterations": 10, "nvbandwidth_buffer_mb": 512, "nvbandwidth_samples": 3}, "compute": { - "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8"], - "matrix_size": 4096, - "warmup": 10, - "iterations": 100, + "dtypes": ["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"], + "matrix_size": 8192, + "warmup": 50, + "iterations": 500, + "use_compile": True, }, }, - "health": {"temp_warning": 80, "temp_critical": 90, "power_limit": None}, + "health": {"temp_warning": 75, "temp_critical": 85, "power_limit": None}, "nccl": { "min_bandwidth_gbps": None, "test_allreduce": True, "test_alltoall": True, "test_broadcast": True, - "test_reduce_scatter": False, - "test_allgather": False, - "test_sendrecv": False, + "test_reduce_scatter": True, + "test_allgather": True, + "test_sendrecv": True, + "message_sizes": ["1M", "256M", "2G"], + "repeats": 3, + "max_stddev_pct": 3, + }, + "multinode_nccl": { + "enabled": False, + "mode": "sweep", + 
"hosts": [ + {"name": "nccl-gpu-1", "addr": "172.72.8.12", "slots": 8}, + {"name": "nccl-gpu-2", "addr": "172.72.8.16", "slots": 8}, + ], + "ssh_user": "root", + "ssh_preflight": True, + "mpirun_path": "/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun", + "mpi_ld_preload": None, + "extra_ld_library_path": [ + "/usr/mpi/gcc/openmpi-4.1.9a1/lib", + "/root/gpu-test-venv/lib/python3.10/site-packages/nvidia/nccl/lib", + "/usr/local/cuda-12.4/targets/x86_64-linux/lib", + ], + "nccl_tests_dir": None, + "tests": ["all_reduce_perf", "alltoall_perf"], + "topologies": [{"nodes": 2, "gpus_per_node": 8}], + "begin_size": "1k", + "end_size": "16g", + "step_factor": 2, + "warmup_iters": 10, + "gpus_per_rank": 1, + "timeout_sec": 1800, + "socket_ifname": "bond0", + "ib_gid_index": 3, + "ib_sl": 5, + "ib_tc": 136, + "ib_hca": "mlx5_0,mlx5_1,mlx5_6,mlx5_7", + "ib_timeout": 22, + "qps_per_connection": 4, + "min_nchannels": 4, + "net_plugin": "none", + "nvls_enable": 1, + "split_data_on_qps": 1, + "min_peak_busbw_gbps": {"allreduce": 480, "alltoall": 75}, }, "stress": { - "duration_sec": 60, + "duration_sec": 1800, + "production_duration_sec": 1800, + "use_gpu_burn": False, "use_doubles": False, "use_tensor_cores": True, "memory_pct": 90, "gpus": "all", + "dtype": "bf16", + "matrix_size": 24576, + "telemetry_interval_sec": 1, + "warmup_sec": 60, + "min_steady_samples": 10, + "max_temp_c": 80, + "max_temp_delta_c": 5, + "min_power_watts": 630, + "max_tflops_jitter_pct": 5, + "require_tflops_jitter": True, }, "rdma": { - "min_bandwidth_gbps": 50, - "max_latency_us": 10, + "min_bandwidth_gbps": 47, + "min_port_rate_gbps": 400, + "max_latency_us": 3.5, + "max_write_latency_us": 2.0, + "max_read_latency_us": 3.5, "ib_iterations": 1000, - "msg_size": 65536, + "msg_size": 4194304, + "latency_msg_size": 8, "ib_device": None, "ib_port": 1, + "server_addr": None, + "ibping_target": None, + "ibping_count": 5, + "role": "auto", + "pfc_ecn_counters": True, + }, + "nvlink": { + "expected_links_per_gpu": 18, 
+ "expected_link_speed_gbps": 25, + "require_zero_errors": True, + }, + "dcgm": { + "diag_level": 3, + "timeout_sec": 1200, + "expected_num_gpus": 8, + "json_output": True, + "require_subtests": True, }, "training": { - "model": "gpt2", + "model": "synthetic_1.5b", "batch_size": 8, "seq_length": 2048, "num_steps": 50, + "warmup_steps": 5, "dtype": "bf16", + "mode": "ddp", + "synthetic_params_b": 1.5, + "min_tokens_per_sec": 45000, + "max_step_jitter_pct": 3, + "max_peak_memory_gb": 70, + "require_distributed": True, }, "report": {"output_dir": "./reports", "format": "json"}, "tools": {"install_dir": "/opt/gpu-test-tools"}, @@ -131,7 +217,7 @@ def interactive_menu(config: dict): if not check_prerequisites(console): return - results_store: dict = {"timestamp": datetime.now().isoformat(), "tests": {}} + results_store: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname(), "tests": {}} menu_items = [ ("1", "GPU Information", "gpu_info"), @@ -139,10 +225,13 @@ def interactive_menu(config: dict): ("3", "Memory Benchmark (nvbandwidth)", "memory_bench"), ("4", "Compute Benchmark", "compute_bench"), ("5", "NCCL Multi-GPU Test", "nccl"), - ("6", "GPU Stress Test (gpu-burn)", "stress"), + ("6", "GPU Stress Test (PyTorch/gpu-burn)", "stress"), ("7", "RDMA/IB Test", "rdma"), - ("8", "Training Simulation", "training"), - ("9", "Full Test Suite (All Tests)", "all"), + ("8", "NVLink/NVSwitch Test", "nvlink"), + ("9", "DCGM Diagnostic", "dcgm"), + ("10", "Training Simulation", "training"), + ("11", "Multi-node NCCL Test", "multinode_nccl"), + ("12", "Full Test Suite (All Tests)", "all"), ("0", "Generate Report", "report"), ] @@ -164,9 +253,12 @@ def interactive_menu(config: dict): "memory_bench": "HBM bandwidth via nvbandwidth", "compute_bench": "GEMM TFLOPS across FP32/TF32/FP16/BF16/FP8", "nccl": "AllReduce, AllToAll, Broadcast via nccl-tests", - "stress": "Long-running GPU stress via gpu-burn", + "stress": "Long-running high-power GEMM stress with 
telemetry", "rdma": "InfiniBand bandwidth & latency (ib_write_bw)", + "nvlink": "NVLink links, speed, and error counters", + "dcgm": "DCGM diag -r 3 production diagnostic", "training": "Simulate LLM training with PyTorch", + "multinode_nccl": "Cross-node NCCL via mpirun/nccl-tests", "all": "Run all tests sequentially", "report": "Export results to JSON/HTML", } @@ -257,12 +349,30 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict: m.print_results(result) return result + elif test_name == "nvlink": + m = NVLinkTest(config) + result = m.run() + m.print_results(result) + return result + + elif test_name == "dcgm": + m = DCGMTest(config) + result = m.run() + m.print_results(result) + return result + elif test_name == "training": m = TrainingSim(config) result = m.run() m.print_results(result) return result + elif test_name == "multinode_nccl": + m = MultiNodeNCCLTest(config) + result = m.run() + m.print_results(result) + return result + elif test_name == "all": return _run_full_suite(config, console) @@ -280,17 +390,21 @@ def _run_test(test_name: str, config: dict, console: Console) -> dict: def _run_full_suite(config: dict, console: Console) -> dict: """Run all tests sequentially.""" console.print(Panel("[bold cyan]Running Full Test Suite[/bold cyan]", box=box.DOUBLE)) - all_results: dict = {"timestamp": datetime.now().isoformat()} + all_results: dict = {"timestamp": datetime.now().isoformat(), "hostname": socket.gethostname()} tests = [ ("gpu_info", "GPU Information", GPUInfo), ("health", "Health Check", HealthCheck), ("memory_bench", "Memory Benchmark", lambda c: Benchmark(c)), ("compute_bench", "Compute Benchmark", lambda c: Benchmark(c)), + ("nvlink", "NVLink/NVSwitch Test", NVLinkTest), ("nccl", "NCCL Test", NCCLTest), ("stress", "GPU Stress Test", StressTest), ("rdma", "RDMA/IB Test", RDMATest), + ("dcgm", "DCGM Diagnostic", DCGMTest), ("training", "Training Simulation", TrainingSim), ] + if (config.get("multinode_nccl", {}) or 
{}).get("enabled"): + tests.append(("multinode_nccl", "Multi-node NCCL Test", MultiNodeNCCLTest)) for i, (key, name, mod_cls) in enumerate(tests, 1): console.print(f"\n[bold cyan][{i}/{len(tests)}] {name}[/bold cyan]") @@ -313,14 +427,49 @@ def _run_full_suite(config: dict, console: Console) -> dict: # Summary console.print("\n" + "=" * 60) # Only count test results, exclude metadata like timestamp - test_results = {k: v for k, v in all_results.items() if k != "timestamp"} - passed = sum(1 for v in test_results.values() if not isinstance(v, dict) or "error" not in v) + test_results = {k: v for k, v in all_results.items() if k not in ("timestamp", "hostname")} + passed = sum(1 for v in test_results.values() if _test_result_passed(v)) total = len(test_results) color = "green" if passed == total else ("yellow" if passed > 0 else "red") console.print(f"[bold {color}]Suite complete: {passed}/{total} tests passed[/bold {color}]") return all_results +def _test_result_passed(result) -> bool: + """Strict production verdict helper for full-suite exit status.""" + if not isinstance(result, dict): + return True + if result.get("error"): + return False + if result.get("skipped") or result.get("status") == "SKIP": + return False + if result.get("source") == "torchrun_fallback": + return False + if "passed" in result: + return bool(result.get("passed")) + if "memory" in result: + mem = result["memory"] + if isinstance(mem, dict) and "passed" in mem: + return bool(mem.get("passed")) + if mem.get("error") or mem.get("source") == "pytorch": + return False + eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0 + return eff >= 80 + if "compute" in result: + comp = result["compute"] + if isinstance(comp, dict) and "passed" in comp: + return bool(comp.get("passed")) + thresholds = comp.get("pass_thresholds_tflops", {}) or {} + per_dtype = comp.get("per_dtype_tflops", {}) + for dt, threshold in thresholds.items(): + val = per_dtype.get(dt) + if not isinstance(val, (int, 
float)) or val < threshold: + return False + consistency = comp.get("consistency", {}) + return not any(not c.get("passed", False) for c in consistency.values()) + return True + + def main(): gpu_list_str = " / ".join(g.upper() for g in get_supported_gpus()) parser = argparse.ArgumentParser( @@ -335,15 +484,18 @@ Examples: python gpu_tester.py --test benchmark --type memory python gpu_tester.py --test benchmark --type compute --dtype fp16 python gpu_tester.py --test nccl # NCCL test + python gpu_tester.py --test multinode-nccl # Cross-node NCCL test + python gpu_tester.py --test nvlink # NVLink/NVSwitch test + python gpu_tester.py --test dcgm # DCGM diagnostic python gpu_tester.py --test training # Training sim python gpu_tester.py --test all # Full suite python gpu_tester.py --report --format json --output report.json """, ) - parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "stress", "rdma", "training", "all"], + parser.add_argument("--test", choices=["gpu-info", "health", "benchmark", "nccl", "multinode-nccl", "stress", "rdma", "nvlink", "dcgm", "training", "all"], help="Run a specific test") parser.add_argument("--type", choices=["memory", "compute"], help="Benchmark type (with --test benchmark)") - parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8"], + parser.add_argument("--dtype", choices=["fp32", "tf32", "fp16", "bf16", "fp8", "fp64", "int8"], help="Compute benchmark dtype (with --test benchmark --type compute)") parser.add_argument("--interactive", action="store_true", help="Force interactive mode") parser.add_argument("--report", action="store_true", help="Generate report from last results") @@ -397,8 +549,11 @@ Examples: "health": "health", "benchmark": None, "nccl": "nccl", + "multinode-nccl": "multinode_nccl", "stress": "stress", "rdma": "rdma", + "nvlink": "nvlink", + "dcgm": "dcgm", "training": "training", "all": "all", } @@ -415,19 +570,30 @@ Examples: result = bench.run() 
class DCGMTest:
    """Acceptance wrapper around ``dcgmi diag``.

    Builds the ``dcgmi diag`` command line from the ``dcgm`` section of the
    configuration, runs it in its own process group with a timeout, parses the
    output (JSON first, plain text as a fallback) into a flat list of subtests
    and derives a strict verdict: only ``PASS`` subtests count as passing.
    """

    # Accepted status spellings (compared upper-cased) -> canonical status.
    _STATUS_ALIASES = {
        "PASS": "PASS",
        "PASSED": "PASS",
        "OK": "PASS",
        "FAIL": "FAIL",
        "FAILED": "FAIL",
        "ERROR": "ERROR",
        "WARN": "WARN",
        "WARNING": "WARN",
        "SKIP": "SKIP",
        "SKIPPED": "SKIP",
        "NOT_RUN": "SKIP",
        "NOT RUN": "SKIP",
    }
    # "<name> : STATUS" / "<name> | STATUS" lines.
    _NAME_THEN_STATUS_RE = re.compile(r"(.+?)\s*[:|]\s*(PASS|FAIL|WARN|ERROR|SKIP)\b", re.I)
    # "STATUS - <name>" / "STATUS: <name>" lines.
    _STATUS_THEN_NAME_RE = re.compile(r"\b(PASS|FAIL|WARN|ERROR|SKIP)\b\s*[-:|]\s*(.+)", re.I)

    def __init__(self, config: dict):
        self.config = config
        self.console = Console()
        # ``dcgm: null`` in YAML yields None; fall back to an empty mapping so
        # every later ``self.cfg.get`` keeps working.
        self.cfg = config.get("dcgm") or {}

    def run(self) -> dict:
        """Run the diagnostic and return a result dict.

        The dict always has ``passed`` and ``timestamp``. Failures to run the
        tool add ``error``; completed runs add ``returncode``, ``level``,
        ``command``, ``expected_num_gpus``, ``subtests`` and
        ``raw_output_tail`` (last 8000 characters of stdout + stderr).
        """
        dcgmi = shutil.which("dcgmi")
        if not dcgmi:
            return {
                "passed": False,
                "error": "dcgmi not found",
                "timestamp": datetime.now().isoformat(),
            }

        level = str(self.cfg.get("diag_level", 3))
        timeout = int(self.cfg.get("timeout_sec", 1200))
        cmd = [dcgmi, "diag", "-r", level]
        expected_gpus = self.cfg.get("expected_num_gpus")
        if expected_gpus:
            cmd.extend(["-n", f"gpu:{int(expected_gpus)}"])
        if self.cfg.get("json_output", True):
            cmd.append("-j")

        try:
            r = self._run_with_process_group_timeout(cmd, timeout)
        except subprocess.TimeoutExpired as e:
            output = ((e.output or "") + "\n" + (e.stderr or "")).strip()
            return {
                "passed": False,
                "error": f"dcgmi diag -r {level} timeout after {timeout}s",
                "command": cmd,
                "raw_output_tail": output[-8000:],
                "timestamp": datetime.now().isoformat(),
            }
        except OSError as e:
            # e.g. binary not executable; report it like the "not found" case
            # instead of letting the exception escape.
            return {
                "passed": False,
                "error": f"failed to launch dcgmi: {e}",
                "command": cmd,
                "timestamp": datetime.now().isoformat(),
            }

        stdout = r.stdout or ""
        stderr = r.stderr or ""
        output = stdout + "\n" + stderr
        # Prefer stdout alone: stderr noise appended to a JSON document can
        # defeat both the direct parse and the brace-extraction fallback.
        subtests = (
            self._parse_json_output(stdout)
            or self._parse_json_output(output)
            or self._parse_output(output)
        )
        strict_statuses = {"PASS"}
        failed = [s for s in subtests if s["status"] not in strict_statuses]
        require_subtests = bool(self.cfg.get("require_subtests", True))
        passed = r.returncode == 0 and not failed and (bool(subtests) or not require_subtests)
        return {
            "passed": passed,
            "returncode": r.returncode,
            # Keep the numeric level as int; a non-numeric value is reported
            # verbatim rather than crashing after the run has finished.
            "level": int(level) if level.isdigit() else level,
            "command": cmd,
            "expected_num_gpus": int(expected_gpus) if expected_gpus else None,
            "subtests": subtests,
            "raw_output_tail": output[-8000:],
            "timestamp": datetime.now().isoformat(),
        }

    @staticmethod
    def _run_with_process_group_timeout(cmd: list[str], timeout: int) -> subprocess.CompletedProcess:
        """Run *cmd* in a new session and kill its whole process group on timeout.

        Raises:
            subprocess.TimeoutExpired: if *cmd* exceeds *timeout* seconds. The
                exception carries whatever output was collected after the
                group was signalled (SIGTERM, then SIGKILL).
        """
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            start_new_session=True,
        )
        try:
            stdout, stderr = proc.communicate(timeout=timeout)
        except subprocess.TimeoutExpired as e:
            stdout, stderr = "", ""
            # Escalate: polite SIGTERM first, SIGKILL if the group lingers.
            for sig in (signal.SIGTERM, signal.SIGKILL):
                try:
                    os.killpg(proc.pid, sig)
                except ProcessLookupError:
                    # The group exited between the timeout and the signal.
                    pass
                try:
                    stdout, stderr = proc.communicate(timeout=10)
                    break
                except subprocess.TimeoutExpired:
                    continue
            raise subprocess.TimeoutExpired(cmd, timeout, output=stdout, stderr=stderr) from e
        return subprocess.CompletedProcess(cmd, proc.returncode, stdout, stderr)

    @classmethod
    def _parse_json_output(cls, output: str) -> list[dict]:
        """Extract subtests from JSON output; return ``[]`` if none is found.

        If the text as a whole is not valid JSON, the span from the first
        ``{`` to the last ``}`` is tried. A ``"DCGM Diagnostic"`` payload is
        parsed by its known layout; anything else is walked generically,
        collecting every object that has a recognizable string status.
        """
        text = (output or "").strip()
        if not text:
            return []
        try:
            payload = json.loads(text)
        except json.JSONDecodeError:
            m = re.search(r"(\{.*\})", text, re.S)
            if not m:
                return []
            try:
                payload = json.loads(m.group(1))
            except json.JSONDecodeError:
                return []

        dcgm_payload = payload.get("DCGM Diagnostic") if isinstance(payload, dict) else None
        if isinstance(dcgm_payload, dict):
            parsed = cls._parse_dcgm_diagnostic_json(dcgm_payload)
            if parsed:
                return parsed

        subtests = []

        def walk(node, path: list[str]):
            if isinstance(node, dict):
                node_name = (
                    node.get("name")
                    or node.get("testName")
                    or node.get("test_name")
                    or node.get("category")
                    or node.get("category_name")
                )
                child_path = [*path, str(node_name)] if node_name else path
                status = node.get("status") or node.get("result") or node.get("Result")
                if isinstance(status, str):
                    # Unnamed nodes are labelled by the tail of their path.
                    name = node_name or " / ".join(path[-3:])
                    normalized = cls._normalize_status(status)
                    if normalized:
                        subtests.append({
                            "name": str(name)[:160],
                            "status": normalized,
                            "raw": json.dumps(node, default=str)[:1000],
                        })
                for key, value in node.items():
                    walk(value, [*child_path, str(key)])
            elif isinstance(node, list):
                for idx, item in enumerate(node):
                    walk(item, [*path, str(idx)])

        walk(payload, [])
        return subtests

    @classmethod
    def _parse_dcgm_diagnostic_json(cls, payload: dict) -> list[dict]:
        """Flatten a ``"DCGM Diagnostic"`` payload into per-entity and summary subtests.

        Entries that are not JSON objects are skipped instead of raising.
        """
        subtests = []
        for category in payload.get("test_categories", []) or []:
            if not isinstance(category, dict):
                continue
            category_name = str(category.get("category") or "DCGM")
            for test in category.get("tests", []) or []:
                if not isinstance(test, dict):
                    continue
                test_name = str(test.get("name") or "unnamed")
                for result in test.get("results", []) or []:
                    if not isinstance(result, dict):
                        continue
                    status = cls._normalize_status(str(result.get("status", "")))
                    if not status:
                        continue
                    entity_group = result.get("entity_group") or "entity"
                    entity_id = result.get("entity_id", "unknown")
                    name = f"{category_name}/{test_name}/{entity_group}{entity_id}"
                    subtests.append({
                        "name": name[:160],
                        "status": status,
                        "raw": json.dumps(result, default=str)[:1000],
                    })
                summary = test.get("test_summary") or {}
                if not isinstance(summary, dict):
                    continue
                status = cls._normalize_status(str(summary.get("status", "")))
                if status:
                    subtests.append({
                        "name": f"{category_name}/{test_name}/summary"[:160],
                        "status": status,
                        "raw": json.dumps(summary, default=str)[:1000],
                    })
        return subtests

    @staticmethod
    def _normalize_status(status: str) -> str:
        """Map a status spelling to PASS/FAIL/ERROR/WARN/SKIP, or ``""`` if unknown."""
        return DCGMTest._STATUS_ALIASES.get(status.strip().upper(), "")

    @staticmethod
    def _parse_output(output: str) -> list[dict]:
        """Extract subtests from plain-text output, one candidate per line.

        A line matches either ``<name> :|| STATUS`` or ``STATUS -|:|| <name>``;
        names of 160 characters or more are discarded.
        """
        subtests = []
        for line in output.splitlines():
            stripped = line.strip()
            if not stripped:
                continue
            m = DCGMTest._NAME_THEN_STATUS_RE.search(stripped)
            if m:
                name = m.group(1).strip(" .|-")
                status = DCGMTest._normalize_status(m.group(2))
            else:
                m = DCGMTest._STATUS_THEN_NAME_RE.search(stripped)
                if not m:
                    continue
                status = DCGMTest._normalize_status(m.group(1))
                name = m.group(2).strip()
            if name and len(name) < 160:
                subtests.append({"name": name, "status": status, "raw": stripped})
        return subtests

    @staticmethod
    def print_results(results: dict, console: Optional["Console"] = None):
        """Print the verdict and, when present, a table of subtests."""
        c = console or Console()
        if results.get("error"):
            c.print(f"[bold red]DCGM error: {results['error']}[/bold red]")
            return
        passed = results.get("passed", False)
        c.print("[bold green]✓ DCGM diag PASSED[/bold green]" if passed else "[bold red]✗ DCGM diag FAILED[/bold red]")
        subtests = results.get("subtests", [])
        if subtests:
            table = Table(box=None, padding=(0, 1))
            table.add_column("Subtest")
            table.add_column("Status", style="bold")
            for s in subtests:
                table.add_row(s.get("name", ""), s.get("status", ""))
            c.print(table)
= [int(x) for x in __import__("re").findall(r"Retired Pages.*?:\s*(\d+)", raw, flags=__import__("re").I)] + pending = "Pending Page Blacklist" in raw and "Yes" in raw + total = sum(nums) + return { + "retired_pages": total, + "pending_blacklist": pending, + "status": "PASS" if total == 0 and not pending else "FAIL", + } + + def _check_kernel_errors(self) -> dict: + raw = self._run_cmd(["dmesg", "--ctime", "--level=err,crit,alert,emerg"], timeout=10) or "" + upper = raw.upper() + hits = [line for line in raw.splitlines() if any(k in line.upper() for k in ("XID", "AER", "PCIE", "NVRM"))] + return { + "count": len(hits), + "tail": hits[-20:], + "status": "PASS" if not hits else "FAIL", } @staticmethod diff --git a/modules/nccl_test.py b/modules/nccl_test.py index fd9ab6a..9bc47d1 100644 --- a/modules/nccl_test.py +++ b/modules/nccl_test.py @@ -5,6 +5,8 @@ import os import re import shutil import subprocess +import statistics +import sys from datetime import datetime from typing import Optional @@ -70,6 +72,38 @@ class NCCLTest: return p return None + def _message_sizes(self) -> list[str]: + return list(self.nccl_cfg.get("message_sizes") or ["1M", "256M", "2G"]) + + def _repeats(self) -> int: + return int(self.nccl_cfg.get("repeats", 3)) + + def _max_stddev_pct(self) -> float: + return float(self.nccl_cfg.get("max_stddev_pct", 3)) + + def _runtime_env(self) -> dict: + env = {**os.environ, "NCCL_DEBUG": "WARN"} + lib_dirs = [] + + nccl_home = env.get("NCCL_HOME") or self.nccl_cfg.get("nccl_home") + if nccl_home: + lib_dirs.append(os.path.join(str(nccl_home), "lib")) + + for path in sys.path: + lib_dirs.append(os.path.join(path, "nvidia", "nccl", "lib")) + + venv_root = os.path.dirname(os.path.dirname(sys.executable)) + lib_dirs.extend(glob.glob(os.path.join(venv_root, "lib", "python*", "site-packages", "nvidia", "nccl", "lib"))) + + existing = env.get("LD_LIBRARY_PATH", "") + valid_dirs = [] + for d in lib_dirs: + if d and os.path.isdir(d) and d not in valid_dirs: + 
valid_dirs.append(d) + if valid_dirs: + env["LD_LIBRARY_PATH"] = ":".join(valid_dirs + ([existing] if existing else [])) + return env + def run(self) -> dict: gpu_count = 0 if TORCH_AVAILABLE: @@ -89,7 +123,7 @@ class NCCLTest: if self.nccl_cfg.get("test_reduce_scatter", False): tests.append(("reduce_scatter_perf", "ReduceScatter")) if self.nccl_cfg.get("test_allgather", False): - tests.append(("allgather_perf", "AllGather")) + tests.append(("all_gather_perf", "AllGather")) if self.nccl_cfg.get("test_sendrecv", False): tests.append(("sendrecv_perf", "SendRecv")) @@ -170,39 +204,7 @@ class NCCLTest: if not binary: return {"status": "SKIP", "error": f"{binary_name} not found"} - cmd = [ - binary, - "-b", "8M", - "-e", "8G", - "-f", "2", - "-g", str(gpu_count), - "-w", "5", - "-n", "20", - ] - - try: - env = os.environ.copy() - env["NCCL_DEBUG"] = "WARN" - r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env) - - combined = r.stdout + r.stderr - # Check for NCCL/CUDA compatibility errors - if "CUDA driver version is insufficient" in combined or \ - "Test NCCL failure" in combined: - error_msg = "NCCL/CUDA driver version mismatch" \ - if "CUDA driver version" in combined \ - else "NCCL test failure (library incompatibility)" - return {"status": "FAIL", "error": error_msg} - - if r.returncode != 0: - return {"status": "FAIL", "error": r.stderr[:300]} - - return self._parse_nccl_output(r.stdout, min_bw) - - except subprocess.TimeoutExpired: - return {"status": "FAIL", "error": "timeout"} - except Exception as e: - return {"status": "FAIL", "error": str(e)} + return self._run_nccl_matrix([binary, "-g", str(gpu_count)], min_bw) def _run_one_nccl_test_mpirun(self, binary_name: str, label: str, gpu_count: int, mpirun: str, min_bw: float) -> dict: @@ -218,37 +220,64 @@ class NCCLTest: "-x", "NCCL_DEBUG=WARN", "-x", "CUDA_VISIBLE_DEVICES=" + ",".join(str(i) for i in range(gpu_count)), binary, - "-b", "8", - "-e", "256M", - "-f", "2", "-g", "1", - "-w", 
"5", - "-n", "20", ] + return self._run_nccl_matrix(cmd, min_bw) + + def _run_nccl_matrix(self, base_cmd: list[str], min_bw: float) -> dict: + size_results = [] + failures = [] + env = self._runtime_env() + try: - env = os.environ.copy() - env["NCCL_DEBUG"] = "WARN" - r = subprocess.run(cmd, capture_output=True, text=True, timeout=180, env=env) - - combined = r.stdout + r.stderr - if "CUDA driver version is insufficient" in combined or \ - "Test NCCL failure" in combined: - error_msg = "NCCL/CUDA driver version mismatch" \ - if "CUDA driver version" in combined \ - else "NCCL test failure (library incompatibility)" - return {"status": "FAIL", "error": error_msg} - - if r.returncode != 0: - return {"status": "FAIL", "error": r.stderr[:300]} - - return self._parse_nccl_output(r.stdout, min_bw) + for size in self._message_sizes(): + runs = [] + for _ in range(self._repeats()): + cmd = [*base_cmd, "-b", size, "-e", size, "-f", "2", "-w", "5", "-n", "20"] + r = subprocess.run(cmd, capture_output=True, text=True, timeout=300, env=env) + combined = r.stdout + r.stderr + if "CUDA driver version is insufficient" in combined or "Test NCCL failure" in combined: + failures.append({"size": size, "error": "NCCL/CUDA/library failure"}) + continue + if r.returncode != 0: + failures.append({"size": size, "error": r.stderr[:300]}) + continue + parsed = self._parse_nccl_output(r.stdout, min_bw) + runs.append(parsed.get("best_busbw_gbps", 0)) + if runs: + worst = min(runs) + mean = sum(runs) / len(runs) + std_pct = (statistics.pstdev(runs) / mean * 100) if len(runs) > 1 and mean else 0 + size_results.append({ + "size": size, + "runs_busbw_gbps": [round(v, 1) for v in runs], + "worst_busbw_gbps": round(worst, 1), + "mean_busbw_gbps": round(mean, 1), + "stddev_pct": round(std_pct, 2), + "status": "PASS" if worst >= min_bw and std_pct <= self._max_stddev_pct() else "FAIL", + }) + else: + size_results.append({"size": size, "status": "FAIL", "runs_busbw_gbps": []}) except 
subprocess.TimeoutExpired: return {"status": "FAIL", "error": "timeout"} except Exception as e: return {"status": "FAIL", "error": str(e)} + best_bus = max((r.get("mean_busbw_gbps", 0) for r in size_results), default=0) + worst_bus = min((r.get("worst_busbw_gbps", 0) for r in size_results if r.get("runs_busbw_gbps")), default=0) + passed = bool(size_results) and all(r.get("status") == "PASS" for r in size_results) and not failures + return { + "status": "PASS" if passed else "FAIL", + "best_busbw_gbps": round(best_bus, 1), + "worst_busbw_gbps": round(worst_bus, 1), + "min_required_gbps": min_bw, + "max_stddev_pct": self._max_stddev_pct(), + "by_size": size_results, + "failures": failures, + } + @staticmethod def _parse_nccl_output(stdout: str, min_bw: float) -> dict: """Parse nccl-tests tabular output and extract bandwidth results.""" @@ -363,7 +392,7 @@ dist.destroy_process_group() r = subprocess.run( [torchrun_cmd, f"--nproc_per_node={gpu_count}", tmp.name], capture_output=True, text=True, timeout=120, - env={**os.environ, "NCCL_DEBUG": "WARN"}, + env=self._runtime_env(), ) os.unlink(tmp.name) @@ -390,10 +419,15 @@ dist.destroy_process_group() } return { - "passed": all_passed, + # torchrun fallback is a functional smoke only. It never proves + # production bus bandwidth, so it must not satisfy acceptance. 
+ "passed": False, + "functional_passed": all_passed, "source": "torchrun_fallback", "tests": tests, "gpu_count": gpu_count, + "error": None if all_passed else "torchrun functional NCCL smoke failed", + "acceptance_gap": "nccl-tests bus bandwidth was not measured", } except Exception as e: return {"passed": False, "source": "torchrun_fallback", "error": str(e)} @@ -410,7 +444,8 @@ dist.destroy_process_group() if source == "torchrun_fallback": # Connectivity check mode - verdict = "[bold green]✓ NCCL Connectivity OK[/bold green]" if passed else "[bold red]✗ NCCL Connectivity FAILED[/bold red]" + functional = results.get("functional_passed", passed) + verdict = "[bold yellow]⚠ NCCL bus BW NOT VERIFIED[/bold yellow]" if functional else "[bold red]✗ NCCL Connectivity FAILED[/bold red]" c.print(f"{verdict} [dim](basic check via torchrun)[/dim]") tests = results.get("tests", {}) @@ -427,7 +462,7 @@ dist.destroy_process_group() else: c.print(f" [{s_color}]{op_name}[/{s_color}]") - c.print("\n[yellow]Note: functional connectivity test only (no performance data)[/yellow]") + c.print("\n[yellow]Note: functional connectivity test only (no bus bandwidth data; acceptance FAIL)[/yellow]") else: # nccl-tests mode verdict = "[bold green]✓ NCCL tests PASSED[/bold green]" if passed else "[bold yellow]⚠ NCCL tests WARNING[/bold yellow]" @@ -448,12 +483,16 @@ dist.destroy_process_group() if by_size: t = Table(box=None, padding=(0, 1)) t.add_column("Size", style="bold", justify="right") - t.add_column("Time (us)", justify="right") - t.add_column("Alg BW (GB/s)", justify="right") - t.add_column("Bus BW (GB/s)", justify="right") + t.add_column("Worst Bus BW", justify="right") + t.add_column("Mean Bus BW", justify="right") + t.add_column("StdDev", justify="right") + t.add_column("Status", justify="right") for r in by_size: - sz = r.get("size", 0) - sz_str = f"{sz/1024:.0f}K" if sz < 1048576 else f"{sz/1048576:.0f}M" - t.add_row(sz_str, f"{r.get('time_us',0):.1f}", - 
class NVLinkTest:
    """NVLink acceptance check built on ``nvidia-smi nvlink`` text output.

    ``run()`` queries link status (``-s``), ``-c`` and error counters
    (``-e``), parses the text with the static ``_parse_*`` helpers and
    judges every GPU against the ``nvlink`` section of the config:
    ``expected_links_per_gpu`` (default 18), ``expected_link_speed_gbps``
    (default 25) and ``require_zero_errors`` (default True).
    """

    def __init__(self, config: dict):
        self.config = config
        self.console = Console()
        # The section may be present but null in YAML; fall back to an empty dict.
        self.cfg = config.get("nvlink") or {}

    def _run(self, args: list[str], timeout: int = 60) -> tuple[int, str, str]:
        """Run ``nvidia-smi <args>`` and return ``(returncode, stdout, stderr)``.

        Failures to launch are reported through the return code rather than
        raised: 127 when nvidia-smi is not on PATH, 124 on timeout, 126 when
        the process cannot be started.
        """
        if not shutil.which("nvidia-smi"):
            return 127, "", "nvidia-smi not found"
        try:
            r = subprocess.run(["nvidia-smi", *args], capture_output=True, text=True, timeout=timeout)
        except subprocess.TimeoutExpired:
            return 124, "", f"nvidia-smi {' '.join(args)} timed out after {timeout}s"
        except OSError as exc:
            return 126, "", f"nvidia-smi {' '.join(args)} could not be executed: {exc}"
        return r.returncode, r.stdout, r.stderr

    def run(self) -> dict:
        """Execute the check and return a result dict with a ``passed`` flag.

        When the status query fails or yields no parsable links, a short
        dict with ``passed=False`` and ``error`` is returned. Otherwise the
        dict holds the thresholds used, one entry per GPU under ``gpus`` and
        the tails of the raw command output.
        """
        expected_links = int(self.cfg.get("expected_links_per_gpu", 18))
        expected_speed = float(self.cfg.get("expected_link_speed_gbps", 25))
        require_zero_errors = bool(self.cfg.get("require_zero_errors", True))

        # Status is mandatory; skip the other queries when it is unusable.
        rc_s, out_s, err_s = self._run(["nvlink", "-s"])
        if rc_s != 0:
            return {
                "passed": False,
                "error": (err_s or out_s or "nvidia-smi nvlink -s failed")[:1000],
                "timestamp": datetime.now().isoformat(),
            }

        links = self._parse_status(out_s)
        if not links:
            return {
                "passed": False,
                "error": "no NVLink status entries parsed from nvidia-smi nvlink -s",
                "raw_status": out_s[-4000:],
                "timestamp": datetime.now().isoformat(),
            }

        rc_c, out_c, _ = self._run(["nvlink", "-c"])
        rc_e, out_e, _ = self._run(["nvlink", "-e"])

        # Speeds parsed from `-c` take precedence; the status output fills the gaps.
        speeds = self._parse_speeds(out_c) if rc_c == 0 else {}
        for gpu, gpu_speeds in self._parse_speeds(out_s).items():
            known = speeds.setdefault(gpu, {})
            for lid, speed in gpu_speeds.items():
                known.setdefault(lid, speed)

        # NOTE(review): when `nvlink -e` fails no error counters are checked and
        # a GPU can still pass with require_zero_errors set -- confirm intended.
        errors = self._parse_errors(out_e) if rc_e == 0 else {}

        gpu_results = [
            self._evaluate_gpu(
                gpu,
                gpu_links,
                speeds.get(gpu, {}),
                errors.get(gpu, {}) if require_zero_errors else {},
                expected_links,
                expected_speed,
            )
            for gpu, gpu_links in sorted(links.items(), key=lambda item: int(item[0]))
        ]

        return {
            "passed": all(g["passed"] for g in gpu_results),
            "expected_links_per_gpu": expected_links,
            "expected_link_speed_gbps": expected_speed,
            "require_zero_errors": require_zero_errors,
            "gpus": gpu_results,
            "raw_status": out_s[-4000:],
            "raw_speed": out_c[-4000:] if out_c else "",
            "raw_errors": out_e[-4000:] if out_e else "",
            "timestamp": datetime.now().isoformat(),
        }

    @staticmethod
    def _evaluate_gpu(gpu: str, gpu_links: dict, gpu_speeds: dict, gpu_errors: dict,
                      expected_links: int, expected_speed: float) -> dict:
        """Judge one GPU: link count, inactive links, slow links, non-zero counters."""
        active = sum(1 for info in gpu_links.values() if info.get("active"))
        inactive = [lid for lid, info in gpu_links.items() if not info.get("active")]

        speed_bad = []
        for lid in gpu_links:
            speed = gpu_speeds.get(lid)
            # Links without a parsed speed are not judged on speed.
            if speed is not None and speed < expected_speed:
                speed_bad.append({"link": lid, "speed_gbps": speed})

        err_bad = []
        for lid, counters in gpu_errors.items():
            if sum(v for v in counters.values() if isinstance(v, int)):
                err_bad.append({"link": lid, "counters": counters})

        passed = active == expected_links and not inactive and not speed_bad and not err_bad
        return {
            "gpu": int(gpu),
            "active_links": active,
            "expected_links": expected_links,
            "inactive_links": inactive,
            "speed_issues": speed_bad,
            "error_issues": err_bad,
            "passed": passed,
        }

    @staticmethod
    def _parse_status(text: str) -> dict[str, dict[str, dict]]:
        """Parse link state per GPU: ``{gpu: {link: {"state", "active", "raw"}}}``.

        A ``Link N`` line naming a state word is recorded with that state; a
        ``Link N`` line that only shows a ``GB/s`` figure is recorded as active.
        Link lines seen before the first ``GPU N`` header are ignored.
        """
        result: dict[str, dict[str, dict]] = {}
        gpu = None
        for line in text.splitlines():
            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
            if m_gpu:
                gpu = m_gpu.group(1)
                result.setdefault(gpu, {})
                continue
            if gpu is None:
                continue
            m_link = re.search(r"Link\s+(\d+).*?(Active|Inactive|Disabled|Off|Down)", line, re.I)
            if m_link:
                state = m_link.group(2)
                result[gpu][m_link.group(1)] = {
                    "state": state,
                    "active": state.lower() == "active",
                    "raw": line.strip(),
                }
                continue
            m_speed = re.search(r"Link\s+(\d+).*?([0-9.]+)\s*GB/s", line, re.I)
            if m_speed:
                result[gpu][m_speed.group(1)] = {
                    "state": "Active",
                    "active": True,
                    "raw": line.strip(),
                }
        return result

    @staticmethod
    def _parse_speeds(text: str) -> dict[str, dict[str, float]]:
        """Parse ``Link N ... X GB/s`` lines into ``{gpu: {link: speed}}``."""
        result: dict[str, dict[str, float]] = {}
        gpu = None
        for line in text.splitlines():
            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
            if m_gpu:
                gpu = m_gpu.group(1)
                result.setdefault(gpu, {})
                continue
            if gpu is None:
                continue
            m_link = re.search(r"Link\s+(\d+).*?([0-9.]+)\s*GB/s", line, re.I)
            if m_link:
                result[gpu][m_link.group(1)] = float(m_link.group(2))
        return result

    @staticmethod
    def _parse_errors(text: str) -> dict[str, dict[str, dict[str, int]]]:
        """Parse CRC / Replay / Recovery counters into ``{gpu: {link: {name: n}}}``.

        Counter names are stored lower-cased. A counter line is attributed to
        the most recent ``Link N`` seen under the current GPU; counter lines
        with no such link are skipped.
        """
        result: dict[str, dict[str, dict[str, int]]] = {}
        gpu = None
        link = None
        for line in text.splitlines():
            m_gpu = re.search(r"GPU\s+(\d+)", line, re.I)
            if m_gpu:
                gpu = m_gpu.group(1)
                result.setdefault(gpu, {})
                # Forget the previous GPU's link so a stray counter line cannot
                # index a link that was never registered under this GPU.
                link = None
                continue
            m_link = re.search(r"Link\s+(\d+)", line, re.I)
            if m_link and gpu is not None:
                link = m_link.group(1)
                result[gpu].setdefault(link, {})
            if gpu is None or link is None:
                continue
            for name in ("CRC", "Replay", "Recovery"):
                m = re.search(rf"{name}[^0-9]*(\d+)", line, re.I)
                if m:
                    result[gpu][link][name.lower()] = int(m.group(1))
        return result

    @staticmethod
    def print_results(results: dict, console: "Optional[Console]" = None):
        """Print the verdict and a per-GPU table; print only the error if one is set."""
        c = console or Console()
        if results.get("error"):
            c.print(f"[bold red]NVLink error: {results['error']}[/bold red]")
            return
        passed = results.get("passed", False)
        c.print("[bold green]✓ NVLink PASSED[/bold green]" if passed else "[bold red]✗ NVLink FAILED[/bold red]")
        table = Table(box=None, padding=(0, 1))
        table.add_column("GPU", style="bold")
        table.add_column("Active Links", justify="right")
        table.add_column("Issues")
        for g in results.get("gpus", []):
            issues = []
            if g.get("inactive_links"):
                issues.append("inactive=" + ",".join(g["inactive_links"]))
            if g.get("speed_issues"):
                issues.append(f"speed={len(g['speed_issues'])}")
            if g.get("error_issues"):
                issues.append(f"errors={len(g['error_issues'])}")
            table.add_row(str(g["gpu"]), f"{g['active_links']}/{g['expected_links']}", "; ".join(issues) or "OK")
        c.print(table)
Test | Result |") lines.append("|------|--------|") @@ -319,8 +334,6 @@ class ReportGenerator: if use_abs and thr: if val >= thr: status = "PASS" - elif val >= thr * 0.9: - status = "WARN" else: status = "FAIL" lines.append(f"| {dt.upper()} | {val:.1f} | {pk:.0f} | >= {thr} | {status} |") @@ -331,33 +344,194 @@ class ReportGenerator: overall_status = status lines.append("") if use_abs: + if any(not row.get("passed", False) for row in (comp_data.get("consistency", {}) or {}).values()): + overall_status = "FAIL" lines.append(f"**Verdict: {overall_status}** (absolute TFLOPS thresholds; worst efficiency {worst_eff:.1f}%)\n") else: overall_status = "PASS" if worst_eff >= 80 else ("WARN" if worst_eff >= 50 else "FAIL") lines.append(f"**Verdict: {overall_status}** (worst efficiency {worst_eff:.1f}%)\n") + consistency = comp_data.get("consistency", {}) or {} + if consistency: + lines.append("### Compute Consistency\n") + lines.append("| DType | Min | Mean | Max | Spread | Limit | Status |") + lines.append("|-------|-----|------|-----|--------|-------|--------|") + for dt, row in consistency.items(): + status = "PASS" if row.get("passed") else "FAIL" + lines.append( + f"| {dt.upper()} | {row.get('min_tflops', 0):.1f} | " + f"{row.get('mean_tflops', 0):.1f} | {row.get('max_tflops', 0):.1f} | " + f"{row.get('spread_pct', 0):.2f}% | <= {row.get('max_allowed_pct', 3)}% | {status} |" + ) + lines.append("") + + per_gpu = comp_data.get("per_gpu", []) or [] + dtype_order = [dt for dt in per_dtype.keys() if not isinstance(per_dtype.get(dt), str)] + if per_gpu and dtype_order: + lines.append("### Compute Per-GPU TFLOPS\n") + headers = ["GPU", *[dt.upper() for dt in dtype_order]] + lines.append("| " + " | ".join(headers) + " |") + lines.append("|" + "|".join(["---"] * len(headers)) + "|") + for row in per_gpu: + cells = [str(row.get("index", ""))] + for dt in dtype_order: + val = row.get(dt, "") + cells.append(f"{val:.1f}" if isinstance(val, (int, float)) else str(val)) + 
lines.append("| " + " | ".join(cells) + " |") + lines.append("") + + # --- NCCL --- + nvlink = results.get("nvlink") + if nvlink and not nvlink.get("error"): + lines.append("## NVLink/NVSwitch\n") + lines.append(f"**Overall: {'PASS' if nvlink.get('passed') else 'FAIL'}**\n") + lines.append("| GPU | Active Links | Issues |") + lines.append("|-----|--------------|--------|") + for g in nvlink.get("gpus", []): + issues = [] + if g.get("inactive_links"): + issues.append("inactive=" + ",".join(g["inactive_links"])) + if g.get("speed_issues"): + issues.append(f"speed issues={len(g['speed_issues'])}") + if g.get("error_issues"): + issues.append(f"errors={len(g['error_issues'])}") + lines.append(f"| {g.get('gpu')} | {g.get('active_links')}/{g.get('expected_links')} | {', '.join(issues) or 'OK'} |") + lines.append("") + elif nvlink and nvlink.get("error"): + lines.append("## NVLink/NVSwitch\n") + lines.append(f"**Overall: FAIL** ({nvlink.get('error')})\n") + + dcgm = results.get("dcgm") + if dcgm and not dcgm.get("error"): + lines.append("## DCGM Diagnostic\n") + lines.append(f"**Overall: {'PASS' if dcgm.get('passed') else 'FAIL'}**\n") + if dcgm.get("subtests"): + lines.append("| Subtest | Status |") + lines.append("|---------|--------|") + for s in dcgm.get("subtests", []): + lines.append(f"| {s.get('name', '')} | {s.get('status', '')} |") + lines.append("") + elif dcgm and dcgm.get("error"): + lines.append("## DCGM Diagnostic\n") + lines.append(f"**Overall: FAIL** ({dcgm.get('error')})\n") + # --- NCCL --- nccl = results.get("nccl") if nccl and not nccl.get("error"): lines.append("## NCCL Multi-GPU\n") lines.append(f"Source: {nccl.get('source', 'unknown')} | " f"GPUs: {nccl.get('gpu_count', '?')}\n") + if nccl.get("source") == "torchrun_fallback": + lines.append("> Functional NCCL smoke only: nccl-tests bus bandwidth was not measured, so this does not satisfy production acceptance.\n") tests = nccl.get("tests", {}) if tests: - lines.append("| Operation | Bus BW (GB/s) | 
Threshold | Status |") - lines.append("|-----------|---------------|-----------|--------|") + lines.append("> Summary reports the best Bus BW observed for each operation. PASS/FAIL is evaluated across every tested message size and repeat run shown in the detail table below.\n") + lines.append("| Operation | Best Bus BW (GB/s) | Failed Sizes | Threshold | Status |") + lines.append("|-----------|--------------------|--------------|-----------|--------|") for op, data in tests.items(): if isinstance(data, dict) and not data.get("error"): bw = data.get("best_busbw_gbps", 0) req = data.get("min_required_gbps", 0) status = data.get("status", "?") - lines.append(f"| {op} | {bw:.1f} | >= {req:.0f} | {status} |") + failed_sizes = [ + str(row.get("size", "?")) + for row in data.get("by_size", []) + if row.get("status") != "PASS" + ] + failed_sizes_text = ", ".join(failed_sizes) if failed_sizes else "-" + lines.append(f"| {op} | {bw:.1f} | {failed_sizes_text} | >= {_format_gbps(req)} | {status} |") elif isinstance(data, dict) and data.get("error"): - lines.append(f"| {op} | - | - | ERROR: {data['error']} |") + lines.append(f"| {op} | - | - | - | ERROR: {data['error']} |") lines.append("") + for op, data in tests.items(): + by_size = data.get("by_size", []) if isinstance(data, dict) else [] + if not by_size: + continue + lines.append(f"### NCCL {op} by size\n") + lines.append("| Size | Runs Bus BW (GB/s) | Worst | Mean | StdDev | Threshold | Status |") + lines.append("|------|---------------------|-------|------|--------|-----------|--------|") + for row in by_size: + runs = ", ".join(str(v) for v in row.get("runs_busbw_gbps", [])) + lines.append( + f"| {row.get('size', '')} | {runs} | " + f"{row.get('worst_busbw_gbps', 0):.1f} | " + f"{row.get('mean_busbw_gbps', 0):.1f} | " + f"{row.get('stddev_pct', 0):.2f}% | " + f">= {_format_gbps(data.get('min_required_gbps', 0))} | " + f"{row.get('status', '?')} |" + ) + lines.append("") passed = nccl.get("passed", False) 
lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n") + multinode = results.get("multinode_nccl") + if multinode and not multinode.get("error"): + lines.append("## Multi-node NCCL / Cross Leaf\n") + lines.append(f"Source: {multinode.get('source', 'unknown')} | Mode: {multinode.get('mode', 'unknown')}\n") + if multinode.get("artifact_dir"): + lines.append(f"- **Artifacts:** `{multinode.get('artifact_dir')}`") + hosts = multinode.get("hosts", []) + if hosts: + host_text = ", ".join(f"{h.get('name') or h.get('addr')}({h.get('addr')})" for h in hosts) + lines.append(f"- **Hosts:** {host_text}") + preflight = multinode.get("preflight", {}) + if preflight.get("checks"): + failed_checks = [c for c in preflight["checks"] if c.get("status") == "FAIL"] + warn_checks = [c for c in preflight["checks"] if c.get("status") == "WARN"] + lines.append(f"- **Preflight:** {'PASS' if not failed_checks else 'FAIL'}" + f"{f' ({len(warn_checks)} warnings)' if warn_checks else ''}") + lines.append("") + for op, data in (multinode.get("tests") or {}).items(): + lines.append(f"### Multi-node NCCL {op}\n") + lines.append("| Topology | CUDA Visible Devices | Peak Bus BW | Peak Size | Avg Bus BW | Threshold | Status |") + lines.append("|----------|----------------------|-------------|-----------|------------|-----------|--------|") + for topo in data.get("topologies", []): + threshold = topo.get("min_required_gbps", 0) or 0 + threshold_text = f">= {_format_gbps(threshold)} GB/s" if threshold else "-" + cuda_visible = topo.get("cuda_visible_devices") or "-" + lines.append( + f"| {topo.get('label', '')} | {cuda_visible} | {topo.get('peak_busbw_gbps', 0):.2f} GB/s | " + f"{topo.get('peak_size', '')} | {topo.get('avg_busbw_gbps', 0):.2f} GB/s | " + f"{threshold_text} | {topo.get('status', '?')} |" + ) + lines.append("") + diag_rows = [] + for topo in data.get("topologies", []): + net = topo.get("network") or {} + if net: + diag_rows.append((topo, net)) + if diag_rows: + lines.append("| 
Topology | NCCL Network | GPU Direct RDMA | GDR Enabled HCAs | GDR Disabled HCAs |") + lines.append("|----------|--------------|-----------------|------------------|-------------------|") + for topo, net in diag_rows: + networks = ", ".join(net.get("networks") or []) or "unknown" + gdr = net.get("gpu_direct_rdma", "UNKNOWN") + enabled = ", ".join(net.get("gdr_enabled_hcas") or []) or "-" + disabled = ", ".join(net.get("gdr_disabled_hcas") or []) or "-" + lines.append(f"| {topo.get('label', '')} | {networks} | {gdr} | {enabled} | {disabled} |") + lines.append("") + failed_topos = [topo for topo in data.get("topologies", []) if topo.get("status") == "FAIL"] + if failed_topos: + lines.append("| Topology | Return Code | Error / Output Tail |") + lines.append("|----------|-------------|---------------------|") + for topo in failed_topos: + tail = topo.get("error") or topo.get("stderr_tail") or topo.get("stdout_tail") or "" + tail = str(tail).replace("\n", " ").replace("|", "\\|")[-240:] + lines.append(f"| {topo.get('label', '')} | {topo.get('returncode', '')} | {tail} |") + lines.append("") + lines.append(f"**Overall: {'PASS' if multinode.get('passed') else 'FAIL'}**\n") + elif multinode and multinode.get("error"): + lines.append("## Multi-node NCCL / Cross Leaf\n") + lines.append(f"**Overall: FAIL** ({multinode.get('error')})\n") + preflight = multinode.get("preflight", {}) + if preflight.get("checks"): + lines.append("| Check | Status | Detail |") + lines.append("|-------|--------|--------|") + for check in preflight["checks"]: + detail = str(check.get("detail", "")).replace("\n", " ") + lines.append(f"| {check.get('name', '')} | {check.get('status', '')} | {detail} |") + lines.append("") + # --- Stress Test --- stress = results.get("stress") if stress and not stress.get("error"): @@ -368,6 +542,21 @@ class ReportGenerator: source = stress.get("source", "unknown") lines.append(f"- **Source:** {source}") lines.append(f"- **Duration:** {elapsed:.0f}s (requested 
{duration}s)") + telemetry = stress.get("telemetry") or {} + if telemetry: + lines.append(f"- **Telemetry samples:** {telemetry.get('samples', 0)}") + lines.append(f"- **Max temp:** {telemetry.get('max_temp_c', {})}") + lines.append(f"- **Avg power:** {telemetry.get('avg_power_w', {})}") + lines.append(f"- **Temp delta:** {telemetry.get('temp_delta_c', 'N/A')} C") + lines.append(f"- **TFLOPS jitter:** {telemetry.get('tflops_jitter_pct', 'N/A')}%") + lines.append(f"- **Steady TFLOPS samples:** {telemetry.get('steady_tflops_samples', 0)}") + lines.append(f"- **Throttle events:** {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}") + lines.append(f"- **XID events:** {len(telemetry.get('xid_events', []))}") + failures = telemetry.get("failures") or [] + if failures: + lines.append("- **Failure reasons:**") + for reason in failures: + lines.append(f" - {reason}") lines.append(f"- **Result: {'PASS' if passed else 'FAIL'}**") lines.append("") @@ -378,26 +567,70 @@ class ReportGenerator: lines.append(f"**Overall: SKIP** [{rdma.get('reason', 'no IB hardware detected')}]\n") elif rdma and not rdma.get("error"): lines.append("## RDMA/InfiniBand\n") + rdma_legacy_note = self._rdma_legacy_note(rdma) + if rdma_legacy_note: + lines.append(f"> {rdma_legacy_note}\n") + port_checks = rdma.get("port_checks", []) + if port_checks: + lines.append("### RDMA Port Checks\n") + lines.append("| Device | Port | State | Rate | Required | Status |") + lines.append("|--------|------|-------|------|----------|--------|") + for p in port_checks: + lines.append( + f"| {p.get('device', '')} | {p.get('port', '')} | " + f"{p.get('state', '')} | {p.get('rate', '')} | " + f">= {p.get('min_rate_gbps', 400):.0f}Gbps ACTIVE | {p.get('status', '?')} |" + ) + lines.append("") bw_tests = rdma.get("bandwidth_tests", []) lat_tests = rdma.get("latency_tests", []) - if bw_tests or lat_tests: + ibping_tests = rdma.get("ibping_tests", []) + if bw_tests or lat_tests or ibping_tests: 
lines.append("| Test | Value | Threshold | Status |") lines.append("|------|-------|-----------|--------|") for bt in bw_tests: - if not bt.get("error"): + if bt.get("error"): + lines.append(f"| {bt.get('test', 'ib_bw')} | {bt.get('error')} | required runnable test | {bt.get('status', 'FAIL')} |") + else: + threshold, status = self._rdma_bandwidth_verdict(bt) lines.append(f"| {bt['test']} | {bt.get('bandwidth_gbps', 0):.1f} GB/s | " - f">= {bt.get('min_required_gbps', 0)} GB/s | {bt.get('status', '?')} |") + f">= {threshold:g} GB/s | {status} |") for lt in lat_tests: - if not lt.get("error"): + if lt.get("error"): + lines.append(f"| {lt.get('test', 'ib_lat')} | {lt.get('error')} | required runnable test | {lt.get('status', 'FAIL')} |") + else: + threshold, status = self._rdma_latency_verdict(lt) lines.append(f"| {lt['test']} | {lt.get('latency_us', 0):.2f} us | " - f"<= {lt.get('max_allowed_us', 0)} us | {lt.get('status', '?')} |") + f"<= {threshold:g} us | {status} |") + for it in ibping_tests: + direction = it.get("direction") or it.get("role", "N/A") + if it.get("error"): + lines.append(f"| {it.get('test', 'ibping')} | {it.get('error')} | bidirectional peer evidence | {it.get('status', 'FAIL')} |") + else: + lines.append(f"| {it['test']} | {direction} target={it.get('target', 'N/A')} count={it.get('count', 'N/A')} | " + f"0% packet loss | {it.get('status', '?')} |") lines.append("") + fabric = rdma.get("fabric_counters") or {} + if fabric: + counters = fabric.get("counters", {}) + lines.append(f"- **PFC/ECN/CNP/congestion counters checked:** {len(counters)}") + lines.append(f"- **PFC/ECN/CNP/congestion non-zero:** {'yes' if fabric.get('failed') else 'no'}") + if not counters: + lines.append("- **PFC/ECN/CNP/congestion evidence:** missing") + failures = rdma.get("failures") or [] + if not failures: + failures = self._rdma_failure_reasons(rdma) + if failures: + lines.append("- **Failure reasons:**") + for reason in failures: + lines.append(f" - {reason}") passed = 
rdma.get("passed", False) lines.append(f"**Overall: {'PASS' if passed else 'FAIL'}**\n") # --- Training --- training = results.get("training") if training and not training.get("error"): + training_status, training_detail, training_missing = self._training_verdict(training) lines.append("## Training Simulation\n") lines.append("| Metric | Value |") lines.append("|--------|-------|") @@ -405,8 +638,14 @@ class ReportGenerator: lines.append(f"| Params | {training.get('total_params_m', 0):.1f}M |") lines.append(f"| Throughput | {training.get('throughput_tokens_per_sec', 0):.0f} tokens/sec |") lines.append(f"| Avg Step Time | {training.get('avg_step_time_ms', 0):.1f} ms |") + lines.append(f"| Warmup Steps | {training.get('warmup_steps', 'N/A')} |") lines.append(f"| Peak Memory | {training.get('peak_memory_gb', 0):.1f} GB |") lines.append(f"| Final Loss | {training.get('final_loss', 'N/A')} |") + lines.append(f"| Step Jitter | {training.get('step_jitter_pct', 'N/A')}% |") + lines.append(f"| Distributed Mode | {training.get('distributed_mode', 'N/A')} |") + if training_missing: + lines.append(f"| Acceptance Gaps | missing {', '.join(training_missing)} |") + lines.append(f"| Verdict | {training_status} ({training_detail}) |") lines.append("") # --- Footer --- @@ -441,6 +680,114 @@ class ReportGenerator: return bench["compute"] return {} + @staticmethod + def _training_verdict(training: dict) -> tuple[str, str, list[str]]: + """Return report status for both current and legacy training result schemas.""" + tps = float(training.get("throughput_tokens_per_sec", 0) or 0) + if "passed" in training: + status = "PASS" if training.get("passed") else "FAIL" + return status, f"{tps:.0f} tokens/sec", [] + + required = ["passed", "step_jitter_pct", "distributed_mode", "loss_finite"] + missing = [k for k in required if k not in training] + return "UNVERIFIED", f"{tps:.0f} tokens/sec; legacy result lacks explicit acceptance verdict", missing + + def _rdma_cfg_value(self, key: str, 
default: float) -> float: + try: + return float((self.config.get("rdma", {}) or {}).get(key, default)) + except (TypeError, ValueError): + return default + + def _rdma_bandwidth_verdict(self, row: dict) -> tuple[float, str]: + threshold = self._rdma_cfg_value("min_bandwidth_gbps", 47.0) + value = float(row.get("bandwidth_gbps", 0) or 0) + return threshold, "PASS" if value >= threshold else "FAIL" + + def _rdma_latency_verdict(self, row: dict) -> tuple[float, str]: + name = row.get("test", "") + if name == "ib_write_lat": + threshold = self._rdma_cfg_value("max_write_latency_us", 2.0) + elif name == "ib_read_lat": + threshold = self._rdma_cfg_value("max_read_latency_us", 3.5) + else: + threshold = self._rdma_cfg_value("max_latency_us", 3.5) + value = float(row.get("latency_us", 0) or 0) + return threshold, "PASS" if 0 < value <= threshold else "FAIL" + + def _rdma_legacy_note(self, rdma: dict) -> str: + """Flag old RDMA result schemas whose embedded thresholds were looser.""" + for row in rdma.get("bandwidth_tests", []) or []: + if row.get("min_required_gbps") != self._rdma_cfg_value("min_bandwidth_gbps", 47.0): + return ( + "Legacy RDMA result re-evaluated with current PDF acceptance thresholds; " + "old WARN statuses and old 50GB/s/10us limits are not used for verdict." + ) + for row in rdma.get("latency_tests", []) or []: + threshold, _ = self._rdma_latency_verdict(row) + if row.get("max_allowed_us") != threshold: + return ( + "Legacy RDMA result re-evaluated with current PDF acceptance thresholds; " + "old WARN statuses and old 50GB/s/10us limits are not used for verdict." 
+ ) + return "" + + def _rdma_failure_reasons(self, rdma: dict) -> list[str]: + failures = [] + for row in rdma.get("bandwidth_tests", []) or []: + threshold, status = self._rdma_bandwidth_verdict(row) + if status != "PASS": + failures.append( + f"{row.get('test')} bandwidth {row.get('bandwidth_gbps', 0)}GB/s < {threshold:g}GB/s" + ) + for row in rdma.get("latency_tests", []) or []: + threshold, status = self._rdma_latency_verdict(row) + if status != "PASS": + failures.append( + f"{row.get('test')} latency {row.get('latency_us', 0)}us > {threshold:g}us" + ) + for row in rdma.get("ibping_tests", []) or []: + if row.get("status") != "PASS": + failures.append(f"{row.get('test')} failed") + return failures + + @staticmethod + def _overall_acceptance_verdict(summary_items: list[tuple[str, str]]) -> tuple[str, list[tuple[str, str]], list[str]]: + """PDF-style verdict for the report scope. + + Full-suite reports require every single-node acceptance item. Standalone + reports, such as `--test multinode-nccl`, should only judge the items + that were actually requested instead of reporting unrelated evidence as + missing. 
+ """ + single_node_required = [ + "GPU Info", + "Health Check", + "Memory Bandwidth", + "Compute Throughput", + "NVLink/NVSwitch", + "NCCL", + "Stress Test", + "RDMA", + "DCGM", + "Training", + ] + status_by_name = dict(summary_items) + present_single_node = [name for name in single_node_required if name in status_by_name] + if len(present_single_node) >= 3: + required = list(single_node_required) + if "Multi-node NCCL" in status_by_name: + required.append("Multi-node NCCL") + else: + required = list(status_by_name) + missing = [name for name in required if name not in status_by_name] + failures = [ + (name, status) + for name, status in summary_items + if name in required and not str(status).startswith("PASS") + ] + verdict = "PASS" if not missing and not failures else "FAIL" + return verdict, failures, missing + def _build_summary(self, results: dict) -> list[tuple[str, str]]: """Build summary verdict list from results.""" items = [] @@ -473,7 +820,7 @@ class ReportGenerator: d2d = mem.get("d2d_bandwidth_gbps") or 0 items.append(("Memory Bandwidth", f"WARN ({d2d:.0f} GB/s via PyTorch fallback)")) else: - eff = mem.get("efficiency_pct") or 0 + eff = mem.get("d2d_efficiency_pct") or mem.get("efficiency_pct") or 0 verdict = "PASS" if eff >= 80 else ("WARN" if eff >= 60 else "FAIL") items.append(("Memory Bandwidth", f"{verdict} ({eff:.1f}%)")) @@ -491,25 +838,43 @@ class ReportGenerator: rank = {"PASS": 0, "WARN": 1, "FAIL": 2} worst_status = "PASS" worst_dt = None + lowest_margin = None for dt, thr in pass_thresholds.items(): val = per_dtype.get(dt) if not isinstance(val, (int, float)): continue if val >= thr: st = "PASS" - elif val >= thr * 0.9: - st = "WARN" else: st = "FAIL" + margin = val / thr if thr else 0 + if lowest_margin is None or margin < lowest_margin: + lowest_margin = margin + worst_dt = dt if rank[st] > rank[worst_status]: worst_status = st - worst_dt = dt if worst_dt: - items.append(( - "Compute Throughput", - f"{worst_status} (worst 
{worst_dt.upper()} " - f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})" - )) + consistency = comp.get("consistency", {}) or {} + failed_consistency = [ + (dt, row) + for dt, row in consistency.items() + if not row.get("passed", False) + ] + if failed_consistency: + worst_status = "FAIL" + fail_dt, fail_row = failed_consistency[0] + items.append(( + "Compute Throughput", + f"FAIL ({fail_dt.upper()} spread " + f"{fail_row.get('spread_pct', 0):.2f}% > " + f"{fail_row.get('max_allowed_pct', 3)}%)" + )) + else: + items.append(( + "Compute Throughput", + f"{worst_status} (worst {worst_dt.upper()} " + f"{per_dtype[worst_dt]:.0f} vs >= {pass_thresholds[worst_dt]})" + )) else: items.append(("Compute Throughput", f"{worst_status}")) else: @@ -521,16 +886,46 @@ class ReportGenerator: else: items.append(("Compute Throughput", "N/A")) + # NCCL + if "nvlink" in results: + nvl = results["nvlink"] + if nvl.get("error"): + items.append(("NVLink/NVSwitch", f"ERROR: {nvl['error']}")) + elif nvl.get("passed"): + items.append(("NVLink/NVSwitch", "PASS")) + else: + items.append(("NVLink/NVSwitch", "FAIL")) + + if "dcgm" in results: + d = results["dcgm"] + if d.get("error"): + items.append(("DCGM", f"ERROR: {d['error']}")) + elif d.get("passed"): + items.append(("DCGM", "PASS")) + else: + items.append(("DCGM", "FAIL")) + # NCCL if "nccl" in results: n = results["nccl"] if n.get("error"): items.append(("NCCL", f"ERROR: {n['error']}")) + elif n.get("source") == "torchrun_fallback": + items.append(("NCCL", "FAIL (no nccl-tests bus BW)")) elif n.get("passed"): items.append(("NCCL", "PASS")) else: items.append(("NCCL", "FAIL")) + if "multinode_nccl" in results: + mn = results["multinode_nccl"] + if mn.get("error"): + items.append(("Multi-node NCCL", f"ERROR: {mn['error']}")) + elif mn.get("passed"): + items.append(("Multi-node NCCL", "PASS")) + else: + items.append(("Multi-node NCCL", "FAIL")) + # Stress if "stress" in results: s = results["stress"] @@ -559,7 +954,17 @@ class 
ReportGenerator: if t.get("error"): items.append(("Training", f"ERROR: {t['error']}")) else: - tps = t.get("throughput_tokens_per_sec", 0) - items.append(("Training", f"PASS ({tps:.0f} tokens/sec)")) + status, detail, _missing = self._training_verdict(t) + items.append(("Training", f"{status} ({detail})")) return items + + +def _format_gbps(value) -> str: + try: + numeric = float(value) + except (TypeError, ValueError): + return str(value) + if numeric.is_integer(): + return f"{numeric:.0f}" + return f"{numeric:.2f}" diff --git a/modules/stress_test.py b/modules/stress_test.py index 8b69d1c..460b3b1 100644 --- a/modules/stress_test.py +++ b/modules/stress_test.py @@ -1,9 +1,10 @@ -"""GPU stress test module — wraps gpu-burn for long-running stability tests.""" +"""GPU stress test module — gpu-burn or PyTorch GEMM with telemetry.""" import glob import os import shutil import subprocess +import threading import time from datetime import datetime @@ -46,7 +47,7 @@ class StressTest: memory_pct = cfg.get("memory_pct", 90) target_gpus = cfg.get("gpus", "all") - gpu_burn = self._find_gpu_burn() + gpu_burn = self._find_gpu_burn() if cfg.get("use_gpu_burn", False) else "" if gpu_burn: # Try gpu-burn first @@ -60,7 +61,7 @@ class StressTest: return result - self.console.print("[yellow]gpu_burn not found, using PyTorch stress test[/yellow]") + self.console.print("[yellow]Using PyTorch stress test[/yellow]") return self._run_pytorch_stress(duration_sec, memory_pct) def _run_gpu_burn(self, gpu_burn: str, duration: int, @@ -77,12 +78,26 @@ class StressTest: cmd.append(str(duration)) t0 = time.time() + xid_before = self._collect_xid_events() + interval = int(self.stress_cfg.get("telemetry_interval_sec", 1)) + telemetry = [] + stop_sampling = threading.Event() + sampler = threading.Thread( + target=self._sample_telemetry, + args=(telemetry, stop_sampling, interval), + daemon=True, + ) + sampler.start() try: r = subprocess.run(cmd, capture_output=True, text=True, timeout=duration + 
120) elapsed = round(time.time() - t0, 1) + stop_sampling.set() + sampler.join(timeout=interval + 1) output = r.stdout + r.stderr - passed = r.returncode == 0 + xid_events = self._new_xid_events(xid_before, self._collect_xid_events()) + telemetry_summary = self._evaluate_telemetry(telemetry, [], xid_events) + passed = r.returncode == 0 and telemetry_summary.get("passed", False) gpu_results = [] for line in output.split("\n"): @@ -96,25 +111,36 @@ class StressTest: "duration_sec": duration, "elapsed_sec": elapsed, "gpu_results": gpu_results, + "telemetry": telemetry_summary, "raw_output_tail": output[-500:] if output else "", "timestamp": datetime.now().isoformat(), } except subprocess.TimeoutExpired: + stop_sampling.set() return { "source": "gpu-burn", "passed": False, "duration_sec": duration, "error": "timeout", + "telemetry": self._evaluate_telemetry( + telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events()) + ), "timestamp": datetime.now().isoformat(), } except Exception as e: + stop_sampling.set() return { "source": "gpu-burn", "passed": False, "error": str(e), + "telemetry": self._evaluate_telemetry( + telemetry, [], self._new_xid_events(xid_before, self._collect_xid_events()) + ), "timestamp": datetime.now().isoformat(), } + finally: + stop_sampling.set() def _run_pytorch_stress(self, duration: int, memory_pct: int = 90) -> dict: try: @@ -127,58 +153,79 @@ class StressTest: gpu_count = torch.cuda.device_count() self.console.print(f"[cyan]PyTorch Stress Test ({duration}s, {gpu_count} GPUs, target {memory_pct}% memory)[/cyan]") + dtype_name = self.stress_cfg.get("dtype", "bf16") + matrix_size = int(self.stress_cfg.get("matrix_size", 8192)) + interval = int(self.stress_cfg.get("telemetry_interval_sec", 1)) + dtype_map = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32} + dtype = dtype_map.get(dtype_name, torch.bfloat16) + gpu_status = {} + telemetry = [] + stop_sampling = threading.Event() t0 = time.time() + xid_before = 
self._collect_xid_events() try: + sampler = threading.Thread( + target=self._sample_telemetry, + args=(telemetry, stop_sampling, interval), + daemon=True, + ) + sampler.start() tensors = {} + ballast = {} + pass_tflops = [] for i in range(gpu_count): with torch.cuda.device(i): - # Get actual free memory (accounting for other processes) free_mem, total_mem = torch.cuda.mem_get_info(i) - - # Calculate allocation from configured memory_pct - target_mem = int(total_mem * memory_pct / 100) - - # Cap at actual free memory with 5% safety margin - alloc_bytes = min(target_mem, int(free_mem * 0.95)) - - # matmul(A, A.T) needs 2x input memory (input + output) - mem_side = int((alloc_bytes / 4 / 2) ** 0.5) - # Cap compute matrix so a single matmul completes in ~2s on H100/H200 - # (FP32 ≈ 67 TFLOPS → 2*4096³/67e12 ≈ 2s). Without this cap, a 141GB - # HBM yields side ≈ 131K → single matmul ~68s × 8 GPUs serial → loop - # overshoots a 60s duration request by 10×+. - MAX_COMPUTE_SIDE = 4096 - side = min(mem_side, MAX_COMPUTE_SIDE) - - actual_mem_mb = side * side * 4 / 1024 / 1024 + side = matrix_size + elem = torch.tensor([], dtype=dtype).element_size() + compute_bytes = side * side * elem * 3 + target_mem = min(int(total_mem * memory_pct / 100), int(free_mem * 0.90)) + ballast_bytes = max(0, target_mem - compute_bytes) + if ballast_bytes: + ballast_elems = ballast_bytes // 2 + ballast[i] = torch.empty(ballast_elems, device=f"cuda:{i}", dtype=torch.float16) + actual_mem_mb = (compute_bytes + ballast_bytes) / 1024 / 1024 total_mem_mb = total_mem / 1024 / 1024 free_mem_mb = free_mem / 1024 / 1024 - + self.console.print( f" [dim]GPU {i}: total {total_mem_mb:.0f}MB, free {free_mem_mb:.0f}MB, " f"alloc {actual_mem_mb:.0f}MB ({actual_mem_mb/total_mem_mb*100:.0f}%) - " - f"matrix {side}x{side}[/dim]" + f"{dtype_name} matrix {side}x{side}[/dim]" + ) + tensors[i] = ( + torch.randn(side, side, device=f"cuda:{i}", dtype=dtype), + torch.randn(side, side, device=f"cuda:{i}", dtype=dtype), + 
torch.empty(side, side, device=f"cuda:{i}", dtype=dtype), ) - tensors[i] = torch.randn(side, side, device=f"cuda:{i}", dtype=torch.float32) self.console.print(f"\n[cyan]Starting stress test for {duration} seconds...[/cyan]") elapsed_check = 0 while time.time() - t0 < duration: + loop_start = time.perf_counter() # Dispatch matmul on all GPUs in parallel — do NOT synchronize between # GPUs, otherwise the 8 GPUs run serially and overshoot the duration. for i in range(gpu_count): with torch.cuda.device(i): - tensors[i] = torch.matmul(tensors[i], tensors[i].T) + a, b, out = tensors[i] + torch.matmul(a, b, out=out) # Single sync per pass — waits for all 8 streams concurrently for i in range(gpu_count): with torch.cuda.device(i): torch.cuda.synchronize() + loop_elapsed = time.perf_counter() - loop_start + current_elapsed = time.time() - t0 + if loop_elapsed > 0: + flops = gpu_count * 2 * (matrix_size ** 3) + pass_tflops.append({ + "elapsed_sec": current_elapsed, + "tflops": flops / loop_elapsed / 1e12, + }) # Show progress every 10 seconds - current_elapsed = time.time() - t0 if int(current_elapsed) != int(elapsed_check) and int(current_elapsed) % 10 == 0: self.console.print(f" [dim]Running {int(current_elapsed)}s / {duration}s[/dim]") elapsed_check = current_elapsed @@ -198,21 +245,196 @@ class StressTest: "duration_sec": duration, "error": error_msg, "gpu_status": gpu_status, + "telemetry": self._evaluate_telemetry( + telemetry, pass_tflops if "pass_tflops" in locals() else [], + self._new_xid_events(xid_before, self._collect_xid_events()), + ), } finally: + stop_sampling.set() tensors.clear() + ballast.clear() torch.cuda.empty_cache() elapsed = round(time.time() - t0, 1) + xid_events = self._new_xid_events(xid_before, self._collect_xid_events()) + telemetry_summary = self._evaluate_telemetry(telemetry, pass_tflops, xid_events) + passed = all(v == "PASS" for v in gpu_status.values()) and telemetry_summary.get("passed", False) return { "source": "pytorch", - "passed": 
True, + "passed": passed, "duration_sec": duration, "elapsed_sec": elapsed, "gpu_status": gpu_status, + "telemetry": telemetry_summary, "timestamp": datetime.now().isoformat(), } + def _sample_telemetry(self, telemetry: list, stop_event: threading.Event, interval: int): + query = "index,temperature.gpu,power.draw,clocks_throttle_reasons.active" + while not stop_event.is_set(): + try: + r = subprocess.run( + ["nvidia-smi", f"--query-gpu={query}", "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=10, + ) + if r.returncode == 0: + sample = {"time": time.time(), "gpus": []} + for line in r.stdout.splitlines(): + parts = [p.strip() for p in line.split(",")] + if len(parts) >= 4: + sample["gpus"].append({ + "index": int(parts[0]), + "temp_c": float(parts[1]), + "power_w": float(parts[2]), + "throttle": parts[3], + }) + telemetry.append(sample) + except Exception: + pass + stop_event.wait(interval) + + def _collect_xid_events(self) -> list[str]: + try: + r = subprocess.run( + ["dmesg", "--color=never"], + capture_output=True, text=True, timeout=10, + ) + if r.returncode != 0: + return [] + return [ + line.strip() + for line in r.stdout.splitlines() + if any(token in line.upper() for token in ("XID", "NVRM: XID")) + ] + except Exception: + return [] + + @staticmethod + def _new_xid_events(before: list[str], after: list[str]) -> list[str]: + seen = set(before) + return [line for line in after if line not in seen] + + def _evaluate_telemetry(self, telemetry: list, pass_tflops: list, xid_events: list[str] | None = None) -> dict: + cfg = self.stress_cfg + max_temp = float(cfg.get("max_temp_c", 80)) + max_delta = float(cfg.get("max_temp_delta_c", 5)) + min_power = float(cfg.get("min_power_watts", 630)) + max_jitter = float(cfg.get("max_tflops_jitter_pct", 5)) + require_jitter = bool(cfg.get("require_tflops_jitter", True)) + duration = float(cfg.get("duration_sec", 60)) + requested_warmup = float(cfg.get("warmup_sec", 60)) + warmup_sec = 
min(requested_warmup, max(0.0, duration * 0.2)) + min_steady_samples = int(cfg.get("min_steady_samples", 10)) + temps = {} + powers = {} + throttle_bad = [] + xid_events = xid_events or [] + steady_telemetry = [ + sample for sample in telemetry + if sample.get("time", 0) - telemetry[0].get("time", 0) >= warmup_sec + ] if telemetry else [] + evaluation_samples = steady_telemetry if len(steady_telemetry) >= min_steady_samples else telemetry + for sample in evaluation_samples: + for g in sample.get("gpus", []): + idx = g["index"] + temps.setdefault(idx, []).append(g["temp_c"]) + powers.setdefault(idx, []).append(g["power_w"]) + try: + bitmask = int(str(g["throttle"]), 16) + except ValueError: + bitmask = 0 + real_throttle = bitmask & ~0x1 + if real_throttle: + throttle_bad.append({ + "gpu": idx, + "throttle": g["throttle"], + "real_throttle": f"0x{real_throttle:x}", + }) + max_temps = {idx: max(vals) for idx, vals in temps.items() if vals} + avg_powers = {idx: sum(vals) / len(vals) for idx, vals in powers.items() if vals} + temp_delta = (max(max_temps.values()) - min(max_temps.values())) if len(max_temps) >= 2 else 0 + jitter = 0 + steady_tflops = [] + for item in pass_tflops: + if isinstance(item, dict): + if float(item.get("elapsed_sec", 0)) >= warmup_sec: + steady_tflops.append(float(item.get("tflops", 0))) + else: + steady_tflops.append(float(item)) + if len(steady_tflops) < 2 and pass_tflops: + steady_tflops = [ + float(item.get("tflops", 0)) if isinstance(item, dict) else float(item) + for item in pass_tflops + ] + if steady_tflops: + mean = sum(steady_tflops) / len(steady_tflops) + jitter = max(abs(v - mean) / mean * 100 for v in steady_tflops) if mean else 0 + failures = [] + temp_failures = {idx: v for idx, v in max_temps.items() if v > max_temp} + power_failures = {idx: v for idx, v in avg_powers.items() if v < min_power} + if not evaluation_samples: + failures.append("no telemetry samples available for evaluation") + if temp_failures: + failures.append( + 
"max temperature above threshold: " + + ", ".join(f"GPU {idx} {val:.1f}C" for idx, val in sorted(temp_failures.items())) + ) + if temp_delta > max_delta: + failures.append(f"GPU temperature delta {temp_delta:.1f}C exceeds {max_delta:.1f}C") + if power_failures: + failures.append( + "average steady-state power below threshold: " + + ", ".join(f"GPU {idx} {val:.1f}W" for idx, val in sorted(power_failures.items())) + ) + if throttle_bad: + failures.append( + f"non-idle throttle reasons observed in {len(throttle_bad)} samples " + f"(first: GPU {throttle_bad[0]['gpu']} {throttle_bad[0]['real_throttle']})" + ) + if xid_events: + failures.append(f"{len(xid_events)} new XID/NVRM XID events observed") + if require_jitter and len(steady_tflops) < 2: + failures.append( + f"insufficient steady TFLOPS samples for jitter evaluation: {len(steady_tflops)} < 2" + ) + if jitter > max_jitter: + failures.append(f"TFLOPS jitter {jitter:.2f}% exceeds {max_jitter:.2f}%") + passed = ( + bool(evaluation_samples) + and all(v <= max_temp for v in max_temps.values()) + and temp_delta <= max_delta + and all(v >= min_power for v in avg_powers.values()) + and not throttle_bad + and not xid_events + and (not require_jitter or len(steady_tflops) >= 2) + and jitter <= max_jitter + ) + return { + "passed": passed, + "samples": len(telemetry), + "steady_samples": len(evaluation_samples), + "warmup_sec": round(warmup_sec, 1), + "max_temp_c": {k: round(v, 1) for k, v in max_temps.items()}, + "avg_power_w": {k: round(v, 1) for k, v in avg_powers.items()}, + "temp_delta_c": round(temp_delta, 1), + "throttle_events": throttle_bad[:20], + "throttle_event_count": len(throttle_bad), + "xid_events": xid_events[-20:], + "tflops_jitter_pct": round(jitter, 2), + "steady_tflops_samples": len(steady_tflops), + "failures": failures, + "thresholds": { + "max_temp_c": max_temp, + "max_temp_delta_c": max_delta, + "min_power_w": min_power, + "max_tflops_jitter_pct": max_jitter, + "require_tflops_jitter": 
require_jitter, + "warmup_sec": requested_warmup, + "min_steady_samples": min_steady_samples, + }, + } + @staticmethod def print_results(results: dict, console: Console = None): c = console or Console() @@ -245,5 +467,21 @@ class StressTest: color = "green" if status == "PASS" else "red" c.print(f" GPU {gid}: [{color}]{status}[/{color}]") + telemetry = results.get("telemetry") or {} + if telemetry: + c.print("\n Telemetry:") + c.print(f" Samples: {telemetry.get('samples', 0)} total, {telemetry.get('steady_samples', 0)} evaluated after {telemetry.get('warmup_sec', 0)}s warmup") + c.print(f" Avg steady power: {telemetry.get('avg_power_w', {})}") + c.print(f" Max steady temp: {telemetry.get('max_temp_c', {})}") + c.print(f" Temp delta: {telemetry.get('temp_delta_c', 'N/A')} C") + c.print(f" TFLOPS jitter: {telemetry.get('tflops_jitter_pct', 'N/A')}%") + c.print(f" Throttle events: {telemetry.get('throttle_event_count', len(telemetry.get('throttle_events', [])))}") + c.print(f" XID events: {len(telemetry.get('xid_events', []))}") + failures = telemetry.get("failures", []) + if failures: + c.print(" [red]Failure reasons:[/red]") + for reason in failures: + c.print(f" [red]- {reason}[/red]") + if results.get("error"): c.print(f" [red]Error: {results['error']}[/red]") diff --git a/modules/training_sim.py b/modules/training_sim.py index dc7f5a3..af93850 100644 --- a/modules/training_sim.py +++ b/modules/training_sim.py @@ -1,8 +1,13 @@ """Training simulation module - LLM training workload with PyTorch.""" +import json +import os +import sys +import tempfile import time import subprocess import shutil +import math from datetime import datetime from typing import Optional @@ -36,6 +41,7 @@ class TrainingSim: batch_size = self.train_cfg.get("batch_size", 8) seq_length = self.train_cfg.get("seq_length", 2048) num_steps = self.train_cfg.get("num_steps", 50) + warmup_steps = int(self.train_cfg.get("warmup_steps", 5)) dtype_str = self.train_cfg.get("dtype", "bf16") dtype_map = { 
@@ -47,7 +53,13 @@ class TrainingSim: self.console.print(f"[cyan]Training Simulation[/cyan]") self.console.print(f" Model: {model_name} | Batch: {batch_size} | Seq: {seq_length} | " - f"DType: {dtype_str} | Steps: {num_steps} | GPUs: {gpu_count}") + f"DType: {dtype_str} | Steps: {num_steps} | Warmup: {warmup_steps} | GPUs: {gpu_count}") + + if self.train_cfg.get("mode", "ddp") == "ddp" and gpu_count > 1: + ddp_result = self._run_synthetic_ddp(gpu_count, batch_size, seq_length, num_steps, dtype_str) + if ddp_result.get("passed") or not self.train_cfg.get("allow_fallback", False): + return ddp_result + self.console.print("[yellow]DDP synthetic training failed, falling back to single-process synthetic path[/yellow]") try: from transformers import AutoModelForCausalLM, AutoTokenizer @@ -87,9 +99,10 @@ class TrainingSim: BarColumn(), TextColumn("{task.completed}/{task.total}"), TimeElapsedColumn(), console=self.console, ) as progress: - task = progress.add_task("Training steps...", total=num_steps) + total_steps = num_steps + warmup_steps + task = progress.add_task("Training steps...", total=total_steps) - for step in range(num_steps): + for step in range(total_steps): torch.cuda.synchronize() t0 = time.perf_counter() @@ -119,8 +132,15 @@ class TrainingSim: progress.advance(task) - avg_step_time = sum(step_times) / len(step_times) + measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times + avg_step_time = sum(measured_steps) / len(measured_steps) throughput = batch_size * seq_length / avg_step_time + jitter = self._jitter_pct(measured_steps) + peak_mem = round(max(mem_usage) if mem_usage else 0, 2) + final_loss = float(loss.item()) if hasattr(loss, "item") else float("nan") + passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss) + if self.train_cfg.get("require_distributed", True): + passed = False return { "model": model_name, @@ -130,11 +150,18 @@ class TrainingSim: "batch_size": batch_size, "seq_length": 
seq_length, "num_steps": num_steps, + "warmup_steps": warmup_steps, + "total_steps": total_steps, "avg_step_time_ms": round(avg_step_time * 1000, 1), "throughput_tokens_per_sec": round(throughput, 0), "throughput_samples_per_sec": round(batch_size / avg_step_time, 2), - "peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2), - "final_loss": round(loss.item(), 4) if hasattr(loss, 'item') else None, + "peak_memory_gb": peak_mem, + "final_loss": round(final_loss, 4), + "step_jitter_pct": round(jitter, 2), + "distributed_mode": "device_map", + "loss_finite": math.isfinite(final_loss), + "passed": passed, + "acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "", "timestamp": datetime.now().isoformat(), } @@ -142,6 +169,196 @@ class TrainingSim: self.console.print(f"[yellow]Model loading failed: {e}[/yellow]") return self._run_synthetic(gpu_count, batch_size, seq_length, num_steps, dtype) + def _run_synthetic_ddp(self, gpu_count: int, batch_size: int, seq_length: int, + num_steps: int, dtype_str: str) -> dict: + """Run the 1.5B synthetic Transformer with one process per GPU.""" + torchrun = os.path.join(os.path.dirname(sys.executable), "torchrun") + if not os.path.isfile(torchrun): + torchrun = shutil.which("torchrun") or "" + if not torchrun: + return { + "model": "synthetic_transformer_1.5b", + "gpu_count": gpu_count, + "distributed_mode": "ddp", + "passed": False, + "error": "torchrun not found", + "timestamp": datetime.now().isoformat(), + } + + script = r''' +import json +import math +import os +import time +import torch +import torch.distributed as dist +from torch.nn.parallel import DistributedDataParallel as DDP + +def main(): + local_rank = int(os.environ["LOCAL_RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + torch.cuda.set_device(local_rank) + dist.init_process_group("nccl") + + global_batch = int(os.environ["TRAIN_BATCH_SIZE"]) + local_batch = max(1, global_batch // world_size) + seq_length = 
int(os.environ["TRAIN_SEQ_LENGTH"]) + num_steps = int(os.environ["TRAIN_NUM_STEPS"]) + warmup_steps = int(os.environ.get("TRAIN_WARMUP_STEPS", "5")) + total_steps = num_steps + warmup_steps + dtype_name = os.environ.get("TRAIN_DTYPE", "bf16") + dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}.get(dtype_name, torch.bfloat16) + + hidden_size = 4096 + num_layers = 6 + num_heads = 32 + vocab_size = 32000 + + class SyntheticTransformer(torch.nn.Module): + def __init__(self): + super().__init__() + self.embed = torch.nn.Embedding(vocab_size, hidden_size) + self.layers = torch.nn.ModuleList([ + torch.nn.TransformerEncoderLayer( + d_model=hidden_size, + nhead=num_heads, + dim_feedforward=hidden_size * 4, + batch_first=True, + dtype=dtype, + ) for _ in range(num_layers) + ]) + self.head = torch.nn.Linear(hidden_size, vocab_size, dtype=dtype) + + def forward(self, x): + h = self.embed(x).to(dtype) + for layer in self.layers: + h = layer(h) + return self.head(h) + + model = SyntheticTransformer().cuda() + total_params = sum(p.numel() for p in model.parameters()) + model = DDP(model, device_ids=[local_rank], output_device=local_rank) + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) + input_ids = torch.randint(0, vocab_size, (local_batch, seq_length), device="cuda") + step_times = [] + last_loss = torch.tensor(float("nan"), device="cuda") + torch.cuda.reset_peak_memory_stats(local_rank) + + for _ in range(total_steps): + torch.cuda.synchronize() + t0 = time.perf_counter() + with torch.amp.autocast("cuda", dtype=dtype, enabled=dtype in (torch.float16, torch.bfloat16)): + logits = model(input_ids) + loss = torch.nn.functional.cross_entropy(logits.reshape(-1, vocab_size), input_ids.reshape(-1)) + loss.backward() + optimizer.step() + optimizer.zero_grad(set_to_none=True) + torch.cuda.synchronize() + step_times.append(time.perf_counter() - t0) + last_loss = loss.detach() + + peak_mem = torch.tensor(torch.cuda.max_memory_allocated(local_rank) 
/ 1024**3, device="cuda") + dist.all_reduce(peak_mem, op=dist.ReduceOp.MAX) + finite = torch.tensor(1 if math.isfinite(float(last_loss.item())) else 0, device="cuda") + dist.all_reduce(finite, op=dist.ReduceOp.MIN) + + if dist.get_rank() == 0: + measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times + avg_step = sum(measured_steps) / len(measured_steps) + mean = avg_step + jitter = max(abs(v - mean) / mean * 100 for v in measured_steps) if mean else 0.0 + throughput = global_batch * seq_length / avg_step if avg_step else 0.0 + print("TRAINING_DDP_JSON=" + json.dumps({ + "model": "synthetic_transformer_1.5b", + "total_params_m": round(total_params / 1e6, 1), + "num_layers": num_layers, + "hidden_size": hidden_size, + "gpu_count": world_size, + "dtype": dtype_name, + "batch_size": global_batch, + "local_batch_size": local_batch, + "seq_length": seq_length, + "num_steps": num_steps, + "warmup_steps": warmup_steps, + "total_steps": total_steps, + "avg_step_time_ms": round(avg_step * 1000, 1), + "throughput_tokens_per_sec": round(throughput, 0), + "throughput_samples_per_sec": round(global_batch / avg_step, 2) if avg_step else 0, + "peak_memory_gb": round(float(peak_mem.item()), 2), + "final_loss": round(float(last_loss.item()), 4), + "step_jitter_pct": round(jitter, 2), + "distributed_mode": "ddp", + "loss_finite": bool(int(finite.item())), + }), flush=True) + dist.destroy_process_group() + +if __name__ == "__main__": + main() +''' + tmp = tempfile.NamedTemporaryFile("w", suffix="_training_ddp.py", delete=False) + tmp.write(script) + tmp.close() + + env = { + **os.environ, + "TRAIN_BATCH_SIZE": str(batch_size), + "TRAIN_SEQ_LENGTH": str(seq_length), + "TRAIN_NUM_STEPS": str(num_steps), + "TRAIN_WARMUP_STEPS": str(int(self.train_cfg.get("warmup_steps", 5))), + "TRAIN_DTYPE": dtype_str, + "NCCL_DEBUG": os.environ.get("NCCL_DEBUG", "WARN"), + } + cmd = [torchrun, f"--nproc_per_node={gpu_count}", tmp.name] + self.console.print(f" 
Running synthetic 1.5B DDP via torchrun ({gpu_count} processes)...") + try: + timeout = int(self.train_cfg.get("timeout_sec", max(600, num_steps * 180))) + r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout, env=env) + except subprocess.TimeoutExpired: + os.unlink(tmp.name) + return { + "model": "synthetic_transformer_1.5b", + "gpu_count": gpu_count, + "distributed_mode": "ddp", + "passed": False, + "error": "training_ddp_timeout", + "timestamp": datetime.now().isoformat(), + } + finally: + if os.path.exists(tmp.name): + try: + os.unlink(tmp.name) + except OSError: + pass + + marker = "TRAINING_DDP_JSON=" + payload = None + for line in (r.stdout + "\n" + r.stderr).splitlines(): + if marker in line: + payload = line.split(marker, 1)[1].strip() + if r.returncode != 0 or not payload: + return { + "model": "synthetic_transformer_1.5b", + "gpu_count": gpu_count, + "distributed_mode": "ddp", + "passed": False, + "error": (r.stderr or r.stdout or "training_ddp_failed")[-1000:], + "timestamp": datetime.now().isoformat(), + } + + result = json.loads(payload) + loss_value = float(result.get("final_loss", "nan")) + passed = self._acceptance_pass( + float(result.get("throughput_tokens_per_sec", 0)), + float(result.get("step_jitter_pct", 999)), + float(result.get("peak_memory_gb", 999)), + loss_value, + ) and bool(result.get("loss_finite", False)) and result.get("gpu_count") == gpu_count + result.update({ + "passed": passed, + "timestamp": datetime.now().isoformat(), + }) + return result + def _run_synthetic(self, gpu_count, batch_size, seq_length, num_steps, dtype) -> dict: self.console.print(" Running synthetic training benchmark...") @@ -170,11 +387,17 @@ class TrainingSim: h = layer(h) return self.head(h) - model = SyntheticTransformer().cuda() + model = SyntheticTransformer() total_params = sum(p.numel() for p in model.parameters()) self.console.print(f" Synthetic params: {total_params / 1e6:.1f}M") + distributed_mode = "single_gpu" + if gpu_count > 1: 
+ model = torch.nn.DataParallel(model).cuda() + distributed_mode = "data_parallel" + else: + model = model.cuda() model.train() optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) @@ -183,14 +406,17 @@ class TrainingSim: step_times = [] mem_usage = [] + warmup_steps = int(self.train_cfg.get("warmup_steps", 5)) + total_steps = num_steps + warmup_steps + with Progress( SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(), TextColumn("{task.completed}/{task.total}"), TimeElapsedColumn(), console=self.console, ) as progress: - task = progress.add_task("Synthetic training...", total=num_steps) + task = progress.add_task("Synthetic training...", total=total_steps) - for step in range(num_steps): + for step in range(total_steps): torch.cuda.synchronize() t0 = time.perf_counter() @@ -206,14 +432,22 @@ class TrainingSim: elapsed = time.perf_counter() - t0 step_times.append(elapsed) - mem_used = torch.cuda.max_memory_allocated() / 1024**3 + mem_used = max(torch.cuda.max_memory_allocated(i) for i in range(gpu_count)) / 1024**3 mem_usage.append(mem_used) - torch.cuda.reset_peak_memory_stats() + for i in range(gpu_count): + torch.cuda.reset_peak_memory_stats(i) progress.advance(task) - avg_step_time = sum(step_times) / len(step_times) + measured_steps = step_times[warmup_steps:] if len(step_times) > warmup_steps else step_times + avg_step_time = sum(measured_steps) / len(measured_steps) throughput = batch_size * seq_length / avg_step_time + jitter = self._jitter_pct(measured_steps) + peak_mem = round(max(mem_usage) if mem_usage else 0, 2) + final_loss = float(loss.item()) + passed = self._acceptance_pass(throughput, jitter, peak_mem, final_loss) + if self.train_cfg.get("require_distributed", True): + passed = False return { "model": "synthetic_transformer", @@ -225,14 +459,36 @@ class TrainingSim: "batch_size": batch_size, "seq_length": seq_length, "num_steps": num_steps, + "warmup_steps": warmup_steps, + "total_steps": total_steps, 
"avg_step_time_ms": round(avg_step_time * 1000, 1), "throughput_tokens_per_sec": round(throughput, 0), "throughput_samples_per_sec": round(batch_size / avg_step_time, 2), - "peak_memory_gb": round(max(mem_usage) if mem_usage else 0, 2), - "final_loss": round(loss.item(), 4), + "peak_memory_gb": peak_mem, + "final_loss": round(final_loss, 4), + "step_jitter_pct": round(jitter, 2), + "distributed_mode": distributed_mode, + "loss_finite": math.isfinite(final_loss), + "passed": passed, + "acceptance_gap": "8-GPU DDP was not used" if self.train_cfg.get("require_distributed", True) else "", "timestamp": datetime.now().isoformat(), } + @staticmethod + def _jitter_pct(step_times: list[float]) -> float: + if not step_times: + return 0.0 + mean = sum(step_times) / len(step_times) + return max(abs(v - mean) / mean * 100 for v in step_times) if mean else 0.0 + + def _acceptance_pass(self, throughput: float, jitter: float, peak_mem: float, loss_value: float) -> bool: + return ( + throughput >= float(self.train_cfg.get("min_tokens_per_sec", 45000)) + and jitter <= float(self.train_cfg.get("max_step_jitter_pct", 3)) + and peak_mem <= float(self.train_cfg.get("max_peak_memory_gb", 70)) + and math.isfinite(loss_value) + ) + @staticmethod def print_results(results: dict, console: Console = None): c = console or Console() @@ -254,11 +510,15 @@ class TrainingSim: ("Batch Size", str(results.get("batch_size", "N/A"))), ("Seq Length", str(results.get("seq_length", "N/A"))), ("Steps", str(results.get("num_steps", "N/A"))), + ("Warmup Steps", str(results.get("warmup_steps", "N/A"))), ("Avg Step Time", f"{results.get('avg_step_time_ms', 'N/A')} ms"), ("Throughput", f"{results.get('throughput_tokens_per_sec', 'N/A')} tokens/s"), ("Samples/sec", f"{results.get('throughput_samples_per_sec', 'N/A')}"), ("Peak Memory", f"{results.get('peak_memory_gb', 'N/A')} GB"), ("Final Loss", str(results.get("final_loss", "N/A"))), + ("Step Jitter", f"{results.get('step_jitter_pct', 'N/A')}%"), + 
("Distributed Mode", results.get("distributed_mode", "N/A")), + ("Verdict", "PASS" if results.get("passed") else "FAIL"), ] for label, val in metrics: table.add_row(label, str(val)) diff --git a/scripts/cublaslt_fp8_gemm_bench.cu b/scripts/cublaslt_fp8_gemm_bench.cu new file mode 100644 index 0000000..a401f36 --- /dev/null +++ b/scripts/cublaslt_fp8_gemm_bench.cu @@ -0,0 +1,291 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define CHECK_CUDA(call) \ + do { \ + cudaError_t status = (call); \ + if (status != cudaSuccess) { \ + std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \ + cudaGetErrorString(status)); \ + std::exit(1); \ + } \ + } while (0) + +#define CHECK_CUBLAS(call) \ + do { \ + cublasStatus_t status = (call); \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + std::fprintf(stderr, "cuBLASLt error %s:%d: status=%d\n", __FILE__, \ + __LINE__, static_cast(status)); \ + std::exit(1); \ + } \ + } while (0) + +__global__ void fill_fp8(__nv_fp8_e4m3 *ptr, size_t count, float value) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + for (size_t i = tid; i < count; i += stride) { + ptr[i] = __nv_fp8_e4m3(value); + } +} + +struct Args { + int matrix_size = 8192; + int warmup = 20; + int iterations = 200; + int first_gpu = 0; + int gpu_count = -1; + size_t workspace_mb = 256; + int fast_accum = 1; +}; + +static Args parse_args(int argc, char **argv) { + Args args; + for (int i = 1; i < argc; ++i) { + auto need = [&](const char *name) { + if (i + 1 >= argc) { + std::fprintf(stderr, "Missing value for %s\n", name); + std::exit(2); + } + return argv[++i]; + }; + if (!std::strcmp(argv[i], "--matrix-size")) { + args.matrix_size = std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--warmup")) { + args.warmup = std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--iterations")) { + args.iterations = 
std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--first-gpu")) { + args.first_gpu = std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--gpu-count")) { + args.gpu_count = std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--workspace-mb")) { + args.workspace_mb = static_cast(std::atoll(need(argv[i]))); + } else if (!std::strcmp(argv[i], "--fast-accum")) { + args.fast_accum = std::atoi(need(argv[i])); + } else if (!std::strcmp(argv[i], "--help") || !std::strcmp(argv[i], "-h")) { + std::puts("Usage: cublaslt_fp8_gemm_bench [--matrix-size N] [--warmup N] " + "[--iterations N] [--first-gpu N] [--gpu-count N] " + "[--workspace-mb N] [--fast-accum 0|1]"); + std::exit(0); + } else { + std::fprintf(stderr, "Unknown argument: %s\n", argv[i]); + std::exit(2); + } + } + return args; +} + +static double run_one_gpu(int gpu, const Args &args) { + CHECK_CUDA(cudaSetDevice(gpu)); + + const int64_t m = args.matrix_size; + const int64_t n = args.matrix_size; + const int64_t k = args.matrix_size; + const size_t a_elems = static_cast(m) * k; + const size_t b_elems = static_cast(k) * n; + const size_t d_elems = static_cast(m) * n; + + __nv_fp8_e4m3 *d_a = nullptr; + __nv_fp8_e4m3 *d_b = nullptr; + __nv_bfloat16 *d_d = nullptr; + void *workspace = nullptr; + float *d_scale_a = nullptr; + float *d_scale_b = nullptr; + const float scale = 1.0f; + const size_t workspace_bytes = args.workspace_mb * 1024ULL * 1024ULL; + + CHECK_CUDA(cudaMalloc(&d_a, a_elems * sizeof(__nv_fp8_e4m3))); + CHECK_CUDA(cudaMalloc(&d_b, b_elems * sizeof(__nv_fp8_e4m3))); + CHECK_CUDA(cudaMalloc(&d_d, d_elems * sizeof(__nv_bfloat16))); + CHECK_CUDA(cudaMalloc(&workspace, workspace_bytes)); + CHECK_CUDA(cudaMalloc(&d_scale_a, sizeof(float))); + CHECK_CUDA(cudaMalloc(&d_scale_b, sizeof(float))); + CHECK_CUDA(cudaMemcpy(d_scale_a, &scale, sizeof(scale), cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(d_scale_b, &scale, sizeof(scale), cudaMemcpyHostToDevice)); + + const int threads 
= 256; + const int blocks = 4096; + fill_fp8<<>>(d_a, a_elems, 0.01f); + fill_fp8<<>>(d_b, b_elems, 0.01f); + CHECK_CUDA(cudaMemset(d_d, 0, d_elems * sizeof(__nv_bfloat16))); + CHECK_CUDA(cudaGetLastError()); + CHECK_CUDA(cudaDeviceSynchronize()); + + cublasLtHandle_t lt; + cublasLtMatmulDesc_t op_desc; + cublasLtMatrixLayout_t a_desc, b_desc, d_desc; + cublasLtMatmulPreference_t preference; + CHECK_CUBLAS(cublasLtCreate(<)); + CHECK_CUBLAS(cublasLtMatmulDescCreate(&op_desc, CUBLAS_COMPUTE_32F, CUDA_R_32F)); + + // cuBLASLt FP8 kernels require TN format: A is transposed, B is non-transposed. + // With square GEMMs this keeps the benchmark FLOP count identical to the PDF + // acceptance shape while satisfying the library's FP8 kernel constraints. + cublasOperation_t transa = CUBLAS_OP_T; + cublasOperation_t transb = CUBLAS_OP_N; + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + op_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + op_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + op_desc, CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, &d_scale_a, + sizeof(d_scale_a))); + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + op_desc, CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, &d_scale_b, + sizeof(d_scale_b))); + int8_t fast_accum = args.fast_accum ? 
1 : 0; + CHECK_CUBLAS(cublasLtMatmulDescSetAttribute( + op_desc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fast_accum, + sizeof(fast_accum))); + + CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8F_E4M3, k, m, k)); + CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8F_E4M3, k, n, k)); + CHECK_CUBLAS(cublasLtMatrixLayoutCreate(&d_desc, CUDA_R_16BF, m, n, m)); + + CHECK_CUBLAS(cublasLtMatmulPreferenceCreate(&preference)); + CHECK_CUBLAS(cublasLtMatmulPreferenceSetAttribute( + preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_bytes, + sizeof(workspace_bytes))); + + cublasLtMatmulHeuristicResult_t heuristic; + int returned = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoGetHeuristic( + lt, op_desc, a_desc, b_desc, d_desc, d_desc, preference, 1, &heuristic, + &returned)); + if (returned == 0) { + std::fprintf(stderr, "No cuBLASLt heuristic returned for GPU %d\n", gpu); + std::exit(1); + } + + auto get_algo_attr_i32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) { + int32_t value = -1; + size_t written = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute( + &heuristic.algo, attr, &value, sizeof(value), &written)); + return static_cast(value); + }; + auto get_algo_attr_u32 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) { + uint32_t value = 0; + size_t written = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute( + &heuristic.algo, attr, &value, sizeof(value), &written)); + return static_cast(value); + }; + auto get_algo_attr_u16 = [&](cublasLtMatmulAlgoConfigAttributes_t attr) { + uint16_t value = 0; + size_t written = 0; + CHECK_CUBLAS(cublasLtMatmulAlgoConfigGetAttribute( + &heuristic.algo, attr, &value, sizeof(value), &written)); + return static_cast(value); + }; + const int algo_id = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_ID); + const int tile_id = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_TILE_ID); + const int splitk = get_algo_attr_i32(CUBLASLT_ALGO_CONFIG_SPLITK_NUM); + const int stages = get_algo_attr_u32(CUBLASLT_ALGO_CONFIG_STAGES_ID); + 
const int inner_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID); + const int cluster_shape = get_algo_attr_u16(CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID); + + const float alpha = 1.0f; + const float beta = 0.0f; + auto matmul = [&]() { + CHECK_CUBLAS(cublasLtMatmul(lt, op_desc, &alpha, d_a, a_desc, d_b, b_desc, + &beta, d_d, d_desc, d_d, d_desc, + &heuristic.algo, workspace, workspace_bytes, 0)); + }; + + for (int i = 0; i < args.warmup; ++i) { + matmul(); + } + CHECK_CUDA(cudaDeviceSynchronize()); + + cudaEvent_t start, stop; + CHECK_CUDA(cudaEventCreate(&start)); + CHECK_CUDA(cudaEventCreate(&stop)); + CHECK_CUDA(cudaEventRecord(start)); + for (int i = 0; i < args.iterations; ++i) { + matmul(); + } + CHECK_CUDA(cudaEventRecord(stop)); + CHECK_CUDA(cudaEventSynchronize(stop)); + float elapsed_ms = 0.0f; + CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start, stop)); + const double flops = + 2.0 * static_cast(m) * static_cast(n) * + static_cast(k) * static_cast(args.iterations); + const double tflops = flops / (static_cast(elapsed_ms) / 1000.0) / 1e12; + std::printf( + " {\"index\": %d, \"fp8_tflops\": %.1f, \"algo_id\": %d, " + "\"tile_id\": %d, \"splitk\": %d, \"stages_id\": %d, " + "\"inner_shape_id\": %d, \"cluster_shape_id\": %d}%s\n", + gpu, tflops, algo_id, tile_id, splitk, stages, inner_shape, cluster_shape, + (gpu + 1 == args.first_gpu + args.gpu_count) ? 
"" : ","); + std::fflush(stdout); + + CHECK_CUDA(cudaEventDestroy(start)); + CHECK_CUDA(cudaEventDestroy(stop)); + CHECK_CUBLAS(cublasLtMatmulPreferenceDestroy(preference)); + CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(a_desc)); + CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(b_desc)); + CHECK_CUBLAS(cublasLtMatrixLayoutDestroy(d_desc)); + CHECK_CUBLAS(cublasLtMatmulDescDestroy(op_desc)); + CHECK_CUBLAS(cublasLtDestroy(lt)); + CHECK_CUDA(cudaFree(d_a)); + CHECK_CUDA(cudaFree(d_b)); + CHECK_CUDA(cudaFree(d_d)); + CHECK_CUDA(cudaFree(workspace)); + CHECK_CUDA(cudaFree(d_scale_a)); + CHECK_CUDA(cudaFree(d_scale_b)); + CHECK_CUDA(cudaDeviceSynchronize()); + + return tflops; +} + +int main(int argc, char **argv) { + Args args = parse_args(argc, argv); + int device_count = 0; + CHECK_CUDA(cudaGetDeviceCount(&device_count)); + if (args.gpu_count < 0) { + args.gpu_count = device_count - args.first_gpu; + } + if (args.first_gpu < 0 || args.first_gpu + args.gpu_count > device_count) { + std::fprintf(stderr, "Invalid GPU range first=%d count=%d device_count=%d\n", + args.first_gpu, args.gpu_count, device_count); + return 2; + } + + std::vector values; + std::printf("{\n"); + std::printf(" \"source\": \"cuBLASLt\",\n"); + std::printf(" \"dtype\": \"fp8_e4m3_inputs_bf16_output_fp32_accum\",\n"); + std::printf(" \"matrix_size\": %d,\n", args.matrix_size); + std::printf(" \"warmup\": %d,\n", args.warmup); + std::printf(" \"iterations\": %d,\n", args.iterations); + std::printf(" \"fast_accum\": %d,\n", args.fast_accum ? 
1 : 0); + std::printf(" \"per_gpu\": [\n"); + for (int i = 0; i < args.gpu_count; ++i) { + int gpu = args.first_gpu + i; + double tflops = run_one_gpu(gpu, args); + values.push_back(tflops); + } + double mean = std::accumulate(values.begin(), values.end(), 0.0) / values.size(); + auto minmax = std::minmax_element(values.begin(), values.end()); + double spread = ((*minmax.second - *minmax.first) / mean) * 100.0; + std::printf(" ],\n"); + std::printf(" \"mean_tflops\": %.1f,\n", mean); + std::printf(" \"min_tflops\": %.1f,\n", *minmax.first); + std::printf(" \"max_tflops\": %.1f,\n", *minmax.second); + std::printf(" \"spread_pct\": %.2f\n", spread); + std::printf("}\n"); + return mean >= 1400.0 ? 0 : 1; +} diff --git a/scripts/multinode_nccl_deep_diagnose.sh b/scripts/multinode_nccl_deep_diagnose.sh new file mode 100755 index 0000000..b16409c --- /dev/null +++ b/scripts/multinode_nccl_deep_diagnose.sh @@ -0,0 +1,425 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Deep-diagnose multi-node NCCL behavior from the coordinator node. +# Default values match the current 2-node H100 cross-leaf investigation. 
# ---------------------------------------------------------------------------
# Configuration. Every setting can be overridden from the environment; the
# defaults are unchanged from the values previously hard-coded here.
# ---------------------------------------------------------------------------

# First positional argument selects what to run; defaults to "all".
MODE="${1:-all}"

MPI_BIN="${MPI_BIN:-/usr/mpi/gcc/openmpi-4.1.9a1/bin/mpirun}"
NCCL_TESTS_DIR="${NCCL_TESTS_DIR:-/data/nccl-tests-latest/build}"
HOSTS="${HOSTS:-172.72.8.12:8,172.72.8.16:8}"
PEER_HOST="${PEER_HOST:-172.72.8.16}"
SSH_USER="${SSH_USER:-root}"

# HCAS is the space-separated list iterated when reading sysfs counters.
# HCA_CSV is the comma-separated form exported as NCCL_IB_HCA. Its default is
# derived from HCAS so that overriding only HCAS cannot leave NCCL using one
# set of devices while the counters are read from another.
HCAS="${HCAS:-mlx5_0 mlx5_1 mlx5_6 mlx5_7}"
# Unquoted on purpose: split on whitespace exactly like `for hca in $HCAS`.
# shellcheck disable=SC2086
default_hca_csv="$(printf '%s,' $HCAS)"
default_hca_csv="${default_hca_csv%,}"
HCA_CSV="${HCA_CSV:-$default_hca_csv}"

OUT_DIR="${OUT_DIR:-/tmp/nccl_deep_diagnose_$(date +%Y%m%d_%H%M%S)}"

# Launch geometry and MPI control-plane interface. These were fixed at
# 16 ranks / 8 per node / bond0; keep them in step with HOSTS when that is
# overridden.
MPI_NP="${MPI_NP:-16}"
MPI_RANKS_PER_NODE="${MPI_RANKS_PER_NODE:-8}"
MPI_IFNAME="${MPI_IFNAME:-bond0}"

# Message-size range and iteration counts for the individual run modes.
BEGIN_SIZE="${BEGIN_SIZE:-16G}"
END_SIZE="${END_SIZE:-16G}"
WARMUP_ITERS="${WARMUP_ITERS:-10}"
ITERS="${ITERS:-10}"
GRAPH_WARMUP_ITERS="${GRAPH_WARMUP_ITERS:-1}"
GRAPH_ITERS="${GRAPH_ITERS:-1}"
SWEEP_WARMUP_ITERS="${SWEEP_WARMUP_ITERS:-3}"
SWEEP_ITERS="${SWEEP_ITERS:-5}"

NCCL_LD_LIBRARY_PATH="${NCCL_LD_LIBRARY_PATH:-/usr/mpi/gcc/openmpi-4.1.9a1/lib:/tmp/nccl-2.27.7-cuda12.4/usr/lib/x86_64-linux-gnu:/usr/local/cuda-12.4/targets/x86_64-linux/lib}"
# Captured once so set_common_env can restore the caller's NCCL_DEBUG after a
# case has overridden it.
DEFAULT_NCCL_DEBUG="${NCCL_DEBUG:-WARN}"

# Counter file names read from .../ports/1/counters and .../ports/1/hw_counters.
COUNTERS="${COUNTERS:-port_xmit_data port_rcv_data port_xmit_packets port_rcv_packets port_xmit_wait port_xmit_discards port_rcv_errors port_rcv_remote_physical_errors port_rcv_switch_relay_errors port_xmit_constraint_errors port_rcv_constraint_errors symbol_error link_error_recovery link_downed local_link_integrity_errors excessive_buffer_overrun_errors VL15_dropped}"
HW_COUNTERS="${HW_COUNTERS:-roce_adp_retrans roce_adp_retrans_to roce_slow_restart roce_slow_restart_cnps roce_slow_restart_trans packet_seq_err out_of_sequence out_of_buffer duplicate_request implied_nak_seq_err local_ack_timeout_err req_transport_retries_exceeded rnr_nak_retry_err rx_write_requests rx_read_requests}"

mkdir -p "$OUT_DIR"

# Common mpirun prefix shared by every NCCL test invocation.
mpi_base=(
  "$MPI_BIN"
  --allow-run-as-root
  --mca btl_openib_warn_no_device_params_found 0
  --mca btl_tcp_if_include "$MPI_IFNAME"
  --mca oob_tcp_if_include "$MPI_IFNAME"
  --mca plm_rsh_args "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes -o ConnectTimeout=10"
  -H "$HOSTS"
  --map-by "ppr:${MPI_RANKS_PER_NODE}:node"
  -np "$MPI_NP"
)
# Run one NCCL test binary under mpirun and print its bandwidth summary.
#
# Arguments:
#   $1  op      label prefixed to each reported bandwidth line
#   $2  bin     path of the nccl-tests executable to launch
#   $3  log     file receiving the combined stdout/stderr of the run
#   $4  warmup  value passed to the binary's -w option
#   $5  iters   value passed to the binary's -n option
#
# The message-size range comes from BEGIN_SIZE / END_SIZE. Environment
# variables are forwarded with the `-x NAME` pairs produced by mpi_xargs.
# Output: every "Avg bus bandwidth" line of the log, prefixed with $op.
run_nccl() {
  local op="$1"
  local bin="$2"
  local log="$3"
  local warmup="$4"
  local iters="$5"

  # Keep the forwarded-variable list local: the previous global array named
  # `xargs` leaked out of the function and shadowed the name of the xargs
  # utility for readers of the script.
  local -a env_flags=()
  mapfile -t env_flags < <(mpi_xargs)

  "${mpi_base[@]}" "${env_flags[@]}" \
    "$bin" -b "$BEGIN_SIZE" -e "$END_SIZE" -g 1 -f 2 -w "$warmup" -n "$iters" \
    >"$log" 2>&1

  awk -v op="$op" '/Avg bus bandwidth/ {print op, $0}' "$log"
}
f="/sys/class/infiniband/$hca/ports/1/counters/$c" + if [[ -r "$f" ]]; then + printf '%s %s counters %s %s\n' "$host_label" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)" >>"$out" + fi + done + for c in $HW_COUNTERS; do + local f="/sys/class/infiniband/$hca/ports/1/hw_counters/$c" + if [[ -r "$f" ]]; then + printf '%s %s hw_counters %s %s\n' "$host_label" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)" >>"$out" + fi + done + done +} + +read_remote_snapshot() { + local out="$1" + ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ + -o BatchMode=yes -o ConnectTimeout=5 "${SSH_USER}@${PEER_HOST}" \ + "HCAS='$HCAS' COUNTERS='$COUNTERS' HW_COUNTERS='$HW_COUNTERS' bash -s" <<'EOS' >"$out" +for hca in $HCAS; do + for c in $COUNTERS; do + f="/sys/class/infiniband/$hca/ports/1/counters/$c" + if [ -r "$f" ]; then + printf '%s %s counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)" + fi + done + for c in $HW_COUNTERS; do + f="/sys/class/infiniband/$hca/ports/1/hw_counters/$c" + if [ -r "$f" ]; then + printf '%s %s hw_counters %s %s\n' "$HOSTNAME" "$hca" "$c" "$(cat "$f" 2>/dev/null || echo 0)" + fi + done +done +EOS +} + +summarize_counter_delta() { + local before_a="$1" + local before_b="$2" + local after_a="$3" + local after_b="$4" + local out="$5" + python3 - "$before_a" "$before_b" "$after_a" "$after_b" >"$out" <<'PY' +import pathlib +import sys + +interesting = { + "port_xmit_wait", "port_xmit_discards", "port_rcv_errors", + "port_rcv_remote_physical_errors", "port_rcv_switch_relay_errors", + "port_xmit_constraint_errors", "port_rcv_constraint_errors", + "symbol_error", "link_error_recovery", "link_downed", + "local_link_integrity_errors", "excessive_buffer_overrun_errors", + "VL15_dropped", "roce_adp_retrans", "roce_adp_retrans_to", + "roce_slow_restart", "roce_slow_restart_cnps", "roce_slow_restart_trans", + "packet_seq_err", "out_of_sequence", "out_of_buffer", + "duplicate_request", "implied_nak_seq_err", 
"local_ack_timeout_err", + "req_transport_retries_exceeded", "rnr_nak_retry_err", +} + +def load(path): + data = {} + for line in pathlib.Path(path).read_text().splitlines(): + parts = line.split() + if len(parts) != 5: + continue + host, hca, kind, counter, value = parts + try: + data[(host, hca, kind, counter)] = int(value) + except ValueError: + pass + return data + +before = {} +after = {} +before.update(load(sys.argv[1])) +before.update(load(sys.argv[2])) +after.update(load(sys.argv[3])) +after.update(load(sys.argv[4])) + +print("NONZERO_DELTAS") +for key in sorted(set(before) | set(after)): + delta = after.get(key, 0) - before.get(key, 0) + if not delta: + continue + host, hca, kind, counter = key + if counter in {"port_xmit_data", "port_rcv_data"}: + gib = delta * 4 / (1024 ** 3) + print(f"{host} {hca} {kind} {counter} {delta} words4B {gib:.2f} GiB") + else: + print(f"{host} {hca} {kind} {counter} {delta}") + +print("ERROR_OR_CONGESTION_DELTAS") +seen = False +for key in sorted(set(before) | set(after)): + delta = after.get(key, 0) - before.get(key, 0) + if delta and key[3] in interesting: + seen = True + print(*key, delta) +if not seen: + print("none") +PY +} + +run_counter_case() { + local op="$1" + local bin="$2" + local extra="${3:-}" + set_common_env + if [[ -n "$extra" ]]; then + eval "export $extra" + fi + local dir="$OUT_DIR/${op}_counter" + mkdir -p "$dir" + read_one_snapshot "$(hostname)" "$dir/before.local" + read_remote_snapshot "$dir/before.remote" + run_nccl "$op" "$bin" "$dir/${op}.log" "$WARMUP_ITERS" "$ITERS" + read_one_snapshot "$(hostname)" "$dir/after.local" + read_remote_snapshot "$dir/after.remote" + summarize_counter_delta "$dir/before.local" "$dir/before.remote" "$dir/after.local" "$dir/after.remote" "$dir/counter_delta.txt" + echo "$dir" +} + +summarize_graph_log() { + local log="$1" + local out="$2" + python3 - "$log" >"$out" <<'PY' +from pathlib import Path +import collections +import re +import sys + +text = 
Path(sys.argv[1]).read_text(errors="ignore") +print("avg_busbw", (re.findall(r"Avg bus bandwidth\s*:\s*([0-9.]+)", text) or ["NA"])[-1]) +print("nccl_version", sorted(set(re.findall(r"NCCL version ([^\s]+)", text)))) +print("plugin_missing", len(re.findall(r"Could not find: none libnccl-net-none\.so", text))) +print("gdr_enabled_lines", len(re.findall(r"GPU Direct RDMA Enabled", text))) +print("using_hca") +for value, count in collections.Counter(re.findall(r"NET/IB : Using \[(.*?)\]; OOB", text)).most_common(4): + print(f" {count} {value}") +print("pattern_counts") +patterns = re.findall( + r"Pattern (\d+), crossNic (\d+), nChannels (\d+), bw ([0-9.]+)/([0-9.]+), type ([^,]+), sameChannels (\d+)", + text, +) +for key, count in collections.Counter(patterns).most_common(): + print(f" {count} {key}") +print("channel_summary") +for value, count in collections.Counter( + re.findall(r"(\d+ coll channels, \d+ collnet channels, \d+ nvls channels, \d+ p2p channels, \d+ p2p channels per peer)", text) +).most_common(): + print(f" {count} {value}") +print("p2p_chunks", collections.Counter(re.findall(r"P2P Chunksize set to (\d+)", text))) +print("check_p2p", collections.Counter(re.findall(r"Check P2P Type ([^\n]+)", text))) +for token in ["NET/IB/0/GDRDMA", "NET/IB/1/GDRDMA", "NET/IB/2/GDRDMA", "NET/IB/3/GDRDMA", "P2P/CUMEM", "P2P/IPC", "SHM"]: + print(token, text.count(token)) +print("channel_edge_lines", len([line for line in text.splitlines() if "Channel " in line and ("via NET/IB" in line or "via P2P" in line)])) +PY +} + +run_graph_case() { + local op="$1" + local bin="$2" + local extra="${3:-}" + set_common_env + export NCCL_DEBUG=INFO + export NCCL_DEBUG_SUBSYS=INIT,NET,GRAPH,TUNING,COLL + if [[ -n "$extra" ]]; then + eval "export $extra" + fi + local dir="$OUT_DIR/graph" + mkdir -p "$dir" + local log="$dir/${op}.log" + run_nccl "$op" "$bin" "$log" "$GRAPH_WARMUP_ITERS" "$GRAPH_ITERS" + summarize_graph_log "$log" "$dir/${op}_summary.txt" + echo "$dir/${op}_summary.txt" 
# Sweep a fixed set of NCCL tuning variants for alltoall with
# NCCL_PXN_DISABLE=1.
#
# Each case is written as "name|assignments", where assignments is a
# space-separated list of NAME=value pairs (empty for the baseline). For every
# case the common environment is reset, the overrides are exported, and
# alltoall_perf is run with the SWEEP_* iteration counts. Per-case logs and an
# aggregated summary.txt are written under "$OUT_DIR/pxn_sweep"; the path of
# the summary is printed at the end.
run_pxn_sweep() {
  local sweep_dir="$OUT_DIR/pxn_sweep"
  local summary="$sweep_dir/summary.txt"
  mkdir -p "$sweep_dir"

  local cases=(
    "baseline|"
    "nvls_off|NCCL_NVLS_ENABLE=0"
    "qps4_split1|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=1"
    "qps8_split1|NCCL_IB_QPS_PER_CONNECTION=8 NCCL_IB_SPLIT_DATA_ON_QPS=1"
    "qps4_split0|NCCL_IB_QPS_PER_CONNECTION=4 NCCL_IB_SPLIT_DATA_ON_QPS=0"
    "channels16|NCCL_MIN_NCHANNELS=16 NCCL_MAX_NCHANNELS=16"
    "buff8m|NCCL_BUFFSIZE=8388608"
    "p2pchunk4m|NCCL_P2P_NET_CHUNKSIZE=4194304"
    "netpeer8|NCCL_NCHANNELS_PER_NET_PEER=8"
    "ar0|NCCL_IB_AR_THRESHOLD=0"
  )

  # Start from an empty summary; every case appends to it.
  : >"$summary"

  local case_name case_env case_log
  for item in "${cases[@]}"; do
    # Split on the first '|': the label goes left, the assignments go right.
    IFS='|' read -r case_name case_env <<<"$item"

    set_common_env
    export NCCL_PXN_DISABLE=1
    [[ -z "$case_env" ]] || eval "export $case_env"

    case_log="$sweep_dir/${case_name}.log"
    {
      echo "===== CASE $case_name ====="
      echo "extra: ${case_env:-none}"
      run_nccl "alltoall" "$NCCL_TESTS_DIR/alltoall_perf" "$case_log" "$SWEEP_WARMUP_ITERS" "$SWEEP_ITERS"
      awk '/Avg bus bandwidth/ {print}' "$case_log" | tail -1
    } | tee -a "$summary"
  done

  echo "$summary"
}
BatchMode=yes -o ConnectTimeout=5 "${SSH_USER}@${PEER_HOST}" \ + "MPI_BIN='$MPI_BIN' NCCL_TESTS_DIR='$NCCL_TESTS_DIR' HCAS='$HCAS' bash -s" <<'EOS' +echo "hostname: $(hostname)" +echo "mpirun: $MPI_BIN" +if [ -x "$MPI_BIN" ]; then + "$MPI_BIN" --version 2>&1 | sed -n '1p' +else + echo "MISSING executable: $MPI_BIN" +fi +for bin in "$NCCL_TESTS_DIR/all_reduce_perf" "$NCCL_TESTS_DIR/alltoall_perf"; do + if [ -x "$bin" ]; then + echo "OK executable: $bin" + else + echo "MISSING executable: $bin" + fi +done +for hca in $HCAS; do + state="/sys/class/infiniband/$hca/ports/1/state" + rate="/sys/class/infiniband/$hca/ports/1/rate" + if [ -r "$state" ]; then + echo "OK HCA: $hca state=$(cat "$state") rate=$(cat "$rate" 2>/dev/null || echo unknown)" + else + echo "MISSING HCA path: $hca" + fi +done +EOS + } | tee "$out" + echo "$out" +} + +usage() { + cat </dev/null 2>&1 +} + +emit_cmd() { + local title="$1" + shift + { + echo + echo "### $title" + echo + echo '```text' + "$@" 2>&1 || true + echo '```' + } >>"$OUT_FILE" +} + +active_400g_hcas=() +non_400g_rows=() + +{ + echo "# NCCL/RDMA 环境快照" + echo + echo "- Host: \`$HOST\`" + echo "- Time: \`$(date -Is 2>/dev/null || date)\`" + echo "- Kernel: \`$(uname -r 2>/dev/null || echo unknown)\`" + echo + echo "## HCA / Port 状态" + echo + echo "| HCA | Port | State | Phys State | Rate | Link Layer | 400G IB Rail |" + echo "|---|---:|---|---|---:|---|---|" +} >"$OUT_FILE" + +hca_paths=(/sys/class/infiniband/mlx5_*) +if ((${#hca_paths[@]})); then + for hca_path in "${hca_paths[@]}"; do + hca="$(basename "$hca_path")" + for port_path in "$hca_path"/ports/*; do + [[ -d "$port_path" ]] || continue + port="$(basename "$port_path")" + state="$(cat "$port_path/state" 2>/dev/null || echo NA)" + phys_state="$(cat "$port_path/phys_state" 2>/dev/null || echo NA)" + rate="$(cat "$port_path/rate" 2>/dev/null || echo NA)" + layer="$(cat "$port_path/link_layer" 2>/dev/null || echo NA)" + is_400g="NO" + if [[ "$state" == *"ACTIVE"* && "$rate" == 
400\ Gb/sec* && "$layer" == "InfiniBand" ]]; then + is_400g="YES" + active_400g_hcas+=("$hca") + else + non_400g_rows+=("$hca port=$port state=$state rate=$rate layer=$layer") + fi + printf '| `%s` | `%s` | `%s` | `%s` | `%s` | `%s` | `%s` |\n' \ + "$hca" "$port" "$state" "$phys_state" "$rate" "$layer" "$is_400g" >>"$OUT_FILE" + done + done +else + printf '| N/A | N/A | `%s` | N/A | N/A | N/A | NO |\n' "/sys/class/infiniband/mlx5_* not found" >>"$OUT_FILE" +fi + +{ + echo + echo "## Rail 摘要" + echo + if ((${#active_400g_hcas[@]})); then + hca_csv="$(IFS=,; echo "${active_400g_hcas[*]}")" + echo "- Active 400G IB rail count: \`${#active_400g_hcas[@]}\`" + echo "- Candidate \`NCCL_IB_HCA\`: \`$hca_csv\`" + echo "- Theoretical one-way raw bandwidth: \`${#active_400g_hcas[@]} * 400Gb/s / 8 = $((${#active_400g_hcas[@]} * 50)) GB/s\`" + else + echo "- Active 400G IB rail count: \`0\`" + echo "- Candidate \`NCCL_IB_HCA\`: \`N/A\`" + fi + echo + echo "Non-400G / non-IB / down ports:" + echo + if ((${#non_400g_rows[@]})); then + for row in "${non_400g_rows[@]}"; do + echo "- \`$row\`" + done + else + echo "- none" + fi + echo + echo "## PDF 目标换算" + echo + echo "- PDF allreduce busbw target: \`${PDF_ALLREDUCE_BUSBW} GB/s\`" + echo "- PDF alltoall busbw target: \`${PDF_ALLTOALL_BUSBW} GB/s\`" +} >>"$OUT_FILE" + +python3 - "$PDF_ALLREDUCE_BUSBW" "${#active_400g_hcas[@]}" >>"$OUT_FILE" <<'PY' || true +import sys + +busbw = float(sys.argv[1]) +rail_count = int(sys.argv[2]) +algbw = busbw / 1.875 +raw = rail_count * 50.0 +print(f"- 16-rank allreduce implied algbw: `{algbw:.2f} GB/s`") +if rail_count: + pct = algbw / raw * 100 + print(f"- Implied algbw / current raw 400G rail bandwidth: `{pct:.1f}%`") + if algbw > raw: + print("- Interpretation: PDF allreduce target is above current 400G rail one-way raw bandwidth.") + else: + print("- Interpretation: PDF allreduce target is within current 400G rail one-way raw bandwidth.") +else: + print("- Interpretation: no active 400G IB rail 
was detected.") +PY + +{ + echo + echo "## NCCL Net Plugin / SHARP 文件" + echo + echo '```text' +} >>"$OUT_FILE" + +read -r -a plugin_roots <<<"$PLUGIN_SEARCH_ROOTS" +find "${plugin_roots[@]}" \( -name 'libnccl-net*.so*' -o -name 'libsharp*.so*' \) \ + 2>/dev/null | sort >>"$OUT_FILE" || true + +if ! grep -q 'libnccl-net\|libsharp' "$OUT_FILE"; then + echo "none found under $PLUGIN_SEARCH_ROOTS" >>"$OUT_FILE" +fi + +echo '```' >>"$OUT_FILE" + +if have_cmd dpkg; then + emit_cmd "Relevant Debian packages" bash -lc "dpkg -l | egrep -i 'nccl|sharp|hcoll|ucx|ofed|mlnx' | sed -n '1,160p'" +else + emit_cmd "Relevant packages" bash -lc "echo 'dpkg not found'" +fi + +if have_cmd nvidia-smi; then + emit_cmd "nvidia-smi topo -m" nvidia-smi topo -m +else + emit_cmd "nvidia-smi topo -m" bash -lc "echo 'nvidia-smi not found'" +fi + +if have_cmd ibstat; then + emit_cmd "ibstat" ibstat +fi + +{ + echo + echo "## 建议判断" + echo + echo "1. 如果 Active 400G IB rail 少于 PDF 参考环境,不能直接按 PDF 阈值判断等价。" + echo "2. 如果没有 \`libnccl-net*.so*\` / \`libsharp*.so*\`,NCCL 可能只能走 internal IB plugin。" + echo "3. 若要追 PDF 2x8 目标,请先确认 rail 数量、SHARP/NCCL net plugin、跨 Leaf 交换策略是否与 PDF 环境一致。" + echo + echo "Snapshot written to: \`$OUT_FILE\`" +} >>"$OUT_FILE" + +echo "$OUT_FILE" diff --git a/scripts/pytorch_fp8_path_bench.py b/scripts/pytorch_fp8_path_bench.py new file mode 100755 index 0000000..ab35af8 --- /dev/null +++ b/scripts/pytorch_fp8_path_bench.py @@ -0,0 +1,277 @@ +#!/usr/bin/env python3 +"""Compare FP8 GEMM paths used for H100/H200 acceptance debugging. + +Paths: + A. torch._scaled_mm eager, default accumulation + B. torch._scaled_mm eager, use_fast_accum=True + C. CUDA Graph replay of torch._scaled_mm(out=..., use_fast_accum=True) + D. 
from __future__ import annotations

import argparse
import json
import statistics
import sys
import time
from typing import Any, Callable, Sequence

import torch


def _failure_result(name: str, status: str, exc: BaseException) -> dict[str, Any]:
    """Build the result entry recorded for a path that could not be measured."""
    return {
        "name": name,
        "status": status,
        "reason": f"{type(exc).__name__}: {exc}",
    }


def _int_at_least(minimum: int) -> Callable[[str], int]:
    """Return an argparse ``type=`` callable accepting integers >= *minimum*."""

    def parse(text: str) -> int:
        value = int(text)  # ValueError is reported by argparse as an invalid value
        if value < minimum:
            raise argparse.ArgumentTypeError(
                f"must be an integer >= {minimum}, got {value}"
            )
        return value

    # argparse shows the callable's __name__ in "invalid ... value" messages.
    parse.__name__ = "int"
    return parse


def tflops_from_ms(matrix_size: int, iterations: int, elapsed_ms: float) -> float:
    """Convert a timed batch of square GEMMs into TFLOPS.

    Each GEMM is counted as ``2 * matrix_size**3`` floating-point operations.
    A non-positive ``elapsed_ms`` means nothing measurable was timed; ``0.0``
    is returned in that case instead of raising ``ZeroDivisionError``.
    """
    if elapsed_ms <= 0:
        return 0.0
    flops = 2.0 * matrix_size * matrix_size * matrix_size * iterations
    return flops / (elapsed_ms / 1000.0) / 1e12


def cuda_event_bench(
    name: str,
    matrix_size: int,
    iterations: int,
    warmup: int,
    func: Callable[[int], Any],
) -> dict[str, Any]:
    """Time ``iterations`` calls of ``func`` with CUDA events.

    ``func`` receives the loop index. ``warmup`` untimed calls run first and
    the device is synchronized before timing starts. Both the CUDA-event time
    and the host wall-clock time around the same loop are reported.

    Raises:
        ValueError: if ``iterations`` is smaller than 1 (per-iteration numbers
            would be undefined).
    """
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    for i in range(warmup):
        func(i)
    torch.cuda.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    wall_start = time.perf_counter()
    start.record()
    for i in range(iterations):
        func(i)
    end.record()
    # elapsed_time() is only valid once the end event has completed.
    torch.cuda.synchronize()
    wall_elapsed = time.perf_counter() - wall_start
    elapsed_ms = start.elapsed_time(end)
    return {
        "name": name,
        "status": "ok",
        "matrix_size": matrix_size,
        "iterations": iterations,
        "warmup": warmup,
        "event_ms_total": round(elapsed_ms, 3),
        "event_us_per_iter": round(elapsed_ms * 1000.0 / iterations, 3),
        "wall_ms_total": round(wall_elapsed * 1000.0, 3),
        "tflops": round(tflops_from_ms(matrix_size, iterations, elapsed_ms), 1),
    }


def make_fp8_inputs(matrix_size: int, pools: int, device: str) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
    """Create ``pools`` pairs of random square ``float8_e4m3fn`` matrices.

    Values are drawn as float32 on ``device`` and then cast to FP8. Returns the
    list of A operands and the list of B operands, each of length ``pools``.
    """

    def random_fp8() -> torch.Tensor:
        return torch.randn(
            matrix_size, matrix_size, device=device, dtype=torch.float32
        ).to(torch.float8_e4m3fn)

    a = [random_fp8() for _ in range(pools)]
    b = [random_fp8() for _ in range(pools)]
    torch.cuda.synchronize()
    return a, b


def bench_scaled_mm(args: argparse.Namespace) -> list[dict[str, Any]]:
    """Benchmark paths A, B and C, all built on ``torch._scaled_mm``.

    A and B are eager calls cycling through ``args.pools`` operand pairs, with
    default accumulation and ``use_fast_accum=True`` respectively. C replays a
    CUDA graph capturing one fast-accum call on the first operand pair; if the
    capture or replay fails, C is recorded as ``"unavailable"`` and A/B are
    still returned.
    """
    device = f"cuda:{args.gpu_index}"
    torch.cuda.set_device(args.gpu_index)
    scale_a = torch.tensor(1.0, device=device)
    scale_b = torch.tensor(1.0, device=device)
    pools_a, pools_b = make_fp8_inputs(args.matrix_size, args.pools, device)
    results: list[dict[str, Any]] = []

    # The timed closures call torch._scaled_mm directly (no shared wrapper) so
    # no extra Python frames are added to the eager path being measured.
    def eager_default(i: int) -> torch.Tensor:
        idx = i % args.pools
        return torch._scaled_mm(
            pools_a[idx],
            pools_b[idx].T,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
        )

    def eager_fast(i: int) -> torch.Tensor:
        idx = i % args.pools
        return torch._scaled_mm(
            pools_a[idx],
            pools_b[idx].T,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
            use_fast_accum=True,
        )

    eager_paths = (
        ("A_eager_scaled_mm_default", eager_default),
        ("B_eager_scaled_mm_fast_accum", eager_fast),
    )
    for path_name, path_func in eager_paths:
        results.append(
            cuda_event_bench(
                path_name,
                args.matrix_size,
                args.iterations,
                args.warmup,
                path_func,
            )
        )

    graph_name = "C_cuda_graph_scaled_mm_fast_accum"
    graph_out = torch.empty(
        (args.matrix_size, args.matrix_size),
        device=device,
        dtype=torch.bfloat16,
    )
    static_a = pools_a[0]
    static_b_t = pools_b[0].T

    def graph_body() -> None:
        # Same call for the side-stream warmup and for the capture, so the
        # captured kernel is the one that was warmed up.
        torch._scaled_mm(
            static_a,
            static_b_t,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
            use_fast_accum=True,
            out=graph_out,
        )

    try:
        # Warm up on a side stream before capturing the graph.
        side_stream = torch.cuda.Stream()
        side_stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(side_stream):
            for _ in range(max(3, args.warmup // 2)):
                graph_body()
        torch.cuda.current_stream().wait_stream(side_stream)
        torch.cuda.synchronize()

        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph):
            graph_body()

        def graph_replay(_: int) -> None:
            graph.replay()

        results.append(
            cuda_event_bench(
                graph_name,
                args.matrix_size,
                args.iterations,
                3,
                graph_replay,
            )
        )
    except Exception as exc:  # noqa: BLE001
        results.append(_failure_result(graph_name, "unavailable", exc))

    return results


def bench_transformer_engine(args: argparse.Namespace) -> dict[str, Any]:
    """Benchmark path D: a Transformer Engine ``Linear`` under ``fp8_autocast``.

    Returns an ``"unavailable"`` entry when Transformer Engine cannot be
    imported, and an ``"error"`` entry when building the layer/input or running
    the benchmark fails, so that results from the other paths are preserved.
    """
    name = "D_transformer_engine_fp8_linear"
    try:
        import transformer_engine.pytorch as te  # type: ignore[import-not-found]
        from transformer_engine.common.recipe import DelayedScaling, Format  # type: ignore[import-not-found]
    except Exception as exc:  # noqa: BLE001
        return _failure_result(name, "unavailable", exc)

    try:
        # Setup is inside the try: a failure while constructing the layer must
        # not abort the whole script after paths A-C were already measured.
        device = f"cuda:{args.gpu_index}"
        x = torch.randn(args.matrix_size, args.matrix_size, device=device, dtype=torch.bfloat16)
        layer = te.Linear(
            args.matrix_size,
            args.matrix_size,
            bias=False,
            params_dtype=torch.bfloat16,
            device=device,
        )
        recipe = DelayedScaling(fp8_format=Format.HYBRID)

        def run(_: int) -> torch.Tensor:
            with te.fp8_autocast(enabled=True, fp8_recipe=recipe):
                return layer(x)

        result = cuda_event_bench(
            name,
            args.matrix_size,
            args.iterations,
            args.warmup,
            run,
        )
    except Exception as exc:  # noqa: BLE001
        return _failure_result(name, "error", exc)
    result["note"] = "Transformer Engine Linear forward under fp8_autocast; includes TE module/cast overhead."
    return result


def main(argv: Sequence[str] | None = None) -> int:
    """Run all FP8 paths and print one JSON document to stdout.

    Args:
        argv: command-line arguments without the program name; ``None`` (the
            default) makes argparse read ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when CUDA / FP8 support / the requested GPU is
        missing or a CUDA out-of-memory error occurs. Invalid option values
        make argparse exit with status 2.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--matrix-size", type=_int_at_least(1), default=8192)
    parser.add_argument("--warmup", type=_int_at_least(0), default=20)
    parser.add_argument("--iterations", type=_int_at_least(1), default=100)
    parser.add_argument("--gpu-index", type=_int_at_least(0), default=0)
    parser.add_argument("--pools", type=_int_at_least(1), default=4)
    args = parser.parse_args(argv)

    if not torch.cuda.is_available():
        print(json.dumps({"error": "cuda unavailable"}, indent=2))
        return 1
    if not hasattr(torch, "_scaled_mm") or not hasattr(torch, "float8_e4m3fn"):
        print(json.dumps({"error": "torch FP8 _scaled_mm unavailable"}, indent=2))
        return 1
    device_count = torch.cuda.device_count()
    if args.gpu_index >= device_count:
        message = f"gpu index {args.gpu_index} out of range (device count {device_count})"
        print(json.dumps({"error": message}, indent=2))
        return 1

    torch.cuda.set_device(args.gpu_index)
    props = torch.cuda.get_device_properties(args.gpu_index)
    payload = {
        "source": "pytorch_fp8_path_bench",
        "torch": torch.__version__,
        "cuda": torch.version.cuda,
        "gpu_index": args.gpu_index,
        "gpu_name": props.name,
        "matrix_size": args.matrix_size,
        "warmup": args.warmup,
        "iterations": args.iterations,
        "pools": args.pools,
        "results": [],
    }
    try:
        payload["results"].extend(bench_scaled_mm(args))
        payload["results"].append(bench_transformer_engine(args))
    except torch.cuda.OutOfMemoryError as exc:
        payload["error"] = f"CUDA OOM: {exc}"
        print(json.dumps(payload, indent=2))
        return 1

    ok_values = [r["tflops"] for r in payload["results"] if r.get("status") == "ok"]
    if ok_values:
        payload["summary"] = {
            "max_tflops": round(max(ok_values), 1),
            "min_tflops": round(min(ok_values), 1),
            "mean_tflops": round(statistics.mean(ok_values), 1),
        }
    print(json.dumps(payload, indent=2))
    return 0


if __name__ == "__main__":
    sys.exit(main())
#!/usr/bin/env bash
# Build scripts/cublaslt_fp8_gemm_bench.cu with nvcc, run it, and tee its
# stdout into a timestamped report file under OUT_DIR.
#
# Environment overrides: CUDA_HOME, NVCC, CUDA_ARCH, OUT_DIR, MATRIX_SIZE,
# WARMUP, ITERATIONS, GPU_COUNT, FIRST_GPU, WORKSPACE_MB.
#
# Exit status: 1 when nvcc is missing, the output directories cannot be
# created, or the build fails; otherwise the benchmark binary's own status.
set -uo pipefail

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
# Target architecture passed to nvcc; the default keeps the previous sm_90.
CUDA_ARCH="${CUDA_ARCH:-sm_90}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
WARMUP="${WARMUP:-20}"
ITERATIONS="${ITERATIONS:-200}"
GPU_COUNT="${GPU_COUNT:-8}"
FIRST_GPU="${FIRST_GPU:-0}"
WORKSPACE_MB="${WORKSPACE_MB:-256}"

if [[ ! -x "$NVCC" ]]; then
  echo "nvcc not found: $NVCC" >&2
  exit 1
fi

if ! mkdir -p "$OUT_DIR" "$PROJECT_DIR/build"; then
  echo "Cannot create $OUT_DIR or $PROJECT_DIR/build" >&2
  exit 1
fi

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
BIN="$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"
REPORT="$OUT_DIR/cublaslt_fp8_gemm_${HOST}_${TS}.json"

# errexit is not enabled in this script, so the build status must be checked
# explicitly; otherwise a stale binary from an earlier build would be run.
if ! "$NVCC" -O3 -std=c++17 "-arch=$CUDA_ARCH" \
  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
  -lcublasLt -lcublas -o "$BIN"; then
  echo "nvcc failed to build $BIN" >&2
  exit 1
fi

"$BIN" \
  --matrix-size "$MATRIX_SIZE" \
  --warmup "$WARMUP" \
  --iterations "$ITERATIONS" \
  --first-gpu "$FIRST_GPU" \
  --gpu-count "$GPU_COUNT" \
  --workspace-mb "$WORKSPACE_MB" \
  | tee "$REPORT"
# Report the benchmark's status, not tee's.
status=${PIPESTATUS[0]}

echo "Report written to: $REPORT"
exit "$status"
#!/usr/bin/env bash
# Compare FP8 GEMM paths on a single GPU:
#   1. scripts/pytorch_fp8_path_bench.py (PyTorch paths),
#   2. scripts/cublaslt_fp8_gemm_bench.cu built with nvcc and run with
#      --fast-accum 1 (direct cuBLASLt path),
# then merge both JSON reports into one combined report whose path is printed
# as the last line of output.
#
# Environment overrides: PYTHON (or PYTHON_BIN), CUDA_HOME, NVCC, CUDA_ARCH,
# OUT_DIR, MATRIX_SIZE, WARMUP, ITERATIONS, GPU_INDEX, WORKSPACE_MB.
set -euo pipefail

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

# PYTHON is this script's original knob; PYTHON_BIN is accepted as a fallback
# so one exported variable works across the wrapper scripts that use that name.
PYTHON="${PYTHON:-${PYTHON_BIN:-/root/gpu-test-venv/bin/python}}"
CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.4}"
NVCC="${NVCC:-$CUDA_HOME/bin/nvcc}"
# Target architecture passed to nvcc; the default keeps the previous sm_90.
CUDA_ARCH="${CUDA_ARCH:-sm_90}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
MATRIX_SIZE="${MATRIX_SIZE:-8192}"
WARMUP="${WARMUP:-20}"
ITERATIONS="${ITERATIONS:-100}"
GPU_INDEX="${GPU_INDEX:-0}"
WORKSPACE_MB="${WORKSPACE_MB:-256}"

# Fail with a clear message before any benchmark time is spent. `command -v`
# accepts both absolute paths and bare command names found on PATH.
for tool in "$PYTHON" "$NVCC"; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Required executable not found: $tool" >&2
    exit 1
  fi
done

VENV_SITE_PACKAGES="$("$PYTHON" - <<'PY'
import site
print(site.getsitepackages()[0])
PY
)"
export LD_LIBRARY_PATH="$VENV_SITE_PACKAGES/nvidia/cudnn/lib:$VENV_SITE_PACKAGES/nvidia/nccl/lib:${LD_LIBRARY_PATH:-}"

mkdir -p "$PROJECT_DIR/build" "$OUT_DIR"

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"
PY_REPORT="$OUT_DIR/fp8_paths_pytorch_${HOST}_${TS}.json"
CUBLAS_REPORT="$OUT_DIR/fp8_paths_cublaslt_${HOST}_${TS}.json"
COMBINED_REPORT="$OUT_DIR/fp8_paths_combined_${HOST}_${TS}.json"
CUBLAS_BIN="$PROJECT_DIR/build/cublaslt_fp8_gemm_bench"

# With errexit + pipefail a failing benchmark aborts the script here, before
# the merge step tries to parse an incomplete report.
"$PYTHON" "$PROJECT_DIR/scripts/pytorch_fp8_path_bench.py" \
  --matrix-size "$MATRIX_SIZE" \
  --warmup "$WARMUP" \
  --iterations "$ITERATIONS" \
  --gpu-index "$GPU_INDEX" | tee "$PY_REPORT"

"$NVCC" -O3 -std=c++17 "-arch=$CUDA_ARCH" \
  "$PROJECT_DIR/scripts/cublaslt_fp8_gemm_bench.cu" \
  -lcublasLt -lcublas -o "$CUBLAS_BIN"

"$CUBLAS_BIN" \
  --matrix-size "$MATRIX_SIZE" \
  --warmup "$WARMUP" \
  --iterations "$ITERATIONS" \
  --first-gpu "$GPU_INDEX" \
  --gpu-count 1 \
  --workspace-mb "$WORKSPACE_MB" \
  --fast-accum 1 | tee "$CUBLAS_REPORT"

"$PYTHON" - "$PY_REPORT" "$CUBLAS_REPORT" "$COMBINED_REPORT" <<'PY'
import json
import pathlib
import sys

py_report = pathlib.Path(sys.argv[1])
cublas_report = pathlib.Path(sys.argv[2])
combined_report = pathlib.Path(sys.argv[3])

with py_report.open() as f:
    py_payload = json.load(f)
with cublas_report.open() as f:
    cublas_payload = json.load(f)

combined = {
    "source": "fp8_path_comparison",
    "host": cublas_payload.get("host"),
    "matrix_size": py_payload.get("matrix_size"),
    "gpu_index": py_payload.get("gpu_index"),
    "pytorch": py_payload,
    "cublaslt": cublas_payload,
    "results": [],
}
combined["results"].extend(py_payload.get("results", []))
per_gpu = cublas_payload.get("per_gpu", [])
if per_gpu:
    # Only one GPU was requested (--gpu-count 1), so the first row is used.
    row = dict(per_gpu[0])
    # A row without "fp8_tflops" is recorded as an error instead of raising
    # KeyError and losing the combined report.
    tflops = row.pop("fp8_tflops", None)
    row.update({
        "name": "E_direct_cublaslt_fast_accum",
        "status": "ok" if tflops is not None else "error",
        "tflops": tflops,
        "matrix_size": cublas_payload.get("matrix_size"),
        "iterations": cublas_payload.get("iterations"),
        "warmup": cublas_payload.get("warmup"),
        "fast_accum": cublas_payload.get("fast_accum"),
        "note": "Direct cuBLASLt FP8 GEMM, bypasses PyTorch eager.",
    })
    combined["results"].append(row)

combined_report.write_text(json.dumps(combined, indent=2), encoding="utf-8")
print(f"Combined report written to: {combined_report}")
PY

echo "$COMBINED_REPORT"
#!/usr/bin/env bash
set -uo pipefail

# Run the single-node H100 acceptance suite and keep the raw report paths stable.
# The suite itself still lives in gpu_tester.py; this wrapper only standardizes
# snapshot/report naming for repeated machine-level runs.
#
# Exit status: 2 for usage errors, 1 when no Python interpreter or config file
# is found, otherwise the exit status of gpu_tester.py. A failing environment
# snapshot is reported as a warning and does not stop the test run.

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/default.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"
DRY_RUN=0
SNAPSHOT=1

usage() {
  cat <<'EOF'
Usage: run_h100_single_node_all.sh [options]

Options:
  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
  --config PATH       gpu_tester config file (default: configs/default.yaml)
  --out-dir PATH      Report output directory (default: reports)
  --format FORMAT     Report format: md, json, or html (default: md)
  --no-snapshot       Do not run nccl_environment_snapshot.sh first
  --dry-run           Print commands without running them
  -h, --help          Show this help
EOF
}

# require_value OPTION REMAINING_ARG_COUNT
# Exit with a usage error when a value-taking option is the last argument,
# instead of dying on an "unbound variable" error under `set -u`.
require_value() {
  if (($2 < 2)); then
    echo "Option $1 requires a value" >&2
    usage >&2
    exit 2
  fi
}

while (($#)); do
  case "$1" in
    --python)
      require_value "$1" "$#"
      PYTHON_BIN="$2"
      shift 2
      ;;
    --config)
      require_value "$1" "$#"
      CONFIG_FILE="$2"
      shift 2
      ;;
    --out-dir)
      require_value "$1" "$#"
      OUT_DIR="$2"
      shift 2
      ;;
    --format)
      require_value "$1" "$#"
      FORMAT="$2"
      shift 2
      ;;
    --no-snapshot)
      SNAPSHOT=0
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

case "$FORMAT" in
  md|json|html) ;;
  *)
    echo "Unsupported format: $FORMAT" >&2
    exit 2
    ;;
esac

# Fall back to whatever python3 is on PATH when the configured one is missing.
if [[ ! -x "$PYTHON_BIN" ]]; then
  PYTHON_BIN="$(command -v python3 || true)"
fi

if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi

HOST="$(hostname 2>/dev/null || echo unknown)"
TS="$(date +%Y%m%d_%H%M%S)"

SNAPSHOT_FILE="$OUT_DIR/nccl_environment_snapshot_${HOST}_${TS}.md"
REPORT_FILE="$OUT_DIR/h100_single_node_all_${HOST}_${TS}.${FORMAT}"

snapshot_cmd=(bash "$PROJECT_DIR/scripts/nccl_environment_snapshot.sh" "$SNAPSHOT_FILE")
test_cmd=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test all
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)

echo "Project:  $PROJECT_DIR"
echo "Host:     $HOST"
echo "Config:   $CONFIG_FILE"
echo "Report:   $REPORT_FILE"
if ((SNAPSHOT)); then
  echo "Snapshot: $SNAPSHOT_FILE"
fi

# A dry run only prints the commands; it creates no directories or files.
if ((DRY_RUN)); then
  if ((SNAPSHOT)); then
    printf 'DRY RUN snapshot:'
    printf ' %q' "${snapshot_cmd[@]}"
    printf '\n'
  fi
  printf 'DRY RUN test:'
  printf ' %q' "${test_cmd[@]}"
  printf '\n'
  exit 0
fi

if [[ ! -r "$CONFIG_FILE" ]]; then
  echo "Config file not readable: $CONFIG_FILE" >&2
  exit 1
fi

if ! mkdir -p "$OUT_DIR"; then
  echo "Cannot create output directory: $OUT_DIR" >&2
  exit 1
fi

snapshot_ok=1
if ((SNAPSHOT)); then
  # The snapshot is supporting evidence only; keep going when it fails, but
  # say so instead of later claiming it was written.
  if ! "${snapshot_cmd[@]}"; then
    snapshot_ok=0
    echo "Warning: environment snapshot failed; continuing with the test run" >&2
  fi
fi

"${test_cmd[@]}"
status=$?

if ((status != 0)); then
  echo "gpu_tester.py exited with status $status" >&2
fi
echo "Report written to: $REPORT_FILE"
if ((SNAPSHOT && snapshot_ok)); then
  echo "Snapshot written to: $SNAPSHOT_FILE"
fi

exit "$status"
#!/usr/bin/env bash
set -uo pipefail

# Run a two-node, eight-GPU-per-node NCCL evidence pass across the six
# collectives used by the single-node H100 acceptance flow.
#
# Exit status: 2 for usage errors, 1 when no Python interpreter or config file
# is found, the preflight's status when the preflight fails, otherwise the
# exit status of gpu_tester.py.

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_all_collectives_2x8.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"
DRY_RUN=0
RUN_PREFLIGHT=1
PREFLIGHT_ONLY=0

usage() {
  cat <<'EOF'
Usage: run_multinode_nccl_all_collectives.sh [options]

Options:
  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
  --config PATH       Config file (default: configs/multinode_nccl_nccl227_all_collectives_2x8.yaml)
  --out-dir PATH      Report output directory (default: reports)
  --format FORMAT     Report format: md, json, or html (default: md)
  --no-preflight      Skip scripts/multinode_nccl_deep_diagnose.sh preflight
  --preflight-only    Run only the preflight check, not the workload
  --dry-run           Print commands without running them
  -h, --help          Show this help
EOF
}

# require_value OPTION REMAINING_ARG_COUNT
# Exit with a usage error when a value-taking option is the last argument,
# instead of dying on an "unbound variable" error under `set -u`.
require_value() {
  if (($2 < 2)); then
    echo "Option $1 requires a value" >&2
    usage >&2
    exit 2
  fi
}

while (($#)); do
  case "$1" in
    --python)
      require_value "$1" "$#"
      PYTHON_BIN="$2"
      shift 2
      ;;
    --config)
      require_value "$1" "$#"
      CONFIG_FILE="$2"
      shift 2
      ;;
    --out-dir)
      require_value "$1" "$#"
      OUT_DIR="$2"
      shift 2
      ;;
    --format)
      require_value "$1" "$#"
      FORMAT="$2"
      shift 2
      ;;
    --no-preflight)
      RUN_PREFLIGHT=0
      shift
      ;;
    --preflight-only)
      PREFLIGHT_ONLY=1
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

case "$FORMAT" in
  md|json|html) ;;
  *)
    echo "Unsupported format: $FORMAT" >&2
    exit 2
    ;;
esac

# Fall back to whatever python3 is on PATH when the configured one is missing.
if [[ ! -x "$PYTHON_BIN" ]]; then
  PYTHON_BIN="$(command -v python3 || true)"
fi

if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi

TS="$(date +%Y%m%d_%H%M%S)"

REPORT_FILE="$OUT_DIR/multinode_nccl_all_collectives_${TS}.${FORMAT}"
ARTIFACT_DIR="$OUT_DIR/multinode_nccl_all_collectives_${TS}_artifacts"
PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
RUN_CMD=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test multinode-nccl
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)

echo "Project:   $PROJECT_DIR"
echo "Config:    $CONFIG_FILE"
echo "Report:    $REPORT_FILE"
echo "Artifacts: $ARTIFACT_DIR"
echo "Collectives: allreduce, alltoall, broadcast, reducescatter, allgather, sendrecv"
echo "Topology: 2 nodes x 8 GPUs per node; 16G"

# A dry run only prints the commands; it creates no directories or files.
if ((DRY_RUN)); then
  if ((RUN_PREFLIGHT)); then
    printf 'DRY RUN preflight:'
    printf ' %q' "${PREFLIGHT_CMD[@]}"
    printf '\n'
  fi
  if ((PREFLIGHT_ONLY)); then
    exit 0
  fi
  printf 'DRY RUN workload:'
  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
  printf ' %q' "${RUN_CMD[@]}"
  printf '\n'
  exit 0
fi

if ((RUN_PREFLIGHT)); then
  "${PREFLIGHT_CMD[@]}"
  preflight_status=$?
  if ((preflight_status != 0)); then
    echo "Preflight failed with exit code $preflight_status" >&2
    exit "$preflight_status"
  fi
fi

if ((PREFLIGHT_ONLY)); then
  exit 0
fi

if [[ ! -r "$CONFIG_FILE" ]]; then
  echo "Config file not readable: $CONFIG_FILE" >&2
  exit 1
fi

if ! mkdir -p "$OUT_DIR" "$ARTIFACT_DIR"; then
  echo "Cannot create $OUT_DIR or $ARTIFACT_DIR" >&2
  exit 1
fi

# The artifact directory is handed to the workload through the environment.
MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${RUN_CMD[@]}"
status=$?

if ((status != 0)); then
  echo "gpu_tester.py exited with status $status" >&2
fi
echo "Report written to: $REPORT_FILE"
echo "Artifacts written to: $ARTIFACT_DIR"
exit "$status"
#!/usr/bin/env bash
set -uo pipefail

# Run the formal cross-node NCCL PDF matrix for the current two-node H100 pair.
# This wrapper standardizes the command, output naming, and preflight hook; the
# actual benchmark implementation remains in gpu_tester.py / MultiNodeNCCLTest.
#
# Exit status: 2 for usage errors, 1 when no Python interpreter or config file
# is found, the preflight's status when the preflight fails, otherwise the
# exit status of gpu_tester.py.

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
PROJECT_DIR="$(cd -- "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)"

PYTHON_BIN="${PYTHON_BIN:-/root/gpu-test-venv/bin/python}"
CONFIG_FILE="${CONFIG_FILE:-$PROJECT_DIR/configs/multinode_nccl_nccl227_pdf_matrix.yaml}"
OUT_DIR="${OUT_DIR:-$PROJECT_DIR/reports}"
FORMAT="${FORMAT:-md}"
DRY_RUN=0
RUN_PREFLIGHT=1
PREFLIGHT_ONLY=0

usage() {
  cat <<'EOF'
Usage: run_multinode_nccl_pdf_matrix.sh [options]

Options:
  --python PATH       Python executable (default: /root/gpu-test-venv/bin/python)
  --config PATH       Matrix config file (default: configs/multinode_nccl_nccl227_pdf_matrix.yaml)
  --out-dir PATH      Report output directory (default: reports)
  --format FORMAT     Report format: md, json, or html (default: md)
  --no-preflight      Skip scripts/multinode_nccl_deep_diagnose.sh preflight
  --preflight-only    Run only the preflight check, not the matrix workload
  --dry-run           Print commands without running them
  -h, --help          Show this help
EOF
}

# require_value OPTION REMAINING_ARG_COUNT
# Exit with a usage error when a value-taking option is the last argument,
# instead of dying on an "unbound variable" error under `set -u`.
require_value() {
  if (($2 < 2)); then
    echo "Option $1 requires a value" >&2
    usage >&2
    exit 2
  fi
}

while (($#)); do
  case "$1" in
    --python)
      require_value "$1" "$#"
      PYTHON_BIN="$2"
      shift 2
      ;;
    --config)
      require_value "$1" "$#"
      CONFIG_FILE="$2"
      shift 2
      ;;
    --out-dir)
      require_value "$1" "$#"
      OUT_DIR="$2"
      shift 2
      ;;
    --format)
      require_value "$1" "$#"
      FORMAT="$2"
      shift 2
      ;;
    --no-preflight)
      RUN_PREFLIGHT=0
      shift
      ;;
    --preflight-only)
      PREFLIGHT_ONLY=1
      shift
      ;;
    --dry-run)
      DRY_RUN=1
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
done

case "$FORMAT" in
  md|json|html) ;;
  *)
    echo "Unsupported format: $FORMAT" >&2
    exit 2
    ;;
esac

# Fall back to whatever python3 is on PATH when the configured one is missing.
if [[ ! -x "$PYTHON_BIN" ]]; then
  PYTHON_BIN="$(command -v python3 || true)"
fi

if [[ -z "$PYTHON_BIN" || ! -x "$PYTHON_BIN" ]]; then
  echo "Python executable not found. Set --python or PYTHON_BIN." >&2
  exit 1
fi

TS="$(date +%Y%m%d_%H%M%S)"

REPORT_FILE="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}.${FORMAT}"
ARTIFACT_DIR="$OUT_DIR/multinode_nccl_pdf_matrix_${TS}_artifacts"
PREFLIGHT_CMD=(bash "$PROJECT_DIR/scripts/multinode_nccl_deep_diagnose.sh" preflight)
MATRIX_CMD=(
  "$PYTHON_BIN" "$PROJECT_DIR/gpu_tester.py"
  --config "$CONFIG_FILE"
  --test multinode-nccl
  --report
  --format "$FORMAT"
  --output "$REPORT_FILE"
)

echo "Project:   $PROJECT_DIR"
echo "Config:    $CONFIG_FILE"
echo "Report:    $REPORT_FILE"
echo "Artifacts: $ARTIFACT_DIR"
echo "Matrix:    2 nodes x {1,2,4,8} GPUs per node; all_reduce_perf + alltoall_perf; 16G"

# A dry run only prints the commands; it creates no directories or files.
if ((DRY_RUN)); then
  if ((RUN_PREFLIGHT)); then
    printf 'DRY RUN preflight:'
    printf ' %q' "${PREFLIGHT_CMD[@]}"
    printf '\n'
  fi
  if ((PREFLIGHT_ONLY)); then
    exit 0
  fi
  printf 'DRY RUN matrix:'
  printf ' MULTINODE_NCCL_ARTIFACT_DIR=%q' "$ARTIFACT_DIR"
  printf ' %q' "${MATRIX_CMD[@]}"
  printf '\n'
  exit 0
fi

if ((RUN_PREFLIGHT)); then
  "${PREFLIGHT_CMD[@]}"
  preflight_status=$?
  if ((preflight_status != 0)); then
    echo "Preflight failed with exit code $preflight_status" >&2
    exit "$preflight_status"
  fi
fi

if ((PREFLIGHT_ONLY)); then
  exit 0
fi

if [[ ! -r "$CONFIG_FILE" ]]; then
  echo "Config file not readable: $CONFIG_FILE" >&2
  exit 1
fi

if ! mkdir -p "$OUT_DIR" "$ARTIFACT_DIR"; then
  echo "Cannot create $OUT_DIR or $ARTIFACT_DIR" >&2
  exit 1
fi

# The artifact directory is handed to the workload through the environment.
MULTINODE_NCCL_ARTIFACT_DIR="$ARTIFACT_DIR" "${MATRIX_CMD[@]}"
status=$?

if ((status != 0)); then
  echo "gpu_tester.py exited with status $status" >&2
fi
echo "Report written to: $REPORT_FILE"
echo "Artifacts written to: $ARTIFACT_DIR"
exit "$status"